[Model] Use the same fused_moe configs for all H200 devices (#23642 )

Signed-off-by: Roger Meier <r.meier@siemens.com>
[MTP] Refactor mtp predictor to avoid d2h operation (#27643 )
2025-10-30 17:36:56 +00:00 · 2025-10-30 17:27:39 +00:00 · 2025-10-30 10:22:30 -07:00 · 2025-10-31 00:33:13 +08:00 · 2025-10-30 11:41:44 -04:00 · 2025-10-30 08:24:31 -07:00
1106 changed files with 50786 additions and 19783 deletions
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -0,0 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 backend: "vllm-vlm"
 tasks:
 - name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
    value: 0.80
 limit: 100
 num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1,10 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 tasks:
 - name: "mmlu_pro"
  metrics:
  - name: "exact_match,custom-extract"
    value: 0.80
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
 model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
 backend: "vllm-vlm"
 tasks:
 - name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    value: 0.855
 limit: 2500
 num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -0,0 +1,14 @@
 model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
 tasks:
  - name: "mmlu_pro"
    metrics:
      - name: "exact_match,custom-extract"
        value: 0.82
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
 enforce_eager: false # we use false to speed up the eval process
 kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
 max_model_len: 40960
 apply_chat_template: true
 fewshot_as_multiturn: true
 gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -0,0 +1 @@
 Qwen3-235B-A22B-Instruct-2507-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@@ -0,0 +1 @@
 Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@@ -0,0 +1 @@
 Qwen2.5-VL-7B-Instruct.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -0,0 +1,44 @@
 #!/bin/bash
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
 #   pip install lm-eval==0.4.9
 usage() {
    echo``
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -t    - tensor parallel size to run at"
    echo
 }
 while getopts "m:l:t:" OPT; do
  case ${OPT} in
    m ) 
        MODEL="$OPTARG"
        ;;
    l ) 
        LIMIT="$OPTARG"
        ;;
    t ) 
        TP_SIZE="$OPTARG"
        ;;
    \? ) 
        usage
        exit 1
        ;;
  esac
 done
 lm_eval --model vllm-vlm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -0,0 +1,50 @@
 #!/bin/bash
 # We can use this script to compute baseline accuracy on MMLUPRO for vllm.
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
 #   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 usage() {
    echo``
    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
 }
 while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
 done
 lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size auto
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -19,21 +19,35 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
    enforce_eager = eval_config.get("enforce_eager", "true")
    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
        f"kv_cache_dtype={kv_cache_dtype},"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
    )
    results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
        # text models. however, this is regressing measured strict-match for
        # existing text models in CI, so only apply it for mm, or explicitly set
        apply_chat_template=eval_config.get(
            "apply_chat_template", backend == "vllm-vlm"
        ),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
        gen_kwargs=eval_config.get("gen_kwargs"),
        batch_size=batch_size,
    )
    return results
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -1,184 +0,0 @@
 steps:
  - label: "Wait for container to be ready"
    key: wait-for-container-image
    agents:
      queue: A100
    plugins:
    - kubernetes:
        podSpec:
          containers:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  - label: "Cleanup H100"
    agents:
      queue: H100
    depends_on: ~
    command: docker system prune -a --volumes --force
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
    - kubernetes:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
            command:
            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: devshm
              mountPath: /dev/shm
            env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          nodeSelector:
            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
          volumes:
          - name: devshm
            emptyDir:
              medium: Memory
  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: 4,5,6,7
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
  #- block: "Run H100 Benchmark"
    #key: block-h100
    #depends_on: ~
  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
  # Premerge benchmark
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
    - kubernetes:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
            command:
            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: devshm
              mountPath: /dev/shm
            env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          nodeSelector:
            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
          volumes:
          - name: devshm
            emptyDir:
              medium: Memory
  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: 4,5,6,7
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
  #- block: "Run H100 Benchmark"
    #key: block-h100
    #depends_on: ~
  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@@ -1,28 +0,0 @@
 # Nightly benchmark annotation
 ## Description
 This file contains the downloading link for benchmarking results.
 - [benchmarking pipeline](artifact://nightly-pipeline.yaml)
 - [benchmarking results](artifact://results.zip)
 - [benchmarking code](artifact://nightly-benchmarks.zip)
 Please download the visualization scripts in the post
 ## Results reproduction
 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
    - Download `nightly-benchmarks.zip`.
    - In the same folder, run the following code:
    ```bash
    export HF_TOKEN=<your HF token>
    apt update
    apt install -y git
    unzip nightly-benchmarks.zip
    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
    ```
 And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -1,39 +0,0 @@
 # Nightly benchmark
 This benchmark aims to:
 - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
 - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 ## Setup
 - Docker images:
    - vLLM: `vllm/vllm-openai:v0.6.2`
    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
    - 8x Nvidia A100 GPUs
 - Workload:
    - Dataset
        - ShareGPT dataset
        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
    - Models: llama-3 8B, llama-3 70B.
        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
 ## Known issues
 - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
 - TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,196 +0,0 @@
 common_pod_spec: &common_pod_spec
  priorityClassName: perf-benchmark
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  volumes:
    - name: devshm
      emptyDir:
        medium: Memory
    - name: hf-cache
      hostPath:
        path: /root/.cache/huggingface
        type: Directory
 common_container_settings: &common_container_settings
  command:
    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  resources:
    limits:
      nvidia.com/gpu: 8
  volumeMounts:
    - name: devshm
      mountPath: /dev/shm
    - name: hf-cache
      mountPath: /root/.cache/huggingface
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token
 steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
  - label: "A100 vllm step 10"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.6.2
                <<: *common_container_settings
  - label: "A100 sglang benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: lmsysorg/sglang:v0.3.2-cu121
                <<: *common_container_settings
  - label: "A100 lmdeploy benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: openmmlab/lmdeploy:v0.6.1-cu12
                <<: *common_container_settings
  - label: "A100 trt llama-8B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama8B"
  - label: "A100 trt llama-70B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama70B"
  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
  # - label: "A100 trt benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
  #               <<: *common_container_settings
  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
  # - label: "A100 tgi benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
  #               <<: *common_container_settings
  - wait
  - label: "Collect the results"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
            - image: vllm/vllm-openai:v0.5.0.post1
              command:
              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
              resources:
                limits:
                  nvidia.com/gpu: 8
              volumeMounts:
              - name: devshm
                mountPath: /dev/shm
              env:
              - name: VLLM_USAGE_SOURCE
                value: ci-test
              - name: VLLM_SOURCE_CODE_LOC
                value: /workspace/build/buildkite/vllm/performance-benchmark
              - name: HF_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: hf-token-secret
                    key: token
  - block: ":rocket: check the results!"
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,26 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 from transformers import AutoTokenizer
 def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )
    args = parser.parse_args()
    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -1,97 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json
 from pathlib import Path
 import numpy as np
 import pandas as pd
 from tabulate import tabulate
 def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        type=str,
        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )
    args = parser.parse_args()
    return args
 def get_perf(df, method, model, metric):
    means = []
    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]
        if filtered_df.empty:
            means.append(0.0)
        else:
            means.append(filtered_df[metric].values[0])
    return np.array(means)
 def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
        std = get_perf(df, method, model, "Std " + metric + " (ms)")
        if std.mean() == 0:
            std = None
        success = get_perf(df, method, model, "Successful req.")
        if std is not None:
            std = std / np.sqrt(success)
            std = std.tolist()
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
        mean = mean.tolist()
        std = None
    return mean, std
 def main(args):
    results_folder = Path(args.results_folder)
    results = []
    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
        with open(test_file) as f:
            results = results + json.loads(f.read())
    # generate markdown table
    df = pd.DataFrame.from_dict(results)
    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
    with open(args.description) as f:
        description = f.read()
    description = description.format(nightly_results_benchmarking_table=md_table)
    with open("nightly_results.md", "w") as f:
        f.write(description)
 if __name__ == "__main__":
    args = parse_arguments()
    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,9 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from lmdeploy.serve.openai.api_client import APIClient
 api_client = APIClient("http://localhost:8000")
 model_name = api_client.available_models[0]
 print(model_name)
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -1,78 +0,0 @@
 #!/bin/bash
 set -ex
 set -o pipefail
 main() {
    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
    (which zip) || (apt-get install -y zip)
    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
        exit 0
    fi
    # initial annotation
    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
    # download results
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/
    # upload benchmark results
    zip -r results.zip results/
    /workspace/buildkite-agent artifact upload "results.zip"
    # upload benchmarking scripts
    cd "$VLLM_SOURCE_CODE_LOC/"
    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    # upload benchmarking pipeline
    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
    # The figures should be generated by a separate process outside the CI/CD pipeline
    # # generate figures
    # python3 -m pip install tabulate pandas matplotlib
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
    #     --description $description \
    #     --results-folder results/ 
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sharegpt
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_2048_128
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_128_2048
    # # upload results and figures
    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 }
 main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -1,464 +0,0 @@
 #!/bin/bash
 set -o pipefail
 set -x
 check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
  echo "GPU type is $gpu_type"
 }
 check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
 }
 upload_to_buildkite() {
  # upload the benchmarking results to buildkite
  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 get_current_llm_serving_engine() {
  if which lmdeploy >/dev/null; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    return
  fi
  if [ -e /tgi-entrypoint.sh ]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
    return
  fi
  if which trtllm-build >/dev/null; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
    return
  fi
  if [ -e /sgl-workspace ]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
    return
  fi
  if [ -e /vllm-workspace ]; then
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid import vllm from current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm
    return
  fi
 }
 json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
 }
 kill_gpu_processes() {
  pkill -f '[p]ython'
  pkill -f '[p]ython3'
  pkill -f '[t]ritonserver'
  pkill -f '[p]t_main_thread'
  pkill -f '[t]ext-generation'
  pkill -f '[l]mdeploy'
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pkill -f '[V]LLM'
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
 }
 wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done' && return 0 || return 1
 }
 ensure_installed() {
  # Ensure that the given command is installed by apt-get
  local cmd=$1
  if ! which "$cmd" >/dev/null; then
    apt-get update && apt-get install -y "$cmd"
  fi
 }
 run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
  local serving_test_file
  serving_test_file=$1
  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi
    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi
    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi
    # prepare tokenizer
    # this is required for lmdeploy.
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    # change model name for lmdeploy (it will not follow standard hf name)
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
    fi
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi
      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE
      if [[ $backend = "trt" ]]; then
        backend="tensorrt-llm"
      fi
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      if [[ "$dataset_name" = "sharegpt" ]]; then
        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"
      elif [[ "$dataset_name" = "sonnet" ]]; then
        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --sonnet-input-len $sonnet_input_len \
          --sonnet-output-len $sonnet_output_len \
          --sonnet-prefix-len $sonnet_prefix_len \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"
      else
        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1
      fi
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
      eval "$client_command"
      server_command="None"
      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
    done
  done
  kill_gpu_processes
 }
 run_genai_perf_tests() {
  # run genai-perf tests
  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
  genai_perf_test_file=$1
  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi
    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi
    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi
      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend "$backend" \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
      "
    echo "Client command: $client_command"
    eval "$client_command"
    #TODO: process/record outputs
    done
  done
  kill_gpu_processes
 }
 prepare_dataset() {
  # download sharegpt dataset
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  echo "" > sonnet_4x.txt
  for _ in {1..4}
  do
    cat sonnet.txt >> sonnet_4x.txt
  done
 }
 main() {
  # check if the environment variable is successfully injected from yaml
  check_gpus
  check_hf_token
  get_current_llm_serving_engine
  pip install -U transformers
  pip install -r requirements/dev.txt
  which genai-perf
  # check storage
  df -h
  ensure_installed wget
  ensure_installed curl
  ensure_installed jq
  # genai-perf dependency
  ensure_installed libb64-0d
  prepare_dataset
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
  # run the test
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
  # run genai-perf tests
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
  mv artifacts/ $RESULTS_FOLDER/
  # upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
 }
 main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,82 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 import json
 import os
 from pathlib import Path
 import pandas as pd
 from tabulate import tabulate
 results_folder = Path("results/")
 # serving results and the keys that will be printed into markdown
 serving_results = []
 serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "std_tpot_ms": "Std TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
 }
 if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)
        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})
        # add the result to raw_result
        serving_results.append(raw_result)
        continue
    serving_results = pd.DataFrame.from_dict(serving_results)
    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")
    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -1,23 +0,0 @@
 #!/bin/sh
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
 if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
 else
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
 fi
 TIMEOUT_SECONDS=10
 retries=0
 while [ $retries -lt 1000 ]; do
    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
        exit 0
    fi
    echo "Waiting for image to be available..."
    retries=$((retries + 1))
    sleep 5
 done
 exit 1
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@@ -1,30 +0,0 @@
 [
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@@ -1,32 +0,0 @@
 [
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
 ]
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -2,40 +2,23 @@
 ## Introduction
-This directory contains two sets of benchmark for vllm.
+This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
-
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
 See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 ## Performance benchmark quick overview
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
 **Benchmarking Duration**: about 1hr.
 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
 ## Nightly benchmark quick overview
 **Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
 **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
 **Benchmarking Duration**: about 3.5hrs.
 ## Trigger the benchmark
-Performance benchmark will be triggered when:
+The benchmark needs to be triggered manually:
 - A PR being merged into vllm.
 - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 Manually Trigger the benchmark
 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 Runtime environment variables:
@@ -47,10 +30,6 @@ Runtime environment variables:
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 ## Performance benchmark details
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
@@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model,
 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
 ## Nightly test details
 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
 ### Workflow
 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
 - Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
 - The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
 - At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 ### Nightly tests
 In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
 ### Docker containers
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
 WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
--- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -7,6 +7,7 @@ from importlib import util
 import pandas as pd
 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -109,7 +110,10 @@ def compare_data_columns(
        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
-            ratio = current / base
+            if "P99" in data_column or "Median" in data_column:
                ratio = base / current  # for latency
            else:
                ratio = current / base
            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)
@@ -199,6 +203,71 @@ def split_json_by_tp_pp(
    return saved_paths
 def _add_limit_line(fig, y_value, label):
    # Visible dashed line + annotation
    fig.add_hline(
        y=y_value,
        line_dash="dash",
        line_color="red" if "ttft" in label.lower() else "blue",
        annotation_text=f"{label}: {y_value} ms",
        annotation_position="top left",
    )
    # Optional: add a legend item (as a transparent helper trace)
    if plot and plotly_found:
        import plotly.graph_objects as go
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="lines",
                line=dict(
                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
                ),
                name=f"{label}",
            )
        )
 def _find_concurrency_col(df: pd.DataFrame) -> str:
    for c in [
        "# of max concurrency.",
        "# of max concurrency",
        "Max Concurrency",
        "max_concurrency",
        "Concurrency",
    ]:
        if c in df.columns:
            return c
    # Fallback: guess an integer-like column (harmless if unused)
    for c in df.columns:
        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
            return c
    return "# of max concurrency."
 def _highlight_threshold(
    df: pd.DataFrame, threshold: float
 ) -> "pd.io.formats.style.Styler":
    """Highlight numeric per-configuration columns with value <= threshold."""
    conc_col = _find_concurrency_col(df)
    key_cols = [
        c
        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
        if c in df.columns
    ]
    conf_cols = [
        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
    ]
    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
    return df.style.map(
        lambda v: "background-color:#e6ffe6;font-weight:bold;"
        if pd.notna(v) and v <= threshold
        else "",
        subset=conf_cols,
    )
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
@@ -220,6 +289,26 @@ if __name__ == "__main__":
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    parser.add_argument(
        "-l",
        "--latency",
        type=str,
        default="p99",
        help="take median|p99 for latency like TTFT/TPOT",
    )
    parser.add_argument(
        "--ttft-max-ms",
        type=float,
        default=3000.0,
        help="Reference limit for TTFT plots (ms)",
    )
    parser.add_argument(
        "--tpot-max-ms",
        type=float,
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )
    args = parser.parse_args()
    drop_column = "P99"
@@ -234,12 +323,22 @@ if __name__ == "__main__":
        "# of max concurrency.",
        "qps",
    ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+
-    html_msgs_for_data_cols = [
+    if "median" in args.latency:
-        "Compare Output Tokens /n",
+        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-        "Median TTFT /n",
+        html_msgs_for_data_cols = [
-        "Median TPOT /n",
+            "Compare Output Tokens /n",
-    ]
+            "Median TTFT /n",
            "Median TPOT /n",
        ]
        drop_column = "P99"
    elif "p99" in args.latency:
        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
        html_msgs_for_data_cols = [
            "Compare Output Tokens /n",
            "P99 TTFT /n",
            "P99 TPOT /n",
        ]
    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
@@ -275,33 +374,83 @@ if __name__ == "__main__":
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
-            output_df_sorted = output_df.sort_values(by=existing_group_cols)
+            # output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_df_sorted = output_df.sort_values(by=args.xaxis)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
-                html = group.to_html()
+                group_name = (
                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
                )
                group_html_name = "perf_comparison_" + group_name + ".html"
                metric_name = str(data_cols_to_compare[i]).lower()
                if "tok/s" in metric_name:
                    html = group.to_html()
                elif "ttft" in metric_name:
                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
                        na_rep="—",
                    )
                    html = styler.to_html(
                        table_attributes='border="1" class="dataframe"'
                    )
                elif (
                    "tpot" in metric_name
                    or "median" in metric_name
                    or "p99" in metric_name
                ):
                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
                        na_rep="—",
                    )
                    html = styler.to_html(
                        table_attributes='border="1" class="dataframe"'
                    )
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)
                with open(group_html_name, "a+") as sub_text_file:
                    sub_text_file.write(html_msgs_for_data_cols[i])
                    sub_text_file.write(html)
-                if plot and plotly_found:
+                    if plot and plotly_found:
-                    import plotly.express as px
+                        import plotly.express as px
-                    df = group[raw_data_cols]
+                        df = group[raw_data_cols]
-                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
+                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                    # Melt DataFrame for plotting
+                        # Melt DataFrame for plotting
-                    df_melted = df_sorted.melt(
+                        df_melted = df_sorted.melt(
-                        id_vars=info_cols[y_axis_index],
+                            id_vars=info_cols[y_axis_index],
-                        var_name="Configuration",
+                            var_name="Configuration",
-                        value_name=data_cols_to_compare[i],
+                            value_name=data_cols_to_compare[i],
-                    )
+                        )
-                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                        title = (
-                    # Create Plotly line chart
+                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                    fig = px.line(
+                        )
-                        df_melted,
+                        # Create Plotly line chart
-                        x=info_cols[y_axis_index],
+                        fig = px.line(
-                        y=data_cols_to_compare[i],
+                            df_melted,
-                        color="Configuration",
+                            x=info_cols[y_axis_index],
-                        title=title,
+                            y=data_cols_to_compare[i],
-                        markers=True,
+                            color="Configuration",
-                    )
+                            title=title,
-                    # Export to HTML
+                            markers=True,
-                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+                        )
                        # ---- Add threshold lines based on metric name ----
                        if "ttft" in metric_name:
                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
                        elif (
                            "tpot" in metric_name
                            or "median" in metric_name
                            or "p99" in metric_name
                        ):
                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
                        # Export to HTML
                        text_file.write(
                            fig.to_html(full_html=True, include_plotlyjs="cdn")
                        )
                        sub_text_file.write(
                            fig.to_html(full_html=True, include_plotlyjs="cdn")
                        )
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -63,9 +63,11 @@ serving_column_mapping = {
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
@@ -368,7 +370,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
        )
    # get markdown tables
@@ -390,7 +392,7 @@ if __name__ == "__main__":
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
+            "../.buildkite/performance-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
--- a/.buildkite/performance-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -469,7 +469,12 @@ main() {
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
--- a/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
--- a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
@@ -0,0 +1,26 @@
 [
    {
        "test_name": "latency_llama8B_tp2",
        "environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/latency-tests.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests.json
--- a/.buildkite/performance-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/performance-benchmarks/tests/nightly-tests.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -95,6 +95,38 @@
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@@ -233,6 +265,41 @@
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
        "qps_list": ["inf"],
@@ -365,6 +432,38 @@
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@@ -503,6 +602,41 @@
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
        "qps_list": ["inf"],
@@ -638,6 +772,39 @@
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@@ -780,6 +947,42 @@
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
        "qps_list": ["inf"],
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -2,7 +2,7 @@
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -28,13 +28,13 @@
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -60,13 +60,13 @@
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
        }
    },
    {
-        "test_name": "serving_llama8B_tp4_sharegpt",
+        "test_name": "serving_llama8B_tp1_random_128_128",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -76,39 +76,7 @@
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
+            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@@ -124,16 +92,16 @@
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
-	    "random-input-len": 1024,
+	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
        }
    },
    {
-        "test_name": "serving_llama8B_pp6_random_1024_128",
+        "test_name": "serving_llama8B_tp2_random_128_128",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -143,7 +111,7 @@
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 6,
+            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@@ -159,10 +127,150 @@
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
-	    "random-input-len": 1024,
+	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp1_random_128_2048",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 2048,
 	    "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_128_2048",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 2048,
 	    "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp1_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 2048,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 2048,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 32
        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
@@ -0,0 +1,27 @@
 [
    {
        "test_name": "throughput_llama8B_tp2",
        "environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests.json
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
  - label: "Build arm64 wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
@@ -15,6 +15,21 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"
  # aarch64 build
  - label: "Build arm64 CPU wheel"
    depends_on: ~
    id: build-wheel-arm64-cpu
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.8"
    depends_on: ~
    id: build-wheel-cuda-12-8
@@ -28,20 +43,6 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"
  - label: "Build wheel - CUDA 12.6"
    depends_on: ~
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
@@ -55,6 +56,20 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"
  - label: "Build wheel - CUDA 13.0"
    depends_on: ~
    id: build-wheel-cuda-13-0
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  # Build release images (12.9)
  - label: "Build release image (x86)"
    depends_on: ~
    id: build-release-image-x86
@@ -62,13 +77,12 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
@@ -142,6 +156,22 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"
  - block: "Build arm64 CPU release image"
    key: block-arm64-cpu-release-image-build
    depends_on: ~
  - label: "Build and publish arm64 CPU release image"
    depends_on: block-arm64-cpu-release-image-build
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
      - create-multi-arch-manifest
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -70,7 +70,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
  # Note: disable it until supports V1
  # Run AWQ test
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -20,7 +20,10 @@ trap remove_docker_container EXIT
 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
    --net=host \
    --ipc=host \
    --privileged \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
@@ -42,7 +45,7 @@ docker run \
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu129"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
 elif [[ $normal_wheel == *"cu128"* ]]; then
    # if $normal_wheel matches cu128, do not upload the index.html
    echo "Skipping index files for cu128 wheels"
 else
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 else
    echo "Skipping index files for non-cu129 wheels"
 fi
 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu129"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
 elif [[ $normal_wheel == *"cu128"* ]]; then
    # if $normal_wheel matches cu128, do not upload the index.html
    echo "Skipping index files for cu128 wheels"
 else
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 else
    echo "Skipping index files for non-cu129 wheels"
 fi
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
@@ -50,7 +50,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker Test # 36min
  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -63,7 +63,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
  timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -286,7 +286,7 @@ steps:
 - label: Engine Test # 25min
  timeout_in_minutes: 40
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  #grade: Blocking
  source_file_dependencies:
@@ -318,7 +318,7 @@ steps:
 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -353,7 +353,7 @@ steps:
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 - label: V1 Test others (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -395,7 +395,9 @@ steps:
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
    #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@@ -436,7 +438,11 @@ steps:
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
      --ignore=lora/test_olmoe_tp.py \
      --ignore=lora/test_deepseekv2_tp.py \
      --ignore=lora/test_gptoss.py \
      --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4
 - label: PyTorch Compilation Unit Tests # 15min
@@ -454,11 +460,12 @@ steps:
    - pytest -v -s compile/test_fusion_attn.py
    - pytest -v -s compile/test_functionalization.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
-    - pytest -v -s compile/test_sequence_parallelism.py
+  #  - pytest -v -s compile/test_sequence_parallelism.py
-    - pytest -v -s compile/test_async_tp.py
+  #  - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py
    - pytest -v -s compile/test_noop_elimination.py
    - pytest -v -s compile/test_aot_compile.py
 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -473,8 +480,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min
+- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 30
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
@@ -484,17 +491,19 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py
 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  commands:
-    - pytest -v -s kernels/core
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
@@ -552,7 +561,7 @@ steps:
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -603,8 +612,9 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # TODO(jerryzh168): resolve the above comment
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@@ -631,7 +641,7 @@ steps:
 - label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  fast_check: false
@@ -779,8 +789,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+    # Install fast path packages for testing against transformers
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 - label: Language Models Test (PPL)
@@ -846,6 +858,18 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 - label: Multi-Modal Accuracy Eval (Small Models) # 50min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
@@ -884,7 +908,7 @@ steps:
 - label: Quantized Models Test # 45 min
  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -921,8 +945,8 @@ steps:
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
+- label: Blackwell Test # 21 min
-  timeout_in_minutes: 60
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@@ -935,8 +959,6 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
  - vllm/compilation/fusion_attn.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@@ -953,13 +975,32 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
 - label: Blackwell Fusion Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusions_e2e.py
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@@ -1079,6 +1120,7 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -1126,6 +1168,11 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@@ -1169,7 +1216,7 @@ steps:
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
-
+    - pytest -v -s -x lora/test_olmoe_tp.py
 - label: Weight Loading Multiple GPU Test  # 33min
  timeout_in_minutes: 45
@@ -1199,6 +1246,18 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1230,12 +1289,16 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 ##### H200 test #####
- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
    - pytest -v -s tests/compile/test_async_tp.py
    - pytest -v -s tests/compile/test_sequence_parallelism.py
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@@ -172,6 +172,8 @@ steps:
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
@@ -203,6 +205,24 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
 - label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 - label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
@@ -311,6 +331,15 @@ steps:
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 - label: V1 Test attention (H100) # 10min
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
    - vllm/v1/attention
    - tests/v1/attention
  commands:
    - pytest -v -s v1/attention
 - label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
    - vllm/
@@ -349,7 +378,8 @@ steps:
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@@ -384,7 +414,12 @@ steps:
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
      --ignore=lora/test_olmoe_tp.py \
      --ignore=lora/test_deepseekv2_tp.py \
      --ignore=lora/test_gptoss.py \
      --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4
 - label: PyTorch Compilation Unit Tests # 15min
@@ -416,8 +451,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min
+- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 30
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@@ -425,6 +460,19 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py
 - label: Cudagraph test
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  commands:
    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -468,6 +516,8 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@@ -527,8 +577,9 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # TODO(jerryzh168): resolve the above comment
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@@ -677,8 +728,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+    # Install fast path packages for testing against transformers
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 - label: Language Models Test (PPL)
@@ -733,6 +786,16 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 - label: Multi-Modal Accuracy Eval (Small Models) # 50min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@@ -796,8 +859,8 @@ steps:
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
+- label: Blackwell Test # 21 min
-  timeout_in_minutes: 60
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@@ -810,8 +873,6 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
  - vllm/compilation/fusion_attn.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@@ -828,15 +889,32 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
 - label: Blackwell Fusion Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusions_e2e.py
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@@ -943,6 +1021,8 @@ steps:
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
@@ -950,6 +1030,7 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -993,6 +1074,11 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@@ -1032,6 +1118,7 @@ steps:
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
 - label: Weight Loading Multiple GPU Test  # 33min
@@ -1058,6 +1145,17 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1088,8 +1186,21 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 ##### H100 test #####
 - label: LM Eval Large Models (H100) # optional
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
 ##### H200 test #####
- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
@@ -1097,6 +1208,8 @@ steps:
  commands:
    - pytest -v -s tests/compile/test_async_tp.py
    - pytest -v -s tests/compile/test_sequence_parallelism.py
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,10 +5,8 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
-/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
@@ -26,9 +24,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
@@ -47,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
@@ -60,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC
 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
 # Docs
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -108,7 +108,7 @@ pull_request_rules:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
+      - files~=^\.buildkite/performance-benchmarks/
  actions:
    label:
      add:
--- a/.gitignore
+++ b/.gitignore
@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**
 # uv
 uv.lock
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -4,7 +4,6 @@ MD013: false
 MD024:
  siblings_only: true
 MD033: false
 MD042: false
 MD045: false
 MD046: false
 MD051: false
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,18 +38,18 @@ repos:
  rev: 0.9.1
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
  - id: format-torch-nightly-test
    name: reformat nightly_torch_test.txt to be in sync with test.in
    language: python
-    entry: python tools/generate_nightly_torch_test.py
+    entry: python tools/pre_commit/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
-    name: Run mypy for local Python installation
+    name: Run mypy locally for lowest supported Python version
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    entry: python tools/pre_commit/mypy.py 0 "3.10"
    stages: [pre-commit] # Don't run in CI
    <<: &mypy_common
      language: python
@@ -78,12 +78,12 @@ repos:
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
-    entry: tools/shellcheck.sh
+    entry: tools/pre_commit/shellcheck.sh
    language: script
    types: [shell]
  - id: png-lint
    name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
+    entry: tools/pre_commit/png-lint.sh
    language: script
    types: [png]
  - id: signoff-commit
@@ -100,12 +100,12 @@ repos:
    stages: [commit-msg]
  - id: check-spdx-header
    name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
+    entry: python tools/pre_commit/check_spdx_header.py
    language: python
    types: [python]
  - id: check-root-lazy-imports
    name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
+    entry: python tools/pre_commit/check_init_lazy_imports.py
    language: python
    types: [python]
  - id: check-filenames
@@ -119,11 +119,11 @@ repos:
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
+    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
+    entry: python tools/pre_commit/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
@@ -131,7 +131,7 @@ repos:
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
+    entry: python tools/pre_commit/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
@@ -144,7 +144,7 @@ repos:
    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/validate_config.py
+    entry: python tools/pre_commit/validate_config.py
    language: python
    additional_dependencies: [regex]
  # Keep `suggestion` last
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 #
 # Try to find python package with an executable that exactly matches
@@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 *Latest News* 🔥
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -5,7 +5,7 @@ import gc
 from benchmark_utils import TimeCollector
 from tabulate import tabulate
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -46,7 +46,7 @@ import time
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -19,7 +19,7 @@ from vllm.config import (
    VllmConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -11,7 +11,7 @@ import time
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 # Select a equi-probable random priority
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -31,6 +31,7 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from contextlib import nullcontext
 from dataclasses import dataclass
 import datasets
@@ -50,7 +51,7 @@ except ImportError:
    from backend_request_func import get_tokenizer
 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser
@@ -501,15 +502,9 @@ async def benchmark(
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-    # This can be used once the minimum Python version is 3.10 or higher,
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
    # and it will simplify the code in limited_request_func.
    #    semaphore = (asyncio.Semaphore(max_concurrency)
    #                 if max_concurrency else contextlib.nullcontext())
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
    async def limited_request_func(request_func_input, pbar):
        if semaphore is None:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
 from weight_shapes import WEIGHT_SHAPES
 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -18,7 +18,8 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_triton_block_scaled_mm,
 )
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.math_utils import cdiv
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 def with_triton_mode(fn):
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@@ -28,7 +28,7 @@ except ImportError as e:
 from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target."
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 WEIGHT_SHAPES_MOE = {
    "nvidia/DeepSeek-R1-FP4": [
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 # Weight shapes for different models: [num_experts, topk, hidden_size,
 # intermediate_size]
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
 )
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 logger = init_logger(__name__)
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    fused_topk,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -7,7 +7,8 @@ import torch
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -25,7 +25,7 @@ if HAS_TRITON:
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    quantize_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    sort_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 )
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_customized_permute = args.use_customized_permute
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -39,7 +39,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -9,9 +9,9 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random,
 )
--- a/benchmarks/kernels/benchmark_polynorm.py
+++ b/benchmarks/kernels/benchmark_polynorm.py
@@ -1,155 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 import torch
 from vllm import _custom_ops as vllm_ops
 from vllm.triton_utils import triton
 def polynorm_naive(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
 ):
    orig_shape = x.shape
    x = x.view(-1, x.shape[-1])
    def norm(x, eps: float):
        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    x = x.float()
    return (
        (
            weight[0] * norm(x**3, eps)
            + weight[1] * norm(x**2, eps)
            + weight[2] * norm(x, eps)
            + bias
        )
        .to(weight.dtype)
        .view(orig_shape)
    )
 def polynorm_vllm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
 ):
    orig_shape = x.shape
    x = x.view(-1, x.shape[-1])
    out = torch.empty_like(x)
    vllm_ops.poly_norm(out, x, weight, bias, eps)
    output = out
    output = output.view(orig_shape)
    return output
 def calculate_diff(batch_size, seq_len, hidden_dim):
    dtype = torch.bfloat16
    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
    weight = torch.ones(3, dtype=dtype, device="cuda")
    bias = torch.ones(1, dtype=dtype, device="cuda")
    output_naive = polynorm_naive(x, weight, bias)
    output_vllm = polynorm_vllm(x, weight, bias)
    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
        print("✅ All implementations match")
    else:
        print("❌ Implementations differ")
 batch_size_range = [2**i for i in range(0, 7, 2)]
 seq_length_range = [2**i for i in range(6, 11, 1)]
 dim_range = [2048, 4096]
 configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
 def get_benchmark():
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["dim", "batch_size", "seq_len"],
            x_vals=[list(_) for _ in configs],
            line_arg="provider",
            line_vals=["naive", "vllm"],
            line_names=["Naive", "vLLM"],
            styles=[("blue", "-"), ("red", "-")],
            ylabel="us",
            plot_name="polynorm-perf",
            args={},
        )
    )
    def benchmark(dim, batch_size, seq_len, provider):
        dtype = torch.bfloat16
        hidden_dim = dim * 4
        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
        weight = torch.ones(3, dtype=dtype, device="cuda")
        bias = torch.ones(1, dtype=dtype, device="cuda")
        quantiles = [0.5, 0.2, 0.8]
        if provider == "naive":
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_naive(x, weight, bias),
                quantiles=quantiles,
            )
        else:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_vllm(x, weight, bias),
                quantiles=quantiles,
            )
        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
    return benchmark
 if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch-size",
        type=int,
        default=4,
        help="Batch size",
    )
    parser.add_argument(
        "--seq-len",
        type=int,
        default=128,
        help="Sequence length",
    )
    parser.add_argument(
        "--hidden-dim",
        type=int,
        default=8192,
        help="Intermediate size of MLP",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default="./configs/polnorm/",
        help="Path to save polnorm benchmark results",
    )
    args = parser.parse_args()
    # Run correctness test
    calculate_diff(
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        hidden_dim=args.hidden_dim,
    )
    benchmark = get_benchmark()
    # Run performance benchmark
    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -7,7 +7,8 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -9,9 +9,9 @@ from tabulate import tabulate
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random,
 )
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random_flash,
 )
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -8,7 +8,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 def benchmark_rope_kernels_multi_lora(
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -8,7 +8,7 @@ from datetime import datetime
 import flashinfer
 import torch
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -8,7 +8,7 @@ from datetime import datetime
 import flashinfer
 import torch
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 mp.set_start_method("spawn", force=True)
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -11,7 +11,7 @@ import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -1251,7 +1251,7 @@ async def main() -> None:
        default=None,
        help="The model name used in the API. "
        "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
    )
    parser.add_argument(
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -5,7 +5,7 @@ import cProfile
 import pstats
 from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -188,16 +188,60 @@ else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()
 #
 # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
 # Flag to enable ACL kernels for AARCH64 platforms
 if (VLLM_BUILD_ACL STREQUAL "ON")
    set(USE_ACL ON)
 else()
    set(USE_ACL OFF)
 endif()
 # Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
    # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
    # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
    if(ASIMD_FOUND)
        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
            message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
        else()
            message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
            FetchContent_Populate(arm_compute
                SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
                SOURCE_DIR   "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
                GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
                GIT_TAG        v52.2.0
                GIT_SHALLOW    TRUE
                GIT_PROGRESS   TRUE
            )
            set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
        endif()
        # Build ACL with scons
        include(ProcessorCount)
        ProcessorCount(_NPROC)
        set(_scons_cmd
        scons -j${_NPROC}
            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
            multi_isa=1 openmp=1 cppthreads=0
        )
        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
        # and create a local shim dir with it
        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
        endif()
        execute_process(
            COMMAND ${_scons_cmd}
            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
            RESULT_VARIABLE _acl_rc
        )
        if(NOT _acl_rc EQUAL 0)
            message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
        endif()
        set(ONEDNN_AARCH64_USE_ACL "ON")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
        add_compile_definitions(VLLM_USE_ACL)
    endif()
    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@@ -217,16 +261,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        )
    endif()
    if(USE_ACL)
        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
        if(NOT ARM_COMPUTE_LIBRARY)
            message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
        endif()
        set(ONEDNN_AARCH64_USE_ACL "ON")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
        add_compile_definitions(VLLM_USE_ACL)
    endif()
    set(ONEDNN_LIBRARY_TYPE "STATIC")
    set(ONEDNN_BUILD_DOC "OFF")
    set(ONEDNN_BUILD_EXAMPLES "OFF")
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -19,7 +19,7 @@ else()
  FetchContent_Declare(
        flashmla
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-        GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
        ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
    )
    set(FlashMLA_INCLUDES
--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@@ -22,10 +22,10 @@ else()
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
  FetchContent_Populate(qutlass)
  set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
 endif()
 FetchContent_Populate(qutlass)
 if(NOT qutlass_SOURCE_DIR)
  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
 endif()
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
+          GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 endfunction()
 # Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
 #   libgomp.so    -> libgomp-<hash>.so...
 #   libgomp.so.1  -> libgomp-<hash>.so...
 # OUTPUT: TORCH_GOMP_SHIM_DIR  ("" if not found)
 function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
  # Use run_python to locate vendored libgomp; never throw on failure.
  run_python(_VLLM_TORCH_GOMP_PATH
    "
 import os, glob
 try:
  import torch
  torch_pkg = os.path.dirname(torch.__file__)
  site_root = os.path.dirname(torch_pkg)
  torch_libs = os.path.join(site_root, 'torch.libs')
  print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
 except:
  print('')
 "
    "failed to probe torch.libs for libgomp")
  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
    return()
  endif()
  # Create shim under the build tree
  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
  file(MAKE_DIRECTORY "${_shim}")
  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
 endfunction()
 # Macro for converting a `gencode` version number to a cmake version number.
 macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@@ -125,32 +125,37 @@ public:
  }
  static void set_split_kv (KernelArguments& args) {
    // printf("set_split_kv start");
    if (args.split_kv >= 1) return;
    auto [H, K, D, B] = args.problem_shape;
    // std::cout << H << " " << K << " " << D << " " << B << "\n";      
    int sm_count = args.hw_info.sm_count;
-    // printf("    sm_count = %d\n", sm_count);
+    float seq_length_k = static_cast<float>(K) / 1024.0f;
-    int max_splits = ceil_div(K, 128);
+    int max_splits = 1;
    max_splits = min(16, max_splits);
-    // TODO: This avoids a hang when the batch size larger than 1 and 
+    if (B <= 4 && seq_length_k >= 16) {
-    // there is more than 1 kv_splits. 
+      max_splits = 16;
-    // Discuss with NVIDIA how this can be fixed.
+    }
-    if (B > 1) {
+    else if (B <= 8 && seq_length_k >= 4) {
-      max_splits = min(1, max_splits);
+      max_splits = 8;
    }
    else if ((B <= 16 && seq_length_k >= 8) ||
             (B == 48 && seq_length_k >= 32)) {
      max_splits = 4;
    }
    else if ((B <= 32 && seq_length_k >= 16) ||
             (B == 96 && seq_length_k >= 16)) {
      max_splits = 2;
    }
    else {
      max_splits = 1;
    }
-    // printf("    max_splits = %d\n", max_splits);
+    // Wave-aware scheduling: ensure integer number of waves in K dimension
    int sms_per_batch = max(1, sm_count / B);
    // printf("    sms_per_batch = %d\n", sms_per_batch);
    int split_heur = min(max_splits, sms_per_batch);
    int waves = ceil_div(B * split_heur, sm_count);
    int k_waves = ceil_div(max_splits, split_heur);
    int split_wave_aware = ceil_div(max_splits, k_waves);
    args.split_kv = split_wave_aware;
    // printf("    args.split_kv = %d\n", args.split_kv);
  }
  /// Determines whether the GEMM can execute the given problem.
--- a/csrc/core/batch_invariant.hpp
+++ b/csrc/core/batch_invariant.hpp
@@ -5,11 +5,11 @@
 namespace vllm {
-// vllm_kernel_override_batch_invariant(); returns true
+// vllm_is_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
+// if env VLLM_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
+inline bool vllm_is_batch_invariant() {
  static bool cached = []() {
-    std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
+    std::string env_key = "VLLM_BATCH_INVARIANT";
    const char* val = std::getenv(env_key.c_str());
    return (val && std::atoi(val) != 0) ? 1 : 0;
  }();
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -187,7 +187,8 @@ template <>
 struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
  size_t operator()(
      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
-    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
+    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
           hash<int>()(static_cast<int>(val.b_type));
  }
 };
@@ -216,7 +217,8 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
 bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
-  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
+  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
         l.b_type == r.b_type;
 }
 bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
@@ -493,8 +495,10 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
 dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
    const MSizeCacheKey& key) {
  if (m_size_cache_.get() == nullptr) {
-    ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
+    ClassMatmulCacheKey class_key = {
-    m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
+        .b_n_size = b_n_size_, .b_k_size = b_k_size_, .b_type = b_type_};
    m_size_cache_ =
        get_matul_class_primitive_cache(class_key, primitive_cache_size_);
  }
  return m_size_cache_->get_or_create(key, [&]() {
    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@@ -199,6 +199,7 @@ class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
  struct ClassMatmulCacheKey {
    dnnl_dim_t b_n_size;
    dnnl_dim_t b_k_size;
    dnnl::memory::data_type b_type;
    friend bool operator==(const ClassMatmulCacheKey& l,
                           const ClassMatmulCacheKey& r);
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -2,6 +2,7 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
 #include "quantization/vectorization_utils.cuh"
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
    const float epsilon, const int num_tokens, const int hidden_size) {
  __shared__ float s_variance;
  float variance = 0.0f;
  const scalar_t* input_row = input + blockIdx.x * input_stride;
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+  constexpr int VEC_SIZE = 8;
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
 #pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
      float x = static_cast<float>(vec.val[i]);
      variance += x * x;
    }
  };
  auto scalar_op = [&variance](const scalar_t& val) {
    float x = static_cast<float>(val);
    variance += x * x;
-  }
+  };
  vllm::vectorize_read_with_alignment<VEC_SIZE>(
      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
@@ -136,211 +148,6 @@ fused_add_rms_norm_kernel(
  }
 }
 /* Function specialization in the case of FP16/BF16 tensors.
   Additional optimizations we can make in this case are
   packed and vectorized operations, which help with the
   memory latency bottleneck.
   _f16VecPN struct extends _f16Vec to add operations specifically required for
   polynomial normalization (poly norm).
   The original _f16Vec does not include the sum-of-powers computation or
   in-place polynomial normalization logic. */
 template <typename scalar_t, int width>
 struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
  using Base = _f16Vec<scalar_t, width>;
  using Converter = typename Base::Converter;
  using T1 = typename Base::T1;
  using T2 = typename Base::T2;
  using Base::data;
  __device__ auto sum_pows() const {
    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
 #pragma unroll
    for (int i = 0; i < width; i += 2) {
      float2 z = Converter::convert(T2{data[i], data[i + 1]});
      float x2 = z.x * z.x;
      float x4 = x2 * x2;
      float x6 = x4 * x2;
      float y2 = z.y * z.y;
      float y4 = y2 * y2;
      float y6 = y4 * y2;
      s2 += x2 + y2;
      s4 += x4 + y4;
      s6 += x6 + y6;
    }
    return std::make_tuple(s2, s4, s6);
  }
  __device__ void poly_norm_inplace(const float w2_inv_std,
                                    const float w1_inv_std2,
                                    const float w0_inv_std3, const float bias) {
 #pragma unroll
    for (int i = 0; i < width; i += 2) {
      float2 z = Converter::convert(T2{data[i], data[i + 1]});
      float x2 = z.x * z.x;
      float x3 = x2 * z.x;
      z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
      float y2 = z.y * z.y;
      float y3 = y2 * z.y;
      z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
      auto out = Converter::convert(z);
      data[i] = out.x;
      data[i + 1] = out.y;
    }
  }
 };
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
 poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
                 const scalar_t* __restrict__ input,   // [..., hidden_size]
                 const scalar_t* __restrict__ weight,  // [3]
                 const scalar_t* __restrict__ bias,    // [1]
                 const float epsilon, const int hidden_size) {
  // Sanity checks on our vector struct and type-punned pointer arithmetic
  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
  /* These and the argument pointers are all declared `restrict` as they are
     not aliased in practice. Argument pointers should not be dereferenced
     in this kernel as that would be undefined behavior */
  auto* __restrict__ input_v =
      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
  const int vec_hidden_size = hidden_size / width;
  float variance = 0.0f;
  float variance2 = 0.0f;
  float variance3 = 0.0f;
  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    int id = blockIdx.x * vec_hidden_size + idx;
    _f16VecPN<scalar_t, width> temp = input_v[id];
    auto [x2, x4, x6] = temp.sum_pows();
    variance += x2;
    variance2 += x4;
    variance3 += x6;
  }
  float3 thread_variances = make_float3(variance, variance2, variance3);
  struct SumOp {
    __device__ float3 operator()(const float3& a, const float3& b) const {
      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
    }
  };
  using BlockReduce = cub::BlockReduce<float3, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
  float3 block_variances =
      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
  variance = block_variances.x;
  variance2 = block_variances.y;
  variance3 = block_variances.z;
  __shared__ float s_w2_inv_std;
  __shared__ float s_w1_inv_std2;
  __shared__ float s_w0_inv_std3;
  __shared__ float s_bias;
  if (threadIdx.x == 0) {
    float w0 = (float)weight[0];
    float w1 = (float)weight[1];
    float w2 = (float)weight[2];
    s_bias = (float)bias[0];
    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
  }
  __syncthreads();
  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    int id = blockIdx.x * vec_hidden_size + idx;
    _f16VecPN<scalar_t, width> temp = input_v[id];
    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
    out_v[id] = temp;
  }
 }
 /* Generic poly_norm_kernel
   The width field is not used here but necessary for other specializations.
 */
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
 poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
                 const scalar_t* __restrict__ input,   // [..., hidden_size]
                 const scalar_t* __restrict__ weight,  // [3]
                 const scalar_t* __restrict__ bias,    // [1]
                 const float epsilon, const int hidden_size) {
  float variance = 0.0f;
  float variance2 = 0.0f;
  float variance3 = 0.0f;
  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
    float x = (float)input[blockIdx.x * hidden_size + idx];
    float x2 = x * x;
    float x4 = x2 * x2;
    float x6 = x4 * x2;
    variance += x2;
    variance2 += x4;
    variance3 += x6;
  }
  float3 thread_variances = make_float3(variance, variance2, variance3);
  struct SumOp {
    __device__ float3 operator()(const float3& a, const float3& b) const {
      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
    }
  };
  using BlockReduce = cub::BlockReduce<float3, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
  float3 block_variances =
      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
  variance = block_variances.x;
  variance2 = block_variances.y;
  variance3 = block_variances.z;
  __shared__ float s_w2_inv_std;
  __shared__ float s_w1_inv_std2;
  __shared__ float s_w0_inv_std3;
  __shared__ float s_bias;
  if (threadIdx.x == 0) {
    float w0 = (float)weight[0];
    float w1 = (float)weight[1];
    float w2 = (float)weight[2];
    s_bias = (float)bias[0];
    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
  }
  __syncthreads();
  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
    float x = (float)input[blockIdx.x * hidden_size + idx];
    float x2 = x * x;
    float x3 = x2 * x;
    out[blockIdx.x * hidden_size + idx] =
        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
                   s_bias);
  }
 }
 }  // namespace vllm
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
@@ -352,18 +159,26 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
  TORCH_CHECK(weight.is_contiguous());
  int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
+
-  int64_t input_stride = input.stride(-2);
+  // We cannot just use `input.stride(-2)` if the tensor is not row-major.
  // Instead, we use a 2d view to get the second-innermost stride.
  // That way the dimensions (except the last one) can be arbitrarily permuted.
  torch::Tensor input_view = input.view({-1, hidden_size});
  int num_tokens = input_view.numel() / hidden_size;
  int64_t input_stride = input_view.stride(-2);
  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
+  VLLM_DISPATCH_FLOATING_TYPES(
-    vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
+      input_view.scalar_type(), "rms_norm_kernel", [&] {
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
+        vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
-        weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
+            out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
-  });
+            input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
            hidden_size);
      });
 }
 #define LAUNCH_FUSED_ADD_RMS_NORM(width)                                    \
@@ -380,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                        torch::Tensor& residual,  // [..., hidden_size]
                        torch::Tensor& weight,    // [hidden_size]
                        double epsilon) {
  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
  TORCH_CHECK(input.scalar_type() == residual.scalar_type());
  TORCH_CHECK(residual.is_contiguous());
  TORCH_CHECK(weight.is_contiguous());
  int hidden_size = input.size(-1);
@@ -414,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                          wt_ptr % req_alignment_bytes == 0;
  bool offsets_are_multiple_of_vector_width =
      hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
      !batch_invariant_launch) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
@@ -422,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
    LAUNCH_FUSED_ADD_RMS_NORM(0);
  }
 }
 #define LAUNCH_FUSED_POLY_NORM(width)                                         \
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
        hidden_size);                                                         \
  });
 void poly_norm(torch::Tensor& out,     // [..., hidden_size]
               torch::Tensor& input,   // [..., hidden_size]
               torch::Tensor& weight,  // [3]
               torch::Tensor& bias,    // [1]
               double epsilon) {
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.data_ptr() != input.data_ptr());
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;
  dim3 grid(num_tokens);
  /* This kernel is memory-latency bound in many scenarios.
     When num_tokens is large, a smaller block size allows
     for increased block occupancy on CUs and better latency
     hiding on global mem ops. */
  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
  dim3 block(std::min(hidden_size, max_block_size));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  /*If the tensor types are FP16/BF16, try to use the optimized kernel
    with packed + vectorized ops.
    Max optimization is achieved with a width-8 vector of FP16/BF16s
    since we can load at most 128 bits at once in a global memory op.
    However, this requires each tensor's data to be aligned to 16
    bytes.
   */
  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
  if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
    LAUNCH_FUSED_POLY_NORM(8);
  } else {
    LAUNCH_FUSED_POLY_NORM(0);
  }
 }
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -10,6 +10,7 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
 #include "quantization/vectorization_utils.cuh"
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
  __shared__ float s_variance;
  float variance = 0.0f;
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+  const scalar_t* input_row = input + blockIdx.x * input_stride;
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+
  constexpr int VEC_SIZE = 8;
  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
 #pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
      float x = static_cast<float>(vec.val[i]);
      variance += x * x;
    }
  };
  auto scalar_op = [&variance](const scalar_t& val) {
    float x = static_cast<float>(val);
    variance += x * x;
-  }
+  };
  vllm::vectorize_read_with_alignment<VEC_SIZE>(
      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
@@ -216,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
    double epsilon) {
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(residual.is_contiguous());
  TORCH_CHECK(residual.scalar_type() == input.scalar_type());
  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
  int hidden_size = input.size(-1);
  int input_stride = input.stride(-2);
  int num_tokens = input.numel() / hidden_size;
@@ -241,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
  bool ptrs_are_aligned =
      inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
      !batch_invariant_launch) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1 @@`
							`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml`