Compare commits
4 Commits
v0.13.0rc3 ... v0.11.2

| Author | SHA1 | Date |
|---|---|---|
|  | 275de34170 |  |
|  | fa3ffb4365 |  |
|  | 6d5974369c |  |
|  | 0ce9990d2c |  |
Removed file (@@ -1,24 +0,0 @@):

```yaml
name: vllm_ci
job_dirs:
  - ".buildkite/test_areas"
  - ".buildkite/image_build"
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/cuda.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"
run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
repositories:
  main: "vllm-ci-postmerge-repo"
  premerge: "vllm-ci-test-repo"
```
.buildkite/generate_index.py (new file, 46 lines, @@ -0,0 +1,46 @@):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os

template = """<!DOCTYPE html>
<html>
    <body>
        <h1>Links for vLLM</h1/>
        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
    </body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
```
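The wheel-name handling is the whole trick in this script. A quick illustration of what the two `replace()` chains produce (the wheel name below is made up for the example, not taken from the diff):

```python
# Illustrative only: a hypothetical x86_64 wheel name, mapped the same way the script does.
name = "vllm-0.0.0+cu129-cp38-abi3-manylinux1_x86_64.whl"

arm = name.replace("x86_64", "aarch64").replace("manylinux1", "manylinux2014")
assert arm == "vllm-0.0.0+cu129-cp38-abi3-manylinux2014_aarch64.whl"

# CloudFront requires escaping '+' in the href, hence the "%2B" substitution.
assert name.replace("+", "%2B") == "vllm-0.0.0%2Bcu129-cp38-abi3-manylinux1_x86_64.whl"
```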
Removed file (@@ -1,56 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 8 ]]; then
    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
    merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
    merge_base_commit_build_args=""
fi

# build
docker buildx build --file docker/Dockerfile \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --build-arg USE_SCCACHE=1 \
    --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
    --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
    --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
    ${merge_base_commit_build_args} \
    --cache-from type=registry,ref=${CACHE_FROM},mode=max \
    --cache-to type=registry,ref=${CACHE_TO},mode=max \
    --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
    $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
    --push \
    --target test \
    --progress plain .
```
Removed file (@@ -1,57 +0,0 @@):

```yaml
group: Abuild
steps:
- label: ":docker: Build image"
  key: image-build
  depends_on: []
  commands:
  - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2

- label: ":docker: Build CPU image"
  key: image-build-cpu
  depends_on: []
  commands:
  - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2

- label: ":docker: Build HPU image"
  soft_fail: true
  depends_on: []
  key: image-build-hpu
  commands:
  - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2

- label: ":docker: Build CPU arm64 image"
  key: cpu-arm64-image-build
  depends_on: []
  optional: true
  commands:
  - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2
```
Removed file (@@ -1,36 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --build-arg VLLM_CPU_AVX512BF16=true \
    --build-arg VLLM_CPU_AVX512VNNI=true \
    --build-arg VLLM_CPU_AMXBF16=true \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
    --target vllm-test \
    --progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
```
Removed file (@@ -1,33 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
    --target vllm-test \
    --progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
```
Removed file (@@ -1,34 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build \
    --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
    --progress plain \
    https://github.com/vllm-project/vllm-gaudi.git

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
```
```diff
@@ -8,4 +8,3 @@ tasks:
     value: 0.80
   limit: 250 # will run on 250 * 14 subjects = 3500 samples
   num_fewshot: 5
-  rtol: 0.05
```
```diff
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
```
```diff
@@ -9,40 +9,11 @@ pytest -s -v test_lm_eval_correctness.py \
     --tp-size=1
 """
 
-import os
-from contextlib import contextmanager
-
 import lm_eval
 import numpy as np
 import yaml
 
-DEFAULT_RTOL = 0.08
-
-
-@contextmanager
-def scoped_env_vars(new_env: dict[str, str]):
-    if not new_env:
-        # Fast path: nothing to do
-        yield
-        return
-
-    old_values = {}
-    new_keys = []
-
-    try:
-        for key, value in new_env.items():
-            if key in os.environ:
-                old_values[key] = os.environ[key]
-            else:
-                new_keys.append(key)
-            os.environ[key] = str(value)
-        yield
-    finally:
-        # Restore / clean up
-        for key, value in old_values.items():
-            os.environ[key] = value
-        for key in new_keys:
-            os.environ.pop(key, None)
+RTOL = 0.08
 
 
 def launch_lm_eval(eval_config, tp_size):
```
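For context, a minimal usage sketch of the removed `scoped_env_vars` helper (not part of the diff; it assumes the function shown above is in scope):

```python
import os

# Overrides apply only inside the `with` block and are restored (or removed) on exit.
os.environ.pop("VLLM_CPU_SGL_KERNEL", None)
with scoped_env_vars({"VLLM_CPU_SGL_KERNEL": "1"}):
    assert os.environ["VLLM_CPU_SGL_KERNEL"] == "1"
assert "VLLM_CPU_SGL_KERNEL" not in os.environ
```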
```diff
@@ -61,26 +32,23 @@ def launch_lm_eval(eval_config, tp_size):
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
     )
-
-    env_vars = eval_config.get("env_vars", None)
-    with scoped_env_vars(env_vars):
-        results = lm_eval.simple_evaluate(
-            model=backend,
-            model_args=model_args,
-            tasks=[task["name"] for task in eval_config["tasks"]],
-            num_fewshot=eval_config["num_fewshot"],
-            limit=eval_config["limit"],
-            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-            # text models. however, this is regressing measured strict-match for
-            # existing text models in CI, so only apply it for mm, or explicitly set
-            apply_chat_template=eval_config.get(
-                "apply_chat_template", backend == "vllm-vlm"
-            ),
-            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-            gen_kwargs=eval_config.get("gen_kwargs"),
-            batch_size=batch_size,
-        )
+    results = lm_eval.simple_evaluate(
+        model=backend,
+        model_args=model_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
+        batch_size=batch_size,
+    )
 
     return results
```
```diff
@@ -89,8 +57,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 
     results = launch_lm_eval(eval_config, tp_size)
 
-    rtol = eval_config.get("rtol", DEFAULT_RTOL)
-
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
```
```diff
@@ -98,9 +64,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(
                 f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth:.3f} | "
-                f"measured={measured_value:.3f} | rtol={rtol}"
+                f"ground_truth={ground_truth} | measured={measured_value}"
             )
-            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
 
         assert success
```
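Both sides of this hunk end in the same relative-tolerance check. A small illustration of what `np.isclose` accepts with the fixed 0.08 tolerance (the values here are made up, not from any config):

```python
import numpy as np

# np.isclose applies rtol relative to its second argument (the measured value),
# so 0.75 passes against a ground truth of 0.80 while 0.70 does not.
assert np.isclose(0.80, 0.75, rtol=0.08)
assert not np.isclose(0.80, 0.70, rtol=0.08)
```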
````diff
@@ -108,65 +108,6 @@ The number of this test is less stable compared to the delay and latency benchma
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
-#### Default Parameters Field
-
-We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
-
-<details>
-<summary> An Example of default parameters field </summary>
-
-```json
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "server_environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
-    },
-    "server_parameters": {
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "block_size": 128,
-      "disable_log_stats": "",
-      "load_format": "dummy"
-    },
-    "client_parameters": {
-      "backend": "vllm",
-      "dataset_name": "random",
-      "random-input-len": 128,
-      "random-output-len": 128,
-      "num_prompts": 200,
-      "ignore-eos": ""
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama3B_tp2_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 2,
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-      }
-    },
-    {
-      "test_name": "serving_qwen3_tp4_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-14B",
-        "tensor_parallel_size": 4,
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-14B",
-      }
-    },
-  ]
-}
-```
-
-</details>
-
 ### Visualizing the results
 
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
````
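The removed section above describes how the `defaults` field is folded into each test case. A minimal sketch of that behaviour in Python (the helper name `apply_defaults` and the example values are illustrative, not part of the repository):

```python
def apply_defaults(defaults: dict, test: dict) -> dict:
    """Fold global defaults into one test case; keys set by the test itself win."""
    merged = dict(test)
    merged.setdefault("qps_list", defaults.get("qps_list"))
    merged.setdefault("max_concurrency_list", defaults.get("max_concurrency_list"))
    for key in ("server_environment_variables", "server_parameters", "client_parameters"):
        merged[key] = {**defaults.get(key, {}), **test.get(key, {})}
    return merged


merged = apply_defaults(
    {"qps_list": ["inf"], "server_parameters": {"tensor_parallel_size": 1, "dtype": "bfloat16"}},
    {
        "test_name": "serving_llama3B_tp2_random_128_128",
        "server_parameters": {"model": "meta-llama/Llama-3.2-3B-Instruct", "tensor_parallel_size": 2},
    },
)
assert merged["server_parameters"]["tensor_parallel_size"] == 2  # test overrides the default
assert merged["server_parameters"]["dtype"] == "bfloat16"        # default fills the gap
assert merged["qps_list"] == ["inf"]
```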
```diff
@@ -110,8 +110,7 @@ json2envs() {
 wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
-  local timeout_val="1200"
-  timeout "$timeout_val" bash -c '
+  timeout 1200 bash -c '
     until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
```
```diff
@@ -317,44 +316,12 @@ run_throughput_tests() {
 run_serving_tests() {
   # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
-  #
-  # Supported JSON formats:
-  # 1) Plain format: top-level array
-  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #
-  # 2) Default parameters field + plain format tests
-  #    {
-  #      "defaults": { ... },
-  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #    }
 
   local serving_test_file
   serving_test_file=$1
 
   # Iterate over serving tests
-  jq -c '
-    if type == "array" then
-      # Plain format: test cases array
-      .[]
-    elif (type == "object" and has("tests")) then
-      # merge the default parameters into each test cases
-      . as $root
-      | ($root.defaults // {}) as $d
-      | ($root.tests // [])[]
-      # default qps / max_concurrency from defaults if missing
-      | .qps_list = (.qps_list // $d.qps_list)
-      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
-      # merge envs / params: test overrides defaults
-      | .server_environment_variables =
-          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
-      | .server_parameters =
-          (($d.server_parameters // {}) + (.server_parameters // {}))
-      | .client_parameters =
-          (($d.client_parameters // {}) + (.client_parameters // {}))
-    else
-      error("Unsupported serving test file format: must be array or object with .tests")
-    end
-  ' "$serving_test_file" | while read -r params; do
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
 
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
     if [[ ! "$test_name" =~ ^serving_ ]]; then
```
```diff
@@ -368,25 +335,20 @@ run_serving_tests() {
       continue
     fi
 
-    # get client and server arguments (after merged the default parameters)
+    # get client and server arguments
     server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
     client_params=$(echo "$params" | jq -r '.client_parameters')
 
     server_args=$(json2args "$server_params")
     server_envs=$(json2envs "$server_envs")
     client_args=$(json2args "$client_params")
 
-    # qps_list
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
-    # max_concurrency_list (fallback to num_prompts if missing)
     max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
     if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
       num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
       max_concurrency_list="[$num_prompts]"
     fi
     max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
     echo "Running over max concurrency list $max_concurrency_list"
```
New file (@@ -0,0 +1,610 @@):

```json
[
  {
    "test_name": "serving_llama8B_bf16_tp1_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_bf16_tp2_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_bf16_tp4_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_bf16_tp1_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_bf16_tp2_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_bf16_tp4_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int8_tp1_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int8_tp2_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int8_tp4_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int8_tp1_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int8_tp2_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int8_tp4_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int4_tp1_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int4_tp2_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int4_tp4_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int4_tp1_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int4_tp2_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int4_tp4_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  }
]
```
.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json (new file, 1023 lines; diff suppressed because it is too large)
@@ -1,246 +1,276 @@
|
|||||||
{
|
[
|
||||||
"defaults": {
|
{
|
||||||
"qps_list": [
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
"inf"
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
],
|
"max_concurrency_list": [32],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"server_environment_variables": {
|
||||||
"server_environment_variables": {
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
{
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
"tensor_parallel_size": 1,
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"dtype": "bfloat16",
|
"max_concurrency_list": [32],
|
||||||
"distributed_executor_backend": "mp",
|
"server_environment_variables": {
|
||||||
"block_size": 128,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"trust_remote_code": "",
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"disable_log_stats": "",
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"enforce_eager": "",
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"max_num_batched_tokens": 2048,
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
"max_num_seqs": 256,
|
},
|
||||||
"load_format": "dummy"
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
{
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
"backend": "vllm",
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"ignore-eos": "",
|
"max_concurrency_list": [32],
|
||||||
"num_prompts": 200
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
  }
  },
]
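Each entry above pairs a `qps_list` with a `max_concurrency_list`, so a single test fans out into one benchmark run per (request rate, concurrency) combination. A minimal sketch of that expansion, assuming the config is a top-level JSON array saved as `serving-tests-cpu.json` (the path and runner logic are illustrative, not code from this repository):

# Illustrative only: expand one benchmark test entry into individual client runs.
import itertools
import json

with open("serving-tests-cpu.json") as f:  # hypothetical config path
    tests = json.load(f)

for test in tests:
    for qps, concurrency in itertools.product(
        test.get("qps_list", ["inf"]), test.get("max_concurrency_list", [None])
    ):
        # each (qps, concurrency) pair becomes one benchmark-client invocation
        print(test["test_name"], "request rate:", qps, "max concurrency:", concurrency)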
"tests": [
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
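In these configs an empty-string value (for example `"enforce_eager": ""` in the earlier block) conventionally marks a boolean switch, while other values are passed with an argument. A hedged sketch of how `server_parameters` might be turned into `vllm serve` CLI flags, assuming the usual underscore-to-dash conversion (the helper name is made up, not code from this repository):

# Illustrative only: turn a server_parameters dict into CLI arguments.
def server_args(params: dict) -> list[str]:
    args = []
    for key, value in params.items():
        if key == "model":
            continue  # the model is typically the positional argument
        flag = "--" + key.replace("_", "-")
        if value == "":
            args.append(flag)          # empty string => boolean switch
        else:
            args.extend([flag, str(value)])
    return args

print(server_args({"tensor_parallel_size": 2, "dtype": "bfloat16", "enforce_eager": ""}))
# ['--tensor-parallel-size', '2', '--dtype', 'bfloat16', '--enforce-eager']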
@@ -8,28 +8,13 @@ steps:
  commands:
    # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

- label: "Build arm64 wheel - CUDA 13.0"
  depends_on: ~
  id: build-wheel-arm64-cuda-13-0
  agents:
    queue: arm64_cpu_queue_postmerge
  commands:
    # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
  env:
    DOCKER_BUILDKIT: "1"

# aarch64 build
- label: "Build arm64 CPU wheel"
  depends_on: ~

@@ -40,11 +25,24 @@ steps:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
  depends_on: ~
  id: build-wheel-cuda-12-8
  agents:
    queue: cpu_queue_postmerge
  commands:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

- label: "Build wheel - CUDA 12.9"
  depends_on: ~
  id: build-wheel-cuda-12-9

@@ -54,7 +52,7 @@ steps:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

@@ -67,21 +65,7 @@ steps:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

# x86 CPU wheel build
- label: "Build x86 CPU wheel"
  depends_on: ~
  id: build-wheel-x86-cpu
  agents:
    queue: cpu_queue_postmerge
  commands:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
  env:
    DOCKER_BUILDKIT: "1"

@@ -125,6 +109,7 @@ steps:
- label: "Annotate release workflow"
  depends_on:
    - create-multi-arch-manifest
    - build-wheel-cuda-12-8
  id: annotate-release-workflow
  agents:
    queue: cpu_queue_postmerge
@@ -23,8 +23,8 @@ To download the wheel (by version):
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
```

To download and upload the image:

@@ -45,9 +45,8 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
```
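The `+cu126`/`+cu129`/`+cu130` variant suffixes contain a literal `+`, which is fine for `aws s3 cp` but must be percent-encoded as `%2B` when the same object name appears in a plain HTTPS URL (the index generator below does this with `urllib.parse.quote`). A small illustrative sketch, not part of the release tooling:

# Illustrative only: percent-encode the '+' in a variant wheel filename for a URL.
from urllib.parse import quote

wheel = "vllm-0.11.2+cu129-cp38-abi3-manylinux1_x86_64.whl"  # example filename
print(quote(wheel))  # vllm-0.11.2%2Bcu129-cp38-abi3-manylinux1_x86_64.whl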
@@ -1,400 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# do not complain about line length (for docstring)
# ruff: noqa: E501

import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote

import regex as re

if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""


@dataclass
class WheelFileInfo:
    package_name: str
    version: str
    build_tag: str | None
    python_tag: str
    abi_tag: str
    platform_tag: str
    variant: str | None
    filename: str


def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Parse wheel file name to extract metadata.

    The format of wheel names:
    {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
    Example:
    vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
    vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
    vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
    vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
    """
    wheel_file_re = re.compile(
        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
    )
    match = wheel_file_re.match(file)
    if not match:
        raise ValueError(f"Invalid wheel file name: {file}")

    package_name = match.group("package_name")
    version = match.group("version")
    build_tag = match.group("build_tag")
    python_tag = match.group("python_tag")
    abi_tag = match.group("abi_tag")
    platform_tag = match.group("platform_tag")

    # extract variant from version
    variant = None
    if "dev" in version:
        ver_after_dev = version.split("dev")[-1]
        if "." in ver_after_dev:
            variant = ver_after_dev.split(".")[-1]
            version = version.removesuffix("." + variant)
    else:
        if "+" in version:
            version, variant = version.split("+")

    return WheelFileInfo(
        package_name=package_name,
        version=version,
        build_tag=build_tag,
        python_tag=python_tag,
        abi_tag=abi_tag,
        platform_tag=platform_tag,
        variant=variant,
        filename=file,
    )
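# --- Illustrative usage (editor's note, not part of the original script) ------
# parse_from_filename() splits the variant out of the version, so a cu130 dev
# wheel parses roughly like this:
#
#   info = parse_from_filename(
#       "vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl"
#   )
#   info.version       -> "0.11.1rc8.dev14+gaa384b3c0"
#   info.variant       -> "cu130"
#   info.platform_tag  -> "manylinux1_x86_64"
# -------------------------------------------------------------------------------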
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Generate project list HTML content linking to each project & variant sub-directory.
    """
    href_tags = []
    for name in sorted(subdir_names):
        name = name.strip("/").strip(".")
        href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)


def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Generate package index HTML content for a specific package, linking to actual wheel files.
    """
    href_tags = []
    metadata = []
    for file in sorted(wheel_files, key=lambda x: x.filename):
        relative_path = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
        )
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
        href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
        file_meta = asdict(file)
        file_meta["path"] = file_path_quoted
        metadata.append(file_meta)
    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
    metadata_str = json.dumps(metadata, indent=2)
    return index_str, metadata_str


def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
):
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str | None): Optional comment to include in the generated HTML files.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
    index_base_dir/  (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
        index.html           # project list, linking to "vllm/" and other packages, and all variant sub-directories
        vllm/
            index.html       # package index, pointing to actual files in wheel_base_dir (relative path)
            metadata.json    # machine-readable metadata for all wheels in this package
        cpu/                 # cpu variant sub-directory
            index.html
            vllm/
                index.html
                metadata.json
        cu129/               # cu129 is actually the alias to default variant
            index.html
            vllm/
                index.html
                metadata.json
        cu130/               # cu130 variant sub-directory
            index.html
            vllm/
                index.html
                metadata.json
        ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
    [
        {
            "package_name": "vllm",
            "version": "0.10.2rc2",
            "build_tag": null,
            "python_tag": "cp38",
            "abi_tag": "abi3",
            "platform_tag": "manylinux2014_aarch64",
            "variant": "cu129",
            "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
            "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # to be concatenated with the directory URL and URL-encoded
        },
        ...
    ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant = file.variant or "default"
        if variant not in variant_to_files:
            variant_to_files[variant] = []
        variant_to_files[variant].append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate comment in HTML header
    comment_str = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"

    # Generate index for each variant
    subdir_names = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant
        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names = subdir_names.union(packages)
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)

        for package in packages:
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir, comment
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)
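# --- Illustrative invocation (editor's note, not part of the original script) --
# With one default wheel and one cu130 wheel for a commit, something like:
#
#   generate_index_and_metadata(
#       whl_files=[
#           "vllm-0.11.2-cp38-abi3-manylinux1_x86_64.whl",
#           "vllm-0.11.2+cu130-cp38-abi3-manylinux1_x86_64.whl",
#       ],
#       wheel_base_dir=Path("wheels/<commit>"),   # hypothetical local mirror path
#       index_base_dir=Path("indices"),
#       alias_to_default="cu129",
#   )
#
# would write the default package index at indices/, an indices/cu129/ alias of
# that default index, and an indices/cu130/ variant index, all with relative
# links back to the wheel directory.
# -------------------------------------------------------------------------------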
if __name__ == "__main__":
|
|
||||||
"""
|
|
||||||
Arguments:
|
|
||||||
--version <version> : version string for the current build (e.g., commit hash)
|
|
||||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
|
||||||
--output-dir <output_directory> : directory to store generated index files
|
|
||||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
|
||||||
--comment <comment_string> : (optional) comment string to include in generated HTML files
|
|
||||||
"""
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Process nightly build wheel files to generate indices."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--version",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="Version string for the current build (e.g., commit hash)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--current-objects",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="Path to JSON file containing current S3 objects listing in this version directory",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--output-dir",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="Directory to store generated index files",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--alias-to-default",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Alias variant name for the default variant",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--comment",
|
|
||||||
type=str,
|
|
||||||
default="",
|
|
||||||
help="Optional comment string to include in generated HTML files",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
version = args.version
|
|
||||||
if "/" in version or "\\" in version:
|
|
||||||
raise ValueError("Version string must not contain slashes.")
|
|
||||||
current_objects_path = Path(args.current_objects)
|
|
||||||
output_dir = Path(args.output_dir)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# Read current objects JSON
|
|
||||||
with open(current_objects_path) as f:
|
|
||||||
current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
|
|
||||||
|
|
||||||
# current_objects looks like from list_objects_v2 S3 API:
|
|
||||||
"""
|
|
||||||
"Contents": [
|
|
||||||
{
|
|
||||||
"Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
|
|
||||||
"LastModified": "2025-11-28T14:00:32+00:00",
|
|
||||||
"ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
|
|
||||||
"ChecksumAlgorithm": [
|
|
||||||
"CRC64NVME"
|
|
||||||
],
|
|
||||||
"ChecksumType": "FULL_OBJECT",
|
|
||||||
"Size": 435649349,
|
|
||||||
"StorageClass": "STANDARD"
|
|
||||||
},
|
|
||||||
...
|
|
||||||
]
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Extract wheel file keys
|
|
||||||
wheel_files = []
|
|
||||||
for item in current_objects.get("Contents", []):
|
|
||||||
key: str = item["Key"]
|
|
||||||
if key.endswith(".whl"):
|
|
||||||
wheel_files.append(key.split("/")[-1]) # only the filename is used
|
|
||||||
|
|
||||||
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
|
||||||
|
|
||||||
# keep only "official" files for a non-nightly version (specifed by cli args)
|
|
||||||
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
|
|
||||||
if PY_VERSION_RE.match(version):
|
|
||||||
# upload-wheels.sh ensures no "dev" is in args.version
|
|
||||||
wheel_files = list(
|
|
||||||
filter(lambda x: version in x and "dev" not in x, wheel_files)
|
|
||||||
)
|
|
||||||
print(f"Non-nightly version detected, wheel files used: {wheel_files}")
|
|
||||||
else:
|
|
||||||
print("Nightly version detected, keeping all wheel files.")
|
|
||||||
|
|
||||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
|
||||||
# s3://vllm-wheels/{version}/<wheel files>
|
|
||||||
# s3://vllm-wheels/<anything>/<index files>
|
|
||||||
wheel_base_dir = Path(output_dir).parent / version
|
|
||||||
index_base_dir = Path(output_dir)
|
|
||||||
|
|
||||||
generate_index_and_metadata(
|
|
||||||
whl_files=wheel_files,
|
|
||||||
wheel_base_dir=wheel_base_dir,
|
|
||||||
index_base_dir=index_base_dir,
|
|
||||||
default_variant=None,
|
|
||||||
alias_to_default=args.alias_to_default,
|
|
||||||
comment=args.comment.strip(),
|
|
||||||
)
|
|
||||||
print(f"Successfully generated index and metadata in {output_dir}")
|
|
||||||
@@ -1,68 +0,0 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}

export CMAKE_BUILD_PARALLEL_LEVEL=16

# Setup cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .

# Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test

function cpu_tests() {
  set -e

  docker exec cpu-test bash -c "
    set -e
    pip list"

  # offline inference
  docker exec cpu-test bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"

  # Run kernel tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"

  # basic online serving
  docker exec cpu-test bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model Qwen/Qwen3-0.6B \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
}

# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2h bash -c cpu_tests
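The online-serving step above starts `vllm serve` in the background and polls `/v1/models` until it responds before launching the benchmark client. A hedged Python equivalent of that readiness loop, for reference only (the helper name is made up, not part of the CI scripts):

# Illustrative only: poll a server endpoint until it answers or a deadline passes.
import time
import urllib.request

def wait_for_server(url: str = "http://localhost:8000/v1/models", timeout_s: int = 600) -> None:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5):
                return  # server is up
        except OSError:
            time.sleep(1)  # not ready yet, retry
    raise TimeoutError(f"server at {url} did not become ready in {timeout_s}s")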
@@ -25,22 +25,20 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator tblib
    pip install sentence-transformers datamodel_code_generator

    # Note: disable Bart until supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -73,11 +73,12 @@ function cpu_tests() {
  pytest -x -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

  # Run AWQ/GPTQ test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
      tests/quantization/test_cpu_wna16.py"

  # Note: disable it until supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
  #   pytest -x -s -v \
  #     tests/quantization/test_ipex_quant.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
@@ -74,7 +74,6 @@ FROM ${BASE_IMAGE_NAME}

# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"

RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
@@ -35,10 +35,9 @@ docker run \
  echo $ZE_AFFINITY_MASK
  pip install tblib==3.1.0
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
  python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
  VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  cd tests
  pytest -v -s v1/core
@@ -47,6 +46,6 @@ docker run \
  pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
  pytest -v -s v1/structured_output
  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
  pytest -v -s v1/test_serial_utils.py
  '
@@ -12,11 +12,6 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
  echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
  exit 0
fi

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
@@ -17,17 +17,7 @@ wait_for_server() {
}

MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -2,9 +2,9 @@
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8040}
PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

@@ -16,18 +16,8 @@ wait_for_server() {
  done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -46,14 +36,11 @@ for BACK in "${BACKENDS[@]}"; do
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
@@ -1,74 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="QWen/Qwen3-30B-A3B-FP8"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -2,28 +2,6 @@
|
|||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# ======== part 0: setup ========
|
|
||||||
|
|
||||||
BUCKET="vllm-wheels"
|
|
||||||
INDICES_OUTPUT_DIR="indices"
|
|
||||||
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
|
|
||||||
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
|
|
||||||
SUBPATH=$BUILDKITE_COMMIT
|
|
||||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
|
||||||
|
|
||||||
# detect if python3.10+ is available
|
|
||||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
|
|
||||||
if [[ "$has_new_python" -eq 0 ]]; then
|
|
||||||
# use new python from docker
|
|
||||||
docker pull python:3-slim
|
|
||||||
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Using python interpreter: $PYTHON"
|
|
||||||
echo "Python version: $($PYTHON --version)"
|
|
||||||
|
|
||||||
# ========= part 1: collect, rename & upload the wheel ==========
|
|
||||||
|
|
||||||
# Assume wheels are in artifacts/dist/*.whl
|
# Assume wheels are in artifacts/dist/*.whl
|
||||||
wheel_files=(artifacts/dist/*.whl)
|
wheel_files=(artifacts/dist/*.whl)
|
||||||
|
|
||||||
@@ -32,76 +10,74 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
 echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
 exit 1
 fi

+# Get the single wheel file
 wheel="${wheel_files[0]}"

-# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
-# we also accept params as manylinux tag
-# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="${1:-manylinux_2_31}"
+# Detect architecture and rename 'linux' to appropriate manylinux version
+arch=$(uname -m)
+if [[ $arch == "x86_64" ]]; then
+manylinux_version="manylinux1"
+elif [[ $arch == "aarch64" ]]; then
+manylinux_version="manylinux2014"
+else
+echo "Warning: Unknown architecture $arch, using manylinux1 as default"
+manylinux_version="manylinux1"
+fi

 # Rename 'linux' to the appropriate manylinux version in the wheel filename
-if [[ "$wheel" != *"linux"* ]]; then
-echo "Error: Wheel filename does not contain 'linux': $wheel"
-exit 1
-fi
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
-echo "Renamed wheel to: $wheel"

 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version in wheel: $version"
-pure_version="${version%%+*}"
-echo "Pure version (without variant): $pure_version"
+echo "Version: $version"

-# copy wheel to its own bucket
-aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+normal_wheel="$wheel" # Save the original wheel filename

-# ========= part 2: generate and upload indices ==========
-# generate indices for all existing wheels in the commit directory
-# this script might be run multiple times if there are multiple variants being built
-# so we need to guarantee there is little chance for "TOCTOU" issues
-# i.e., one process is generating indices while another is uploading a new wheel
-# so we need to ensure no time-consuming operations happen below
+# If the version contains "dev", rename it to v1.0.0.dev for consistency
+if [[ $version == *dev* ]]; then
+suffix="${version##*.}"
+if [[ $suffix == cu* ]]; then
+new_version="1.0.0.dev+${suffix}"
+else
+new_version="1.0.0.dev"
+fi
+new_wheel="${wheel/$version/$new_version}"
+# use cp to keep both files in the artifacts directory
+cp -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+version="$new_version"
+fi

-# list all wheels in the commit directory
-echo "Existing wheels on S3:"
-aws s3 ls "$S3_COMMIT_PREFIX"
-obj_json="objects.json"
-aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
-mkdir -p "$INDICES_OUTPUT_DIR"
+# Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"

-# call script to generate indicies for all existing wheels
-# this indices have relative paths that could work as long as it is next to the wheel directory in s3
-# i.e., the wheels are always in s3://vllm-wheels/<commit>/
-# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
-if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
-alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+# generate index for this commit
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+if [[ $normal_wheel == *"cu129"* ]]; then
+# only upload index.html for cu129 wheels (default wheels) as it
+# is available on both x86 and arm64
+aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 else
-alias_arg=""
+echo "Skipping index files for non-cu129 wheels"
 fi

-# HACK: we do not need regex module here, but it is required by pre-commit hook
-# To avoid any external dependency, we simply replace it back to the stdlib re module
-sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
+# generate index for nightly
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-# copy indices to /<commit>/ unconditionally
-echo "Uploading indices to $S3_COMMIT_PREFIX"
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
-# copy to /nightly/ only if it is on the main branch and not a PR
-if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-echo "Uploading indices to overwrite /nightly/"
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
+if [[ $normal_wheel == *"cu129"* ]]; then
+# only upload index.html for cu129 wheels (default wheels) as it
+# is available on both x86 and arm64
+aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+echo "Skipping index files for non-cu129 wheels"
 fi

-# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
-if [[ "$version" != *"dev"* ]]; then
-echo "Re-generating indices for /$pure_version/"
-rm -rf "$INDICES_OUTPUT_DIR/*"
-mkdir -p "$INDICES_OUTPUT_DIR"
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
-fi
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
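For context on the hunk above: the v0.11.2 side retags the freshly built wheel by swapping the generic 'linux' platform tag for a manylinux tag derived from the build architecture, while the v0.13.0rc3 side takes the manylinux tag as a script argument instead. A minimal stand-alone sketch of the v0.11.2 logic, assuming a hypothetical wheel filename:

    # sketch: retag a built wheel with a manylinux platform tag (filename is hypothetical)
    wheel="artifacts/dist/vllm-0.11.2-cp38-abi3-linux_x86_64.whl"
    case "$(uname -m)" in
        x86_64)  manylinux_version="manylinux1" ;;
        aarch64) manylinux_version="manylinux2014" ;;
        *)       manylinux_version="manylinux1" ;;  # same fallback the script warns about
    esac
    new_wheel="${wheel/linux/$manylinux_version}"
    mv -- "$wheel" "$new_wheel"
    echo "Renamed wheel to: $new_wheel"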
@@ -39,9 +39,9 @@ steps:
 # if this test fails, it means the nightly torch version is not compatible with some
 # of the dependencies. Please check the error message and add the package to whitelist
 # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 soft_fail: true
 source_file_dependencies:
 - requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:

 - label: Async Engine, Inputs, Utils, Worker Test # 10min
 timeout_in_minutes: 15
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/multimodal
@@ -61,31 +61,25 @@ steps:
 - pytest -v -s -m 'not cpu_test' multimodal
 - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
-timeout_in_minutes: 30
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+timeout_in_minutes: 10
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/test_inputs.py
 - tests/test_outputs.py
 - tests/multimodal
 - tests/standalone_tests/lazy_imports.py
-- tests/tokenizers_
-- tests/tool_parsers
 - tests/transformers_utils
-- tests/config
 no_gpu: true
 commands:
 - python3 standalone_tests/lazy_imports.py
 - pytest -v -s test_inputs.py
 - pytest -v -s test_outputs.py
 - pytest -v -s -m 'cpu_test' multimodal
-- pytest -v -s tokenizers_
-- pytest -v -s tool_parsers
 - pytest -v -s transformers_utils
-- pytest -v -s config

 - label: Python-only Installation Test # 10min
 timeout_in_minutes: 20
@@ -117,9 +111,9 @@ steps:
 - pytest -v -s basic_correctness/test_cpu_offload.py

 - label: Entrypoints Unit Tests # 5min
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 timeout_in_minutes: 10
 working_dir: "/vllm-workspace/tests"
 fast_check: true
@@ -193,7 +187,7 @@ steps:
 - tests/distributed/test_utils
 - tests/distributed/test_pynccl
 - tests/distributed/test_events
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness
 - examples/offline_inference/rlhf.py
 - examples/offline_inference/rlhf_colocate.py
 - tests/examples/offline_inference/data_parallel.py
@@ -216,13 +210,12 @@ steps:
 # test with internal dp
 - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
 - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
 - pytest -v -s distributed/test_utils.py
-- pytest -v -s compile/fullgraph/test_basic_correctness.py
+- pytest -v -s compile/test_basic_correctness.py
 - pytest -v -s distributed/test_pynccl.py
 - pytest -v -s distributed/test_events.py
 - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -255,9 +248,9 @@ steps:
 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: EPLB Algorithm Test # 5min
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 timeout_in_minutes: 15
 working_dir: "/vllm-workspace/tests"
 source_file_dependencies:
@@ -313,25 +306,28 @@ steps:
 - pytest -v -s test_regression.py
 working_dir: "/vllm-workspace/tests" # optional

-- label: Engine Test # 9min
-timeout_in_minutes: 15
+- label: Engine Test # 25min
+timeout_in_minutes: 40
 mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
 # grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/engine
+- tests/tokenization
 - tests/test_sequence
 - tests/test_config
 - tests/test_logger
 - tests/test_vllm_port
 commands:
 - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+# OOM in the CI unless we run this separately
+- pytest -v -s tokenization

-- label: V1 Test e2e + engine # 65min
-timeout_in_minutes: 90
+- label: V1 Test e2e + engine # 30min
+timeout_in_minutes: 45
 mirror_hardwares: [amdexperimental]
-agent_pool: mi325_4
+agent_pool: mi325_1
 # grade: Blocking
 source_file_dependencies:
 - vllm/
@@ -344,9 +340,9 @@ steps:

 - label: V1 Test entrypoints # 35min
 timeout_in_minutes: 50
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/v1
@@ -394,34 +390,10 @@ steps:
 commands:
 - pytest -v -s v1/attention

-- label: Batch Invariance Tests (H100) # 10min
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_1
-timeout_in_minutes: 25
-gpu: h100
-source_file_dependencies:
-- vllm/v1/attention
-- vllm/model_executor/layers
-- tests/v1/determinism/
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pip install pytest-timeout pytest-forked
-- pytest -v -s v1/determinism/test_batch_invariance.py
-- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
-
-- label: V1 Test attention (B200) # 10min
-timeout_in_minutes: 30
-gpu: b200
-source_file_dependencies:
-- vllm/v1/attention
-- tests/v1/attention
-commands:
-- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
-
 - label: V1 Test others (CPU) # 5 mins
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/v1
@@ -437,34 +409,29 @@ steps:

 - label: Examples Test # 30min
 timeout_in_minutes: 45
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
 working_dir: "/vllm-workspace/examples"
 source_file_dependencies:
 - vllm/entrypoints
-- vllm/multimodal
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-# for basic
-- python3 offline_inference/basic/chat.py
 - python3 offline_inference/basic/generate.py --model facebook/opt-125m
 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+- python3 offline_inference/basic/chat.py
+- python3 offline_inference/prefix_caching.py
+- python3 offline_inference/llm_engine_example.py
+- python3 offline_inference/audio_language.py --seed 0
+- python3 offline_inference/vision_language.py --seed 0
+- python3 offline_inference/vision_language_pooling.py --seed 0
+- python3 offline_inference/vision_language_multi_image.py --seed 0
+- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
 - python3 offline_inference/basic/classify.py
 - python3 offline_inference/basic/embed.py
 - python3 offline_inference/basic/score.py
-# for multi-modal models
-- python3 offline_inference/audio_language.py --seed 0
-- python3 offline_inference/vision_language.py --seed 0
-- python3 offline_inference/vision_language_multi_image.py --seed 0
-- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-# for pooling models
-- python3 pooling/pooling/vision_language_pooling.py --seed 0
-# for features demo
-- python3 offline_inference/prefix_caching.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -518,7 +485,7 @@ steps:

 - label: PyTorch Compilation Unit Tests # 15min
 timeout_in_minutes: 30
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
 torch_nightly: true
@@ -526,16 +493,21 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run unit tests defined directly under compile/,
-# not including subdirectories, which are usually heavier
-# tests covered elsewhere.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+- pytest -v -s compile/test_pass_manager.py
+- pytest -v -s compile/test_fusion.py
+- pytest -v -s compile/test_fusion_attn.py
+- pytest -v -s compile/test_functionalization.py
+- pytest -v -s compile/test_silu_mul_quant_fusion.py
+# - pytest -v -s compile/test_sequence_parallelism.py
+# - pytest -v -s compile/test_async_tp.py
+- pytest -v -s compile/test_fusion_all_reduce.py
+- pytest -v -s compile/test_decorator.py
+- pytest -v -s compile/test_noop_elimination.py
+- pytest -v -s compile/test_aot_compile.py

 - label: PyTorch Fullgraph Smoke Test # 15min
 timeout_in_minutes: 30
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
 torch_nightly: true
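The removed commands in the hunk above replace a per-file pytest list with a find loop so that each top-level test file under compile/ runs in its own pytest process (per the comment referencing issue 28965). A minimal sketch of that pattern as plain shell, with an equivalent loop form; the directory name is taken from the hunk:

    # run every top-level test file in its own pytest process
    find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;

    # equivalent explicit loop, sometimes easier to read in scripts
    for f in compile/test_*.py; do
        pytest -s -v "$f"
    done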
@@ -543,11 +515,9 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run smoke tests under fullgraph directory, except test_full_graph.py
-# as it is a heavy test that is covered in other steps.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+- pytest -v -s compile/test_basic_correctness.py
+- pytest -v -s compile/test_multimodal_compile.py
+- pytest -v -s compile/piecewise/

 - label: PyTorch Fullgraph Test # 27min
 timeout_in_minutes: 40
@@ -559,10 +529,10 @@ steps:
 - vllm/
 - tests/compile
 commands:
-- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
 # Limit to no custom ops to reduce running time
 # Wrap with quotes to escape yaml and avoid starting -k string with a -
-- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"

 - label: Cudagraph test
 timeout_in_minutes: 20
@@ -591,7 +561,7 @@ steps:

 - label: Kernels Attention Test %N # 23min
 timeout_in_minutes: 35
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_8
 # grade: Blocking
 source_file_dependencies:
@@ -618,7 +588,7 @@ steps:

 - label: Kernels MoE Test %N # 40min
 timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_8
 # grade: Blocking
 source_file_dependencies:
@@ -645,26 +615,6 @@ steps:
 commands:
 - pytest -v -s kernels/mamba

-- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
-# Not replicating for CUTLAS & CuTe
-timeout_in_minutes: 45
-gpu: h100
-num_gpus: 1
-source_file_dependencies:
-- tools/install_deepgemm.sh
-- vllm/utils/deep_gemm.py
-- vllm/model_executor/layers/fused_moe
-- vllm/model_executor/layers/quantization
-- tests/kernels/quantization/test_block_fp8.py
-- tests/kernels/moe/test_deepgemm.py
-- tests/kernels/moe/test_batched_deepgemm.py
-- tests/kernels/attention/test_deepgemm_attention.py
-commands:
-- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
-- pytest -v -s kernels/moe/test_deepgemm.py
-- pytest -v -s kernels/moe/test_batched_deepgemm.py
-- pytest -v -s kernels/attention/test_deepgemm_attention.py

 - label: Model Executor Test # 23min
 timeout_in_minutes: 35
 torch_nightly: true
@@ -723,18 +673,16 @@ steps:
 # we can only upgrade after this is resolved
 # TODO(jerryzh168): resolve the above comment
 - uv pip install --system torchao==0.13.0
-- uv pip install --system conch-triton-kernels
 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

-- label: LM Eval Small Models # 53min
-timeout_in_minutes: 75
-mirror_hardwares: [amdexperimental]
+- label: LM Eval Small Models # 15min
+timeout_in_minutes: 20
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
 # grade: Blocking
 source_file_dependencies:
 - csrc/
 - vllm/model_executor/layers/quantization
-autorun_on_main: true
 commands:
 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -747,9 +695,9 @@ steps:
 - csrc/
 - vllm/entrypoints/openai/
 - vllm/model_executor/models/whisper.py
-commands: # LMEval+Transcription WER check
+commands: # LMEval
 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-- pytest -s entrypoints/openai/correctness/
+- pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py

 - label: OpenAI-Compatible Tool Use # 23 min
 timeout_in_minutes: 35
@@ -761,7 +709,19 @@ steps:
 - vllm/
 - tests/tool_use
 commands:
-- pytest -v -s tool_use
+- pytest -v -s -m 'not cpu_test' tool_use

+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+mirror_hardwares: [amdexperimental, amdproduction]
+agent_pool: mi325_1
+# grade: Blocking
+timeout_in_minutes: 10
+source_file_dependencies:
+- vllm/
+- tests/tool_use
+no_gpu: true
+commands:
+- pytest -v -s -m 'cpu_test' tool_use

 ##### models test #####

@@ -786,7 +746,6 @@ steps:
 torch_nightly: true
 source_file_dependencies:
 - vllm/model_executor/models/
-- vllm/transformers_utils/
 - tests/models/test_initialization.py
 commands:
 # Only when vLLM model source is modified - test initialization of a large
@@ -932,18 +891,6 @@ steps:
 commands:
 - pytest -v -s models/language/pooling_mteb_test

-- label: Multi-Modal Processor Test (CPU)
-timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_1
-source_file_dependencies:
-- vllm/
-- tests/models/multimodal
-no_gpu: true
-commands:
-- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

 - label: Multi-Modal Processor Test # 44min
 timeout_in_minutes: 60
 mirror_hardwares: [amdexperimental]
@@ -971,8 +918,8 @@ steps:
 - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
 - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

-- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
-timeout_in_minutes: 180
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+timeout_in_minutes: 70
 mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
 # grade: Blocking
@@ -984,8 +931,7 @@ steps:
 commands:
 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

-- label: Multi-Modal Models Test (Extended) 1 # 60min
-timeout_in_minutes: 120
+- label: Multi-Modal Models Test (Extended) 1
 mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
@@ -1009,8 +955,7 @@ steps:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
 - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

-- label: Multi-Modal Models Test (Extended) 3 # 75min
-timeout_in_minutes: 150
+- label: Multi-Modal Models Test (Extended) 3
 mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
@@ -1053,12 +998,12 @@ steps:
 optional: true
 commands:
 - pip install --upgrade git+https://github.com/huggingface/transformers
-- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+- pytest -v -s tests/models/test_initialization.py
 - pytest -v -s tests/models/test_transformers.py
-# - pytest -v -s tests/models/multimodal/processing/
-- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+- pytest -v -s tests/models/multimodal/processing/
+- pytest -v -s tests/models/multimodal/test_mapping.py
 - python3 examples/offline_inference/basic/chat.py
-# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
 # Whisper needs spawn method to avoid deadlock
 - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -1102,9 +1047,8 @@ steps:
 - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
 - pytest -v -s tests/kernels/moe/test_flashinfer.py
-- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

-- label: Blackwell Fusion and Compile Tests # 30 min
+- label: Blackwell Fusion Tests # 30 min
 timeout_in_minutes: 40
 working_dir: "/vllm-workspace/"
 gpu: b200
@@ -1112,29 +1056,20 @@ steps:
 - csrc/quantization/fp4/
 - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
 - vllm/v1/attention/backends/flashinfer.py
-- vllm/v1/worker/
-- vllm/v1/cudagraph_dispatcher.py
 - vllm/compilation/
 # can affect pattern matching
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/test_fusion_attn.py
-- tests/compile/test_silu_mul_quant_fusion.py
-- tests/compile/distributed/test_fusion_all_reduce.py
-- tests/compile/distributed/test_fusions_e2e.py
-- tests/compile/fullgraph/test_full_graph.py
 commands:
 - nvidia-smi
 - pytest -v -s tests/compile/test_fusion_attn.py
 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 # this runner has 2 GPUs available even though num_gpus=2 is not set
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
 # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
 # Wrap with quotes to escape yaml
-- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
-# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"

 - label: Blackwell Fusion E2E Tests # 30 min
 timeout_in_minutes: 40
@@ -1151,16 +1086,20 @@ steps:
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/distributed/test_fusions_e2e.py
+- tests/compile/test_fusions_e2e.py
+- tests/compile/test_full_graph.py
 commands:
 - nvidia-smi
 # Run all e2e fusion tests
-- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+- pytest -v -s tests/compile/test_fusions_e2e.py
+# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

-- label: Blackwell GPT-OSS Eval
+- label: ROCm GPT-OSS Eval
 timeout_in_minutes: 60
 working_dir: "/vllm-workspace/"
-gpu: b200
+agent_pool: mi325_1
+mirror_hardwares: [amdproduction]
 optional: true # run on nightlies
 source_file_dependencies:
 - tests/evals/gpt_oss
@@ -1169,7 +1108,7 @@ steps:
 - vllm/v1/attention/backends/flashinfer.py
 commands:
 - uv pip install --system 'gpt-oss[eval]==0.0.5'
-- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 - label: Blackwell Quantized MoE Test
 timeout_in_minutes: 60
@@ -1259,7 +1198,7 @@ steps:
 - vllm/worker/worker_base.py
 - vllm/v1/engine/
 - vllm/v1/worker/
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness.py
 - tests/compile/test_wrapper.py
 - tests/distributed/
 - tests/entrypoints/llm/test_collective_rpc.py
@@ -1269,11 +1208,10 @@ steps:
 - tests/v1/worker/test_worker_memory_snapshot.py
 commands:
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
 - pytest -v -s entrypoints/llm/test_collective_rpc.py
-- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+- pytest -v -s ./compile/test_basic_correctness.py
 - pytest -v -s ./compile/test_wrapper.py
 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1305,7 +1243,7 @@ steps:

 - label: Plugin Tests (2 GPUs) # 40min
 timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_2
 # grade: Blocking
 working_dir: "/vllm-workspace/tests"
@@ -1373,27 +1311,11 @@ steps:
 - pytest -v -s -x lora/test_llama_tp.py
 - pytest -v -s -x lora/test_llm_with_multi_loras.py
 - pytest -v -s -x lora/test_olmoe_tp.py
-# Disabled for now because MXFP4 backend on non-cuda platform
-# doesn't support LoRA yet
-#- pytest -v -s -x lora/test_gptoss_tp.py
+- pytest -v -s -x lora/test_gptoss_tp.py

 - label: Weight Loading Multiple GPU Test # 33min
 timeout_in_minutes: 45
-mirror_hardwares: [amdexperimental, amdproduction]
-agent_pool: mi325_2
-# grade: Blocking
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-optional: true
-source_file_dependencies:
-- vllm/
-- tests/weight_loading
-commands:
-- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
-
-- label: Weight Loading Multiple GPU Test - Large Models # optional
 mirror_hardwares: [amdexperimental]
 agent_pool: mi325_2
 # grade: Blocking
@@ -1404,7 +1326,21 @@ steps:
 - vllm/
 - tests/weight_loading
 commands:
-- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
+- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

+- label: Weight Loading Multiple GPU Test - Large Models # optional
+mirror_hardwares: [amdexperimental]
+agent_pool: mi325_2
+# grade: Blocking
+working_dir: "/vllm-workspace/tests"
+num_gpus: 2
+gpu: a100
+optional: true
+source_file_dependencies:
+- vllm/
+- tests/weight_loading
+commands:
+- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
 mirror_hardwares: [amdexperimental]
@@ -1440,13 +1376,12 @@ steps:
 - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
 - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
-gpu: a100
-optional: true
-mirror_hardwares: [amdexperimental]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_4
 # grade: Blocking
+gpu: a100
+optional: true
 num_gpus: 4
 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
 source_file_dependencies:
@@ -1458,11 +1393,11 @@ steps:

 ##### H100 test #####
 - label: LM Eval Large Models (H100) # optional
-gpu: h100
-optional: true
-mirror_hardwares: [amdexperimental]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_4
 # grade: Blocking
+gpu: h100
+optional: true
 num_gpus: 4
 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
 source_file_dependencies:
@@ -1472,7 +1407,6 @@ steps:
 - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
 mirror_hardwares: [amdexperimental]
@@ -1483,14 +1417,12 @@ steps:
 working_dir: "/vllm-workspace/"
 num_gpus: 2
 commands:
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+- pytest -v -s tests/compile/test_async_tp.py
+- pytest -v -s tests/compile/test_sequence_parallelism.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
+- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
 - pytest -v -s tests/distributed/test_context_parallel.py
-- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### B200 test #####
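In the H200 hunk above, the data-parallel example is pinned to two specific GPUs through the device-visibility variable (HIP_VISIBLE_DEVICES on the ROCm side, CUDA_VISIBLE_DEVICES on the NVIDIA side) before enabling the DeepEP all-to-all backend. A minimal sketch of that launch pattern outside the pipeline; the GPU indices are placeholders:

    # pin the run to two GPUs and enable the DeepEP high-throughput all-to-all backend
    export VLLM_ALL2ALL_BACKEND=deepep_high_throughput
    export VLLM_USE_DEEP_GEMM=1
    CUDA_VISIBLE_DEVICES=0,1 python3 examples/offline_inference/data_parallel.py \
        --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048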
@@ -1504,57 +1436,6 @@ steps:
 - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
 - pytest -v -s tests/v1/distributed/test_dbo.py

-##### E2E Eval Tests #####
-- label: LM Eval Small Models (1 Card) # 15min
-timeout_in_minutes: 20
-mirror_hardwares: [amdexperimental, amdproduction]
-agent_pool: mi325_1
-# grade: Blocking
-source_file_dependencies:
-- csrc/
-- vllm/model_executor/layers/quantization
-commands:
-- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
-- label: LM Eval Large Models (4 Card)
-mirror_hardwares: [amdexperimental, amdproduction]
-agent_pool: mi325_4
-# grade: Blocking
-gpu: a100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-source_file_dependencies:
-- csrc/
-- vllm/model_executor/layers/quantization
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
-- label: ROCm LM Eval Large Models (8 Card)
-mirror_hardwares: [amdproduction]
-agent_pool: mi325_8
-num_gpus: 8
-working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
-
-- label: ROCm GPT-OSS Eval
-timeout_in_minutes: 60
-working_dir: "/vllm-workspace/"
-agent_pool: mi325_1
-mirror_hardwares: [amdexperimental, amdproduction]
-optional: true # run on nightlies
-source_file_dependencies:
-- tests/evals/gpt_oss
-- vllm/model_executor/models/gpt_oss.py
-- vllm/model_executor/layers/quantization/mxfp4.py
-- vllm/v1/attention/backends/flashinfer.py
-commands:
-- uv pip install --system 'gpt-oss[eval]==0.0.5'
-- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
 mirror_hardwares: [amdexperimental]
@@ -1569,8 +1450,9 @@ steps:
 - .buildkite/scripts/run-prime-rl-test.sh
 commands:
 - bash .buildkite/scripts/run-prime-rl-test.sh

 - label: DeepSeek V2-Lite Accuracy
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_4
 # grade: Blocking
 timeout_in_minutes: 60
@@ -1581,8 +1463,8 @@ steps:
 commands:
 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

-- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
-mirror_hardwares: [amdexperimental, amdproduction]
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_4
 # grade: Blocking
 timeout_in_minutes: 60
@@ -1591,36 +1473,4 @@ steps:
 num_gpus: 4
 working_dir: "/vllm-workspace"
 commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-timeout_in_minutes: 60
-gpu: b200
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
-timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_4
-# grade: Blocking
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_4
-# grade: Blocking
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
@@ -57,16 +57,14 @@ steps:
 - pytest -v -s -m 'not cpu_test' multimodal
 - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
-timeout_in_minutes: 30
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
+timeout_in_minutes: 10
 source_file_dependencies:
 - vllm/
 - tests/test_inputs.py
 - tests/test_outputs.py
 - tests/multimodal
 - tests/standalone_tests/lazy_imports.py
-- tests/tokenizers_
-- tests/tool_parsers
 - tests/transformers_utils
 - tests/config
 no_gpu: true
@@ -75,8 +73,6 @@ steps:
 - pytest -v -s test_inputs.py
 - pytest -v -s test_outputs.py
 - pytest -v -s -m 'cpu_test' multimodal
-- pytest -v -s tokenizers_
-- pytest -v -s tool_parsers
 - pytest -v -s transformers_utils
 - pytest -v -s config

@@ -171,7 +167,7 @@ steps:
 - tests/distributed/test_utils
 - tests/distributed/test_pynccl
 - tests/distributed/test_events
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness
 - examples/offline_inference/rlhf.py
 - examples/offline_inference/rlhf_colocate.py
 - tests/examples/offline_inference/data_parallel.py
@@ -196,13 +192,12 @@ steps:
 # test with internal dp
 - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
 - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
 - pytest -v -s distributed/test_utils.py
-- pytest -v -s compile/fullgraph/test_basic_correctness.py
+- pytest -v -s compile/test_basic_correctness.py
 - pytest -v -s distributed/test_pynccl.py
 - pytest -v -s distributed/test_events.py
 - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -280,18 +275,21 @@ steps:
 - pytest -v -s test_regression.py
 working_dir: "/vllm-workspace/tests" # optional

-- label: Engine Test # 9min
-timeout_in_minutes: 15
+- label: Engine Test # 25min
+timeout_in_minutes: 40
 mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/engine
+- tests/tokenization
 - tests/test_sequence
 - tests/test_config
 - tests/test_logger
 - tests/test_vllm_port
 commands:
 - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+# OOM in the CI unless we run this separately
+- pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
 timeout_in_minutes: 45
@@ -348,19 +346,6 @@ steps:
 commands:
 - pytest -v -s v1/attention

-- label: Batch Invariance Tests (H100) # 10min
-timeout_in_minutes: 25
-gpu: h100
-source_file_dependencies:
-- vllm/v1/attention
-- vllm/model_executor/layers
-- tests/v1/determinism/
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pip install pytest-timeout pytest-forked
-- pytest -v -s v1/determinism/test_batch_invariance.py
-- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
-
 - label: V1 Test attention (B200) # 10min
 timeout_in_minutes: 30
 gpu: b200
@@ -390,28 +375,23 @@ steps:
 working_dir: "/vllm-workspace/examples"
 source_file_dependencies:
 - vllm/entrypoints
-- vllm/multimodal
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-# for basic
-- python3 offline_inference/basic/chat.py
 - python3 offline_inference/basic/generate.py --model facebook/opt-125m
 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+- python3 offline_inference/basic/chat.py
+- python3 offline_inference/prefix_caching.py
+- python3 offline_inference/llm_engine_example.py
+- python3 offline_inference/audio_language.py --seed 0
+- python3 offline_inference/vision_language.py --seed 0
+- python3 offline_inference/vision_language_pooling.py --seed 0
+- python3 offline_inference/vision_language_multi_image.py --seed 0
+- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
 - python3 offline_inference/basic/classify.py
 - python3 offline_inference/basic/embed.py
 - python3 offline_inference/basic/score.py
-# for multi-modal models
-- python3 offline_inference/audio_language.py --seed 0
-- python3 offline_inference/vision_language.py --seed 0
-- python3 offline_inference/vision_language_multi_image.py --seed 0
-- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-# for pooling models
-- python3 pooling/pooling/vision_language_pooling.py --seed 0
-# for features demo
-- python3 offline_inference/prefix_caching.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -465,14 +445,18 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run unit tests defined directly under compile/,
-# not including subdirectories, which are usually heavier
-# tests covered elsewhere.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-# However, find does not normally propagate error codes, so we combine it with xargs
-# (using -0 for proper path handling)
-- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+- pytest -v -s compile/test_graph_partition.py
+- pytest -v -s compile/test_config.py
+- pytest -v -s compile/test_pass_manager.py
+- pytest -v -s compile/test_fusion.py
+- pytest -v -s compile/test_fusion_attn.py
+- pytest -v -s compile/test_functionalization.py
+- pytest -v -s compile/test_silu_mul_quant_fusion.py
+- pytest -v -s compile/test_fusion_all_reduce.py
+- pytest -v -s compile/test_decorator.py
+- pytest -v -s compile/test_noop_elimination.py
+- pytest -v -s compile/test_aot_compile.py
+- pytest -v -s compile/test_qk_norm_rope_fusion.py

 - label: PyTorch Fullgraph Smoke Test # 15min
 timeout_in_minutes: 30
@@ -482,13 +466,9 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run smoke tests under fullgraph directory, except test_full_graph.py
-# as it is a heavy test that is covered in other steps.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-# However, find does not normally propagate error codes, so we combine it with xargs
-# (using -0 for proper path handling)
-- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+- pytest -v -s compile/test_basic_correctness.py
+- pytest -v -s compile/test_multimodal_compile.py
+- pytest -v -s compile/piecewise/

 - label: PyTorch Fullgraph Test # 27min
 timeout_in_minutes: 40
@@ -499,10 +479,10 @@ steps:
 - tests/compile
 commands:
 # fp8 kv scales not supported on sm89, tested on Blackwell instead
-- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
 # Limit to no custom ops to reduce running time
 # Wrap with quotes to escape yaml and avoid starting -k string with a -
-- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
 timeout_in_minutes: 20
@@ -574,25 +554,6 @@ steps:
 commands:
 - pytest -v -s kernels/mamba

-- label: Kernels DeepGEMM Test (H100)
-timeout_in_minutes: 45
-gpu: h100
-num_gpus: 1
-source_file_dependencies:
-- tools/install_deepgemm.sh
-- vllm/utils/deep_gemm.py
-- vllm/model_executor/layers/fused_moe
-- vllm/model_executor/layers/quantization
-- tests/kernels/quantization/test_block_fp8.py
-- tests/kernels/moe/test_deepgemm.py
-- tests/kernels/moe/test_batched_deepgemm.py
-- tests/kernels/attention/test_deepgemm_attention.py
-commands:
-- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
-- pytest -v -s kernels/moe/test_deepgemm.py
-- pytest -v -s kernels/moe/test_batched_deepgemm.py
-- pytest -v -s kernels/attention/test_deepgemm_attention.py
-
 - label: Model Executor Test # 23min
 timeout_in_minutes: 35
 torch_nightly: true
@@ -643,7 +604,6 @@ steps:
 # we can only upgrade after this is resolved
 # TODO(jerryzh168): resolve the above comment
 - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
-- uv pip install --system conch-triton-kernels
 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
@@ -674,7 +634,16 @@ steps:
 - vllm/
 - tests/tool_use
 commands:
-- pytest -v -s tool_use
+- pytest -v -s -m 'not cpu_test' tool_use

+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+timeout_in_minutes: 10
+source_file_dependencies:
+- vllm/
+- tests/tool_use
+no_gpu: true
+commands:
+- pytest -v -s -m 'cpu_test' tool_use
+
 ##### models test #####

@@ -685,7 +654,6 @@ steps:
 source_file_dependencies:
 - vllm/
 - tests/models/test_initialization.py
-- tests/models/registry.py
 commands:
 # Run a subset of model initialization tests
 - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -696,9 +664,7 @@ steps:
 torch_nightly: true
 source_file_dependencies:
 - vllm/model_executor/models/
-- vllm/transformers_utils/
 - tests/models/test_initialization.py
-- tests/models/registry.py
 commands:
 # Only when vLLM model source is modified - test initialization of a large
 # subset of supported models (the complement of the small subset in the above
@@ -824,24 +790,14 @@ steps:
 commands:
 - pytest -v -s models/language/pooling_mteb_test

-- label: Multi-Modal Processor Test (CPU)
-timeout_in_minutes: 60
-source_file_dependencies:
-- vllm/
-- tests/models/multimodal
-no_gpu: true
-commands:
-- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
-- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-
-- label: Multi-Modal Processor Test
+- label: Multi-Modal Processor Test # 44min
 timeout_in_minutes: 60
 source_file_dependencies:
 - vllm/
 - tests/models/multimodal
 commands:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s models/multimodal/processing/test_tensor_schema.py
+- pytest -v -s models/multimodal/processing

 - label: Multi-Modal Models Test (Standard) # 60min
 timeout_in_minutes: 80
@@ -918,15 +874,14 @@ steps:
 - label: Transformers Nightly Models Test
 working_dir: "/vllm-workspace/"
 optional: true
-soft_fail: true
 commands:
 - pip install --upgrade git+https://github.com/huggingface/transformers
-- pytest -v -s tests/models/test_initialization.py
+- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
 - pytest -v -s tests/models/test_transformers.py
-- pytest -v -s tests/models/multimodal/processing/
-- pytest -v -s tests/models/multimodal/test_mapping.py
+# - pytest -v -s tests/models/multimodal/processing/
+- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
 - python3 examples/offline_inference/basic/chat.py
-- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
 # Whisper needs spawn method to avoid deadlock
 - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -970,7 +925,6 @@ steps:
 - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
 - pytest -v -s tests/kernels/moe/test_flashinfer.py
-- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

 - label: Blackwell Fusion and Compile Tests # 30 min
 timeout_in_minutes: 40
@@ -980,29 +934,22 @@ steps:
 - csrc/quantization/fp4/
 - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
 - vllm/v1/attention/backends/flashinfer.py
-- vllm/v1/worker/
-- vllm/v1/cudagraph_dispatcher.py
 - vllm/compilation/
 # can affect pattern matching
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/test_fusion_attn.py
-- tests/compile/test_silu_mul_quant_fusion.py
-- tests/compile/distributed/test_fusion_all_reduce.py
-- tests/compile/distributed/test_fusions_e2e.py
-- tests/compile/fullgraph/test_full_graph.py
 commands:
 - nvidia-smi
 - pytest -v -s tests/compile/test_fusion_attn.py
 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 # this runner has 2 GPUs available even though num_gpus=2 is not set
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
 # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
 # Wrap with quotes to escape yaml
-- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
 # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
 timeout_in_minutes: 40
@@ -1019,11 +966,12 @@ steps:
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/distributed/test_fusions_e2e.py
+- tests/compile/test_fusions_e2e.py
+- tests/compile/test_full_graph.py
 commands:
 - nvidia-smi
 # Run all e2e fusion tests
-- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+- pytest -v -s tests/compile/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
 timeout_in_minutes: 60
@@ -1121,7 +1069,7 @@ steps:
 - vllm/worker/worker_base.py
 - vllm/v1/engine/
 - vllm/v1/worker/
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness.py
 - tests/compile/test_wrapper.py
 - tests/distributed/
 - tests/entrypoints/llm/test_collective_rpc.py
@@ -1133,11 +1081,10 @@ steps:
 # https://github.com/NVIDIA/nccl/issues/1838
 - export NCCL_CUMEM_HOST_ENABLE=0
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
 - pytest -v -s entrypoints/llm/test_collective_rpc.py
-- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+- pytest -v -s ./compile/test_basic_correctness.py
 - pytest -v -s ./compile/test_wrapper.py
 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1223,8 +1170,6 @@ steps:
 # FIXIT: find out which code initialize cuda before running the test
 # before the fix, we need to use spawn to test it
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-# Alot of these tests are on the edge of OOMing
-- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 # There is some Tensor Parallelism related processing logic in LoRA that
 # requires multi-GPU testing for validation.
 - pytest -v -s -x lora/test_chatglm3_tp.py
@@ -1319,11 +1264,11 @@ steps:
 working_dir: "/vllm-workspace/"
 num_gpus: 2
 commands:
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+- pytest -v -s tests/compile/test_async_tp.py
+- pytest -v -s tests/compile/test_sequence_parallelism.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
+- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+- pytest -v -s tests/distributed/test_sequence_parallel.py
 - pytest -v -s tests/distributed/test_context_parallel.py
 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -1343,7 +1288,6 @@ steps:
 - label: Prime-RL Integration Test # 15min
 timeout_in_minutes: 30
 optional: true
-soft_fail: true
 num_gpus: 2
 working_dir: "/vllm-workspace"
 source_file_dependencies:
@@ -1361,20 +1305,11 @@ steps:
 commands:
 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

-- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+- label: Qwen3-30B-A3B-FP8-block Accuracy
 timeout_in_minutes: 60
 gpu: h100
 optional: true
 num_gpus: 4
 working_dir: "/vllm-workspace"
 commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-timeout_in_minutes: 60
-gpu: b200
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -1,21 +0,0 @@
-group: Attention
-depends_on:
-- image-build
-steps:
-- label: V1 attention (H100)
-timeout_in_minutes: 30
-gpu: h100
-source_file_dependencies:
-- vllm/v1/attention
-- tests/v1/attention
-commands:
-- pytest -v -s v1/attention
-
-- label: V1 attention (B200)
-timeout_in_minutes: 30
-gpu: b200
-source_file_dependencies:
-- vllm/v1/attention
-- tests/v1/attention
-commands:
-- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
@@ -1,16 +0,0 @@
-group: Basic Correctness
-depends_on:
-- image-build
-steps:
-- label: Basic Correctness
-timeout_in_minutes: 30
-source_file_dependencies:
-- vllm/
-- tests/basic_correctness/test_basic_correctness
-- tests/basic_correctness/test_cpu_offload
-- tests/basic_correctness/test_cumem.py
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -v -s basic_correctness/test_cumem.py
-- pytest -v -s basic_correctness/test_basic_correctness.py
-- pytest -v -s basic_correctness/test_cpu_offload.py
@@ -1,19 +0,0 @@
-group: Benchmarks
-depends_on:
-- image-build
-steps:
-- label: Benchmarks
-timeout_in_minutes: 20
-working_dir: "/vllm-workspace/.buildkite"
-source_file_dependencies:
-- benchmarks/
-commands:
-- bash scripts/run-benchmarks.sh
-
-- label: Benchmarks CLI Test
-timeout_in_minutes: 20
-source_file_dependencies:
-- vllm/
-- tests/benchmarks/
-commands:
-- pytest -v -s benchmarks/
@@ -1,57 +0,0 @@
-group: Compile
-depends_on:
-- image-build
-steps:
-- label: Fusion and Compile Tests (B200)
-timeout_in_minutes: 40
-working_dir: "/vllm-workspace/"
-gpu: b200
-source_file_dependencies:
-- csrc/quantization/fp4/
-- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-- vllm/v1/attention/backends/flashinfer.py
-- vllm/v1/worker/
-- vllm/v1/cudagraph_dispatcher.py
-- vllm/compilation/
-# can affect pattern matching
-- vllm/model_executor/layers/layernorm.py
-- vllm/model_executor/layers/activation.py
-- vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/test_fusion_attn.py
-- tests/compile/test_silu_mul_quant_fusion.py
-- tests/compile/distributed/test_fusion_all_reduce.py
-- tests/compile/distributed/test_fusions_e2e.py
-- tests/compile/fullgraph/test_full_graph.py
-commands:
-- nvidia-smi
-- pytest -v -s tests/compile/test_fusion_attn.py
-- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-# this runner has 2 GPUs available even though num_gpus=2 is not set
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-# Wrap with quotes to escape yaml
-- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
-# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
-
-- label: Fusion E2E (2 GPUs)(B200)
-timeout_in_minutes: 40
-working_dir: "/vllm-workspace/"
-gpu: b200
-optional: true
-num_gpus: 2
-source_file_dependencies:
-- csrc/quantization/fp4/
-- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-- vllm/v1/attention/backends/flashinfer.py
-- vllm/compilation/
-# can affect pattern matching
-- vllm/model_executor/layers/layernorm.py
-- vllm/model_executor/layers/activation.py
-- vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/distributed/test_fusions_e2e.py
-commands:
-- nvidia-smi
-# Run all e2e fusion tests
-- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
@@ -1,22 +0,0 @@
-group: CUDA
-depends_on:
-- image-build
-steps:
-- label: Platform Tests (CUDA)
-timeout_in_minutes: 15
-source_file_dependencies:
-- vllm/
-- tests/cuda
-commands:
-- pytest -v -s cuda/test_cuda_context.py
-
-- label: Cudagraph
-timeout_in_minutes: 20
-source_file_dependencies:
-- tests/v1/cudagraph
-- vllm/v1/cudagraph_dispatcher.py
-- vllm/config/compilation.py
-- vllm/compilation
-commands:
-- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
-- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
@@ -1,199 +0,0 @@
-group: Distributed
-depends_on:
-- image-build
-steps:
-- label: Distributed Comm Ops
-timeout_in_minutes: 20
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-source_file_dependencies:
-- vllm/distributed
-- tests/distributed
-commands:
-- pytest -v -s distributed/test_comm_ops.py
-- pytest -v -s distributed/test_shm_broadcast.py
-- pytest -v -s distributed/test_shm_buffer.py
-- pytest -v -s distributed/test_shm_storage.py
-
-- label: Distributed (2 GPUs)
-timeout_in_minutes: 90
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-source_file_dependencies:
-- vllm/compilation/
-- vllm/distributed/
-- vllm/engine/
-- vllm/executor/
-- vllm/worker/worker_base.py
-- vllm/v1/engine/
-- vllm/v1/worker/
-- tests/compile/fullgraph/test_basic_correctness.py
-- tests/compile/test_wrapper.py
-- tests/distributed/
-- tests/entrypoints/llm/test_collective_rpc.py
-- tests/v1/distributed
-- tests/v1/entrypoints/openai/test_multi_api_servers.py
-- tests/v1/shutdown
-- tests/v1/worker/test_worker_memory_snapshot.py
-commands:
-# https://github.com/NVIDIA/nccl/issues/1838
-- export NCCL_CUMEM_HOST_ENABLE=0
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
-- pytest -v -s entrypoints/llm/test_collective_rpc.py
-- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
-- pytest -v -s ./compile/test_wrapper.py
-- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-- pytest -v -s distributed/test_sequence_parallel.py
-- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-- pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
-- label: Distributed Tests (4 GPUs)
-timeout_in_minutes: 50
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/
-- tests/distributed/test_utils
-- tests/distributed/test_pynccl
-- tests/distributed/test_events
-- tests/compile/fullgraph/test_basic_correctness.py
-- examples/offline_inference/rlhf.py
-- examples/offline_inference/rlhf_colocate.py
-- tests/examples/offline_inference/data_parallel.py
-- tests/v1/distributed
-- tests/v1/engine/test_engine_core_client.py
-- tests/distributed/test_symm_mem_allreduce.py
-commands:
-# https://github.com/NVIDIA/nccl/issues/1838
-- export NCCL_CUMEM_HOST_ENABLE=0
-# test with torchrun tp=2 and external_dp=2
-- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-# test with torchrun tp=2 and pp=2
-- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-# test with torchrun tp=4 and dp=1
-- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with torchrun tp=2, pp=2 and dp=1
-- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with torchrun tp=1 and dp=4 with ep
-- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with torchrun tp=2 and dp=2 with ep
-- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with internal dp
-- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
-- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
-- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
-- pytest -v -s distributed/test_utils.py
-- pytest -v -s compile/fullgraph/test_basic_correctness.py
-- pytest -v -s distributed/test_pynccl.py
-- pytest -v -s distributed/test_events.py
-- pytest -v -s distributed/test_symm_mem_allreduce.py
-# TODO: create a dedicated test section for multi-GPU example tests
-# when we have multiple distributed example tests
-- cd ../examples/offline_inference
-- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-
-- label: Distributed Tests (8 GPUs)(H100)
-timeout_in_minutes: 10
-gpu: h100
-num_gpus: 8
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- examples/offline_inference/torchrun_dp_example.py
-- vllm/config/parallel.py
-- vllm/distributed/
-- vllm/v1/engine/llm_engine.py
-- vllm/v1/executor/uniproc_executor.py
-- vllm/v1/worker/gpu_worker.py
-commands:
-# https://github.com/NVIDIA/nccl/issues/1838
-- export NCCL_CUMEM_HOST_ENABLE=0
-# test with torchrun tp=2 and dp=4 with ep
-- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
-
-- label: Distributed Tests (4 GPUs)(A100)
-gpu: a100
-optional: true
-num_gpus: 4
-source_file_dependencies:
-- vllm/
-commands:
-# NOTE: don't test llama model here, it seems hf implementation is buggy
-# see https://github.com/vllm-project/vllm/pull/5689 for details
-- pytest -v -s distributed/test_custom_all_reduce.py
-- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-- pytest -v -s -x lora/test_mixtral.py
-
-- label: Distributed Tests (2 GPUs)(H200)
-gpu: h200
-optional: true
-working_dir: "/vllm-workspace/"
-num_gpus: 2
-commands:
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-- pytest -v -s tests/distributed/test_context_parallel.py
-- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
-- pytest -v -s tests/v1/distributed/test_dbo.py
-
-- label: Distributed Tests (2 GPUs)(B200)
-gpu: b200
-optional: true
-working_dir: "/vllm-workspace/"
-num_gpus: 2
-commands:
-- pytest -v -s tests/distributed/test_context_parallel.py
-- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-- pytest -v -s tests/v1/distributed/test_dbo.py
-
-- label: 2 Node Test (4 GPUs)
-timeout_in_minutes: 30
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-num_nodes: 2
-source_file_dependencies:
-- vllm/distributed/
-- vllm/engine/
-- vllm/executor/
-- vllm/model_executor/models/
-- tests/distributed/
-- tests/examples/offline_inference/data_parallel.py
-commands:
-- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
-
-- label: Distributed NixlConnector PD accuracy (4 GPUs)
-timeout_in_minutes: 30
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-- tests/v1/kv_connector/nixl_integration/
-commands:
-- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
-
-- label: Pipeline + Context Parallelism (4 GPUs))
-timeout_in_minutes: 60
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/
-- vllm/engine/
-- vllm/executor/
-- vllm/model_executor/models/
-- tests/distributed/
-commands:
-- pytest -v -s distributed/test_pp_cudagraph.py
-- pytest -v -s distributed/test_pipeline_parallel.py
@@ -1,59 +0,0 @@
-group: E2E Integration
-depends_on:
-- image-build
-steps:
-- label: DeepSeek V2-Lite Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-timeout_in_minutes: 60
-gpu: b200
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
-- label: Prime-RL Integration (2 GPUs)
-timeout_in_minutes: 30
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-source_file_dependencies:
-- vllm/
-- .buildkite/scripts/run-prime-rl-test.sh
-commands:
-- bash .buildkite/scripts/run-prime-rl-test.sh
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
@@ -1,26 +0,0 @@
-group: Engine
-depends_on:
-- image-build
-steps:
-- label: Engine
-timeout_in_minutes: 15
-source_file_dependencies:
-- vllm/
-- tests/engine
-- tests/test_sequence
-- tests/test_config
-- tests/test_logger
-- tests/test_vllm_port
-commands:
-- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-
-- label: V1 e2e + engine
-timeout_in_minutes: 45
-source_file_dependencies:
-- vllm/
-- tests/v1
-commands:
-# TODO: accuracy does not match, whether setting
-# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-- pytest -v -s v1/e2e
-- pytest -v -s v1/engine
@@ -1,68 +0,0 @@
-group: Entrypoints
-depends_on:
-- image-build
-steps:
-- label: Entrypoints Unit Tests
-timeout_in_minutes: 10
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/entrypoints
-- tests/entrypoints/
-commands:
-- pytest -v -s entrypoints/openai/tool_parsers
-- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
-- label: Entrypoints Integration (LLM)
-timeout_in_minutes: 40
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/
-- tests/entrypoints/llm
-- tests/entrypoints/offline_mode
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-
-- label: Entrypoints Integration (API Server)
-timeout_in_minutes: 130
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/
-- tests/entrypoints/openai
-- tests/entrypoints/test_chat_utils
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-- pytest -v -s entrypoints/test_chat_utils.py
-
-
-- label: Entrypoints Integration (Pooling)
-timeout_in_minutes: 50
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/
-- tests/entrypoints/pooling
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -v -s entrypoints/pooling
-
-
-- label: Entrypoints V1
-timeout_in_minutes: 50
-source_file_dependencies:
-- vllm/
-- tests/v1
-commands:
-- pytest -v -s v1/entrypoints
-
-- label: OpenAI API Correctness
-timeout_in_minutes: 30
-source_file_dependencies:
-- csrc/
-- vllm/entrypoints/openai/
-- vllm/model_executor/models/whisper.py
-commands: # LMEval+Transcription WER check
-- pytest -s entrypoints/openai/correctness/
@@ -1,23 +0,0 @@
-group: Expert Parallelism
-depends_on:
-- image-build
-steps:
-- label: EPLB Algorithm
-timeout_in_minutes: 15
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/distributed/eplb
-- tests/distributed/test_eplb_algo.py
-commands:
-- pytest -v -s distributed/test_eplb_algo.py
-
-- label: EPLB Execution
-timeout_in_minutes: 20
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/eplb
-- tests/distributed/test_eplb_execute.py
-commands:
-- pytest -v -s distributed/test_eplb_execute.py
-- pytest -v -s distributed/test_eplb_spec_decode.py
@@ -1,117 +0,0 @@
group: Kernels
depends_on:
  - image-build
steps:
  - label: Kernels Core Operation Test
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - tests/kernels/core
      - tests/kernels/test_top_k_per_row.py
    commands:
      - pytest -v -s kernels/core kernels/test_top_k_per_row.py

  - label: Kernels Attention Test %N
    timeout_in_minutes: 35
    source_file_dependencies:
      - csrc/attention/
      - vllm/attention
      - vllm/v1/attention
      - tests/kernels/attention
    commands:
      - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels Quantization Test %N
    timeout_in_minutes: 90
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/layers/quantization
      - tests/kernels/quantization
    commands:
      - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels MoE Test %N
    timeout_in_minutes: 60
    source_file_dependencies:
      - csrc/quantization/cutlass_w8a8/moe/
      - csrc/moe/
      - tests/kernels/moe
      - vllm/model_executor/layers/fused_moe/
      - vllm/distributed/device_communicators/
      - vllm/envs.py
      - vllm/config
    commands:
      - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels Mamba Test
    timeout_in_minutes: 45
    source_file_dependencies:
      - csrc/mamba/
      - tests/kernels/mamba
      - vllm/model_executor/layers/mamba/ops
    commands:
      - pytest -v -s kernels/mamba

  - label: Kernels DeepGEMM Test (H100)
    timeout_in_minutes: 45
    gpu: h100
    num_gpus: 1
    source_file_dependencies:
      - tools/install_deepgemm.sh
      - vllm/utils/deep_gemm.py
      - vllm/model_executor/layers/fused_moe
      - vllm/model_executor/layers/quantization
      - tests/kernels/quantization/test_block_fp8.py
      - tests/kernels/moe/test_deepgemm.py
      - tests/kernels/moe/test_batched_deepgemm.py
      - tests/kernels/attention/test_deepgemm_attention.py
    commands:
      - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
      - pytest -v -s kernels/moe/test_deepgemm.py
      - pytest -v -s kernels/moe/test_batched_deepgemm.py
      - pytest -v -s kernels/attention/test_deepgemm_attention.py

  - label: Kernels (B200)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    gpu: b200
    # optional: true
    source_file_dependencies:
      - csrc/quantization/fp4/
      - csrc/attention/mla/
      - csrc/quantization/cutlass_w8a8/moe/
      - vllm/model_executor/layers/fused_moe/cutlass_moe.py
      - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
      - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
      - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/v1/attention/backends/mla/cutlass_mla.py
      - vllm/v1/attention/backends/mla/flashinfer_mla.py
      - vllm/platforms/cuda.py
      - vllm/attention/selector.py
    commands:
      - nvidia-smi
      - python3 examples/offline_inference/basic/chat.py
      # Attention
      # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
      - pytest -v -s tests/kernels/attention/test_attention_selector.py
      - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
      - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
      - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
      - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
      # Quantization
      - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
      - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
      - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
      - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
      - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
      - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
      - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
      - pytest -v -s tests/kernels/moe/test_flashinfer.py
      - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
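Several of the kernel steps above fan out with parallelism: 2 and pass --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT; the doubled $$ defers interpolation so the Buildkite agent substitutes the values at run time. A rough Python sketch of the kind of round-robin split this implies, assuming index-modulo assignment (the actual pytest sharding plugin may partition collected tests differently):

# Sketch: split collected test ids across Buildkite parallel jobs.
# Assumption: round-robin by collection index, one common --shard-id/--num-shards strategy.
def tests_for_shard(test_ids: list[str], shard_id: int, num_shards: int) -> list[str]:
    return [t for i, t in enumerate(test_ids) if i % num_shards == shard_id]

tests = ["test_attn[a]", "test_attn[b]", "test_attn[c]", "test_attn[d]"]
print(tests_for_shard(tests, shard_id=0, num_shards=2))  # ['test_attn[a]', 'test_attn[c]']
print(tests_for_shard(tests, shard_id=1, num_shards=2))  # ['test_attn[b]', 'test_attn[d]']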
@@ -1,46 +0,0 @@
group: LM Eval
depends_on:
  - image-build
steps:
  - label: LM Eval Small Models
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    autorun_on_main: true
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

  - label: LM Eval Large Models (4 GPUs)(A100)
    gpu: a100
    optional: true
    num_gpus: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

  - label: LM Eval Large Models (4 GPUs)(H100)
    gpu: h100
    optional: true
    num_gpus: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

  - label: LM Eval Small Models (B200)
    timeout_in_minutes: 120
    gpu: b200
    optional: true
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
@@ -1,31 +0,0 @@
group: LoRA
depends_on:
  - image-build
steps:
  - label: LoRA %N
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/lora
      - tests/lora
    commands:
      - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
    parallelism: 4

  - label: LoRA TP (Distributed)
    timeout_in_minutes: 30
    num_gpus: 4
    source_file_dependencies:
      - vllm/lora
      - tests/lora
    commands:
      # FIXIT: find out which code initialize cuda before running the test
      # before the fix, we need to use spawn to test it
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      # There is some Tensor Parallelism related processing logic in LoRA that
      # requires multi-GPU testing for validation.
      - pytest -v -s -x lora/test_chatglm3_tp.py
      - pytest -v -s -x lora/test_llama_tp.py
      - pytest -v -s -x lora/test_llm_with_multi_loras.py
      - pytest -v -s -x lora/test_olmoe_tp.py
      - pytest -v -s -x lora/test_gptoss_tp.py
@@ -1,165 +0,0 @@
group: Miscellaneous
depends_on:
  - image-build
steps:
  - label: V1 Others
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/
      - tests/v1
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      # split the test to avoid interference
      - pytest -v -s -m 'not cpu_test' v1/core
      - pytest -v -s v1/executor
      - pytest -v -s v1/kv_offload
      - pytest -v -s v1/sample
      - pytest -v -s v1/logits_processors
      - pytest -v -s v1/worker
      - pytest -v -s v1/spec_decode
      - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
      - pytest -v -s -m 'not cpu_test' v1/metrics
      - pytest -v -s v1/test_oracle.py
      - pytest -v -s v1/test_request.py
      - pytest -v -s v1/test_outputs.py
      # Integration test for streaming correctness (requires special branch).
      - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
      - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

  - label: V1 Others (CPU)
    depends_on: ~
    source_file_dependencies:
      - vllm/
      - tests/v1
    no_gpu: true
    commands:
      # split the test to avoid interference
      - pytest -v -s -m 'cpu_test' v1/core
      - pytest -v -s v1/structured_output
      - pytest -v -s v1/test_serial_utils.py
      - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
      - pytest -v -s -m 'cpu_test' v1/metrics

  - label: Regression
    timeout_in_minutes: 20
    source_file_dependencies:
      - vllm/
      - tests/test_regression
    commands:
      - pip install modelscope
      - pytest -v -s test_regression.py
    working_dir: "/vllm-workspace/tests" # optional

  - label: Examples
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/examples"
    source_file_dependencies:
      - vllm/entrypoints
      - vllm/multimodal
      - examples/
    commands:
      - pip install tensorizer # for tensorizer test
      - python3 offline_inference/basic/chat.py # for basic
      - python3 offline_inference/basic/generate.py --model facebook/opt-125m
      - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
      - python3 offline_inference/basic/classify.py
      - python3 offline_inference/basic/embed.py
      - python3 offline_inference/basic/score.py
      # for multi-modal models
      - python3 offline_inference/audio_language.py --seed 0
      - python3 offline_inference/vision_language.py --seed 0
      - python3 offline_inference/vision_language_multi_image.py --seed 0
      - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
      # for pooling models
      - python3 pooling/pooling/vision_language_pooling.py --seed 0
      # for features demo
      - python3 offline_inference/prefix_caching.py
      - python3 offline_inference/llm_engine_example.py
      - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
      - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
      # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
      - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

  - label: Metrics, Tracing (2 GPUs)
    timeout_in_minutes: 20
    num_gpus: 2
    source_file_dependencies:
      - vllm/
      - tests/v1/tracing
    commands:
      - "pip install \
        'opentelemetry-sdk>=1.26.0' \
        'opentelemetry-api>=1.26.0' \
        'opentelemetry-exporter-otlp>=1.26.0' \
        'opentelemetry-semantic-conventions-ai>=0.4.1'"
      - pytest -v -s v1/tracing

  - label: Python-only Installation
    depends_on: ~
    timeout_in_minutes: 20
    source_file_dependencies:
      - tests/standalone_tests/python_only_compile.sh
      - setup.py
    commands:
      - bash standalone_tests/python_only_compile.sh

  - label: Async Engine, Inputs, Utils, Worker
    timeout_in_minutes: 50
    source_file_dependencies:
      - vllm/
      - tests/multimodal
      - tests/utils_
    commands:
      - pytest -v -s -m 'not cpu_test' multimodal
      - pytest -v -s utils_

  - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
    depends_on: ~
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/test_inputs.py
      - tests/test_outputs.py
      - tests/multimodal
      - tests/standalone_tests/lazy_imports.py
      - tests/tokenizers_
      - tests/tool_parsers
      - tests/transformers_utils
      - tests/config
    no_gpu: true
    commands:
      - python3 standalone_tests/lazy_imports.py
      - pytest -v -s test_inputs.py
      - pytest -v -s test_outputs.py
      - pytest -v -s -m 'cpu_test' multimodal
      - pytest -v -s tokenizers_
      - pytest -v -s tool_parsers
      - pytest -v -s transformers_utils
      - pytest -v -s config

  - label: GPT-OSS Eval (B200)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/"
    gpu: b200
    optional: true
    source_file_dependencies:
      - tests/evals/gpt_oss
      - vllm/model_executor/models/gpt_oss.py
      - vllm/model_executor/layers/quantization/mxfp4.py
      - vllm/v1/attention/backends/flashinfer.py
    commands:
      - uv pip install --system 'gpt-oss[eval]==0.0.5'
      - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

  - label: Batch Invariance (H100)
    timeout_in_minutes: 25
    gpu: h100
    source_file_dependencies:
      - vllm/v1/attention
      - vllm/model_executor/layers
      - tests/v1/determinism/
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pip install pytest-timeout pytest-forked
      - pytest -v -s v1/determinism/test_batch_invariance.py
      - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
@@ -1,17 +0,0 @@
group: Model Executor
depends_on:
  - image-build
steps:
  - label: Model Executor
    timeout_in_minutes: 35
    source_file_dependencies:
      - vllm/engine/arg_utils.py
      - vllm/config/model.py
      - vllm/model_executor
      - tests/model_executor
      - tests/entrypoints/openai/test_tensorizer_entrypoint.py
    commands:
      - apt-get update && apt-get install -y curl libsodium23
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -v -s model_executor
      - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
@@ -1,62 +0,0 @@
group: Models - Basic
depends_on:
  - image-build
steps:
  - label: Basic Models Tests (Initialization)
    timeout_in_minutes: 45
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/
      - tests/models/test_initialization.py
    commands:
      # Run a subset of model initialization tests
      - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

  - label: Basic Models Tests (Extra Initialization) %N
    timeout_in_minutes: 45
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/model_executor/models/
      - tests/models/test_initialization.py
    commands:
      # Only when vLLM model source is modified - test initialization of a large
      # subset of supported models (the complement of the small subset in the above
      # test.) Also run if model initialization test file is modified
      - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
    parallelism: 2

  - label: Basic Models Tests (Other)
    timeout_in_minutes: 45
    source_file_dependencies:
      - vllm/
      - tests/models/test_transformers.py
      - tests/models/test_registry.py
    commands:
      - pytest -v -s models/test_transformers.py models/test_registry.py

  - label: Basic Models Test (Other CPU) # 5min
    timeout_in_minutes: 10
    source_file_dependencies:
      - vllm/
      - tests/models/test_utils.py
      - tests/models/test_vision.py
    no_gpu: true
    commands:
      - pytest -v -s models/test_utils.py models/test_vision.py

  - label: Transformers Nightly Models
    working_dir: "/vllm-workspace/"
    optional: true
    soft_fail: true
    commands:
      - pip install --upgrade git+https://github.com/huggingface/transformers
      - pytest -v -s tests/models/test_initialization.py
      - pytest -v -s tests/models/test_transformers.py
      - pytest -v -s tests/models/multimodal/processing/
      - pytest -v -s tests/models/multimodal/test_mapping.py
      - python3 examples/offline_inference/basic/chat.py
      - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
      # Whisper needs spawn method to avoid deadlock
      - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -1,22 +0,0 @@
group: Models - Distributed
depends_on:
  - image-build
steps:
  - label: Distributed Model Tests (2 GPUs)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    source_file_dependencies:
      - vllm/model_executor/model_loader/sharded_state_loader.py
      - vllm/model_executor/models/
      - tests/basic_correctness/
      - tests/model_executor/model_loader/test_sharded_state_loader.py
      - tests/models/
    commands:
      - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
      - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
      # Avoid importing model tests that cause CUDA reinitialization error
      - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
      - pytest models/language -v -s -m 'distributed(num_gpus=2)'
      - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
      - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
@@ -1,91 +0,0 @@
group: Models - Language
depends_on:
  - image-build
steps:
  - label: Language Models Tests (Standard)
    timeout_in_minutes: 25
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/
      - tests/models/language
    commands:
      # Test standard language models, excluding a subset of slow tests
      - pip freeze | grep -E 'torch'
      - pytest -v -s models/language -m 'core_model and (not slow_test)'

  - label: Language Models Tests (Extra Standard) %N
    timeout_in_minutes: 45
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/model_executor/models/
      - tests/models/language/pooling/test_embedding.py
      - tests/models/language/generation/test_common.py
      - tests/models/language/pooling/test_classification.py
    commands:
      # Shard slow subset of standard language models tests. Only run when model
      # source is modified, or when specified test files are modified
      - pip freeze | grep -E 'torch'
      - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
    parallelism: 2

  - label: Language Models Tests (Hybrid) %N
    timeout_in_minutes: 75
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/generation
    commands:
      # Install fast path packages for testing against transformers
      # Note: also needed to run plamo2 model in vLLM
      - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
      # Shard hybrid language model tests
      - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
    parallelism: 2

  - label: Language Models Test (Extended Generation) # 80min
    timeout_in_minutes: 110
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/generation
    commands:
      # Install fast path packages for testing against transformers
      # Note: also needed to run plamo2 model in vLLM
      - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
      - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

  - label: Language Models Test (PPL)
    timeout_in_minutes: 110
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/generation_ppl_test
    commands:
      - pytest -v -s models/language/generation_ppl_test

  - label: Language Models Test (Extended Pooling) # 36min
    timeout_in_minutes: 50
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/pooling
    commands:
      - pytest -v -s models/language/pooling -m 'not core_model'

  - label: Language Models Test (MTEB)
    timeout_in_minutes: 110
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/pooling_mteb_test
    commands:
      - pytest -v -s models/language/pooling_mteb_test
@@ -1,79 +0,0 @@
group: Models - Multimodal
depends_on:
  - image-build
steps:
  - label: Multi-Modal Models (Standard) # 60min
    timeout_in_minutes: 80
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pip freeze | grep -E 'torch'
      - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
      - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

  - label: Multi-Modal Processor Test (CPU)
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    no_gpu: true
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

  - label: Multi-Modal Processor # 44min
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/processing/test_tensor_schema.py

  - label: Multi-Modal Accuracy Eval (Small Models) # 50min
    timeout_in_minutes: 70
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - vllm/multimodal/
      - vllm/inputs/
      - vllm/v1/core/
    commands:
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

  - label: Multi-Modal Models (Extended) 1
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

  - label: Multi-Modal Models (Extended) 2
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

  - label: Multi-Modal Models (Extended) 3
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

  # This test is used only in PR development phase to test individual models and should never run on main
  - label: Custom Models
    optional: true
    commands:
      - echo 'Testing custom models...'
      # PR authors can temporarily add commands below to test individual models
      # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
      # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
@@ -1,34 +0,0 @@
group: Plugins
depends_on:
  - image-build
steps:
  - label: Plugin Tests (2 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    source_file_dependencies:
      - vllm/plugins/
      - tests/plugins/
    commands:
      # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
      - pip install -e ./plugins/vllm_add_dummy_platform
      - pytest -v -s plugins_tests/test_platform_plugins.py
      - pip uninstall vllm_add_dummy_platform -y
      # end platform plugin tests
      # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
      - pip install -e ./plugins/prithvi_io_processor_plugin
      - pytest -v -s plugins_tests/test_io_processor_plugins.py
      - pip uninstall prithvi_io_processor_plugin -y
      # end io_processor plugins test
      # begin stat_logger plugins test
      - pip install -e ./plugins/vllm_add_dummy_stat_logger
      - pytest -v -s plugins_tests/test_stats_logger_plugins.py
      - pip uninstall dummy_stat_logger -y
      # end stat_logger plugins test
      # other tests continue here:
      - pytest -v -s plugins_tests/test_scheduler_plugins.py
      - pip install -e ./plugins/vllm_add_dummy_model
      - pytest -v -s distributed/test_distributed_oot.py
      - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
      - pytest -v -s models/test_oot_registration.py # it needs a clean process
      - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
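Each plugin test above installs a package, exercises it, and uninstalls it so later commands see a clean environment. Out-of-tree plugins of this kind are normally discovered through Python entry points; the sketch below shows that generic discovery pattern on Python 3.10+, with "vllm.general_plugins" used only as an example group name rather than a confirmed detail of these test plugins:

# Sketch: generic entry-point discovery, the usual mechanism by which an
# installed out-of-tree plugin registers itself with a host package.
from importlib.metadata import entry_points

def load_plugins(group: str) -> dict[str, object]:
    # Each registered entry point maps a name to an importable object;
    # the host loads them all for the chosen group.
    return {ep.name: ep.load() for ep in entry_points(group=group)}

if __name__ == "__main__":
    # Prints {} unless a plugin registering under this (assumed) group is installed.
    print(load_plugins("vllm.general_plugins"))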
@@ -1,50 +0,0 @@
group: PyTorch
depends_on:
  - image-build
steps:
  - label: PyTorch Compilation Unit Tests
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/compile
    commands:
      # Run unit tests defined directly under compile/,
      # not including subdirectories, which are usually heavier
      # tests covered elsewhere.
      # Use `find` to launch multiple instances of pytest so that
      # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
      - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"

  - label: PyTorch Fullgraph Smoke Test
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/compile
    commands:
      # Run smoke tests under fullgraph directory, except test_full_graph.py
      # as it is a heavy test that is covered in other steps.
      # Use `find` to launch multiple instances of pytest so that
      # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
      - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

  - label: PyTorch Fullgraph
    timeout_in_minutes: 40
    source_file_dependencies:
      - vllm/
      - tests/compile
    commands:
      # fp8 kv scales not supported on sm89, tested on Blackwell instead
      - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
      # Limit to no custom ops to reduce running time
      # Wrap with quotes to escape yaml and avoid starting -k string with a -
      - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

  - label: Pytorch Nightly Dependency Override Check # 2min
    # if this test fails, it means the nightly torch version is not compatible with some
    # of the dependencies. Please check the error message and add the package to whitelist
    # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
    soft_fail: true
    source_file_dependencies:
      - requirements/nightly_torch_test.txt
    commands:
      - bash standalone_tests/pytorch_nightly_dependency.sh
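The compilation steps above intentionally start one pytest process per test file (find ... -exec pytest {} \;) so that state from one test module cannot leak into the next. A hedged Python equivalent of that isolation pattern, illustrative rather than the CI's actual runner:

# Sketch: run each matching test file in its own pytest process for isolation.
import glob
import subprocess
import sys

def run_isolated(pattern: str) -> int:
    worst = 0
    for path in sorted(glob.glob(pattern)):
        # A fresh interpreter per file avoids cross-file interference
        # (for example, lingering compilation or device state).
        result = subprocess.run([sys.executable, "-m", "pytest", "-s", "-v", path])
        worst = max(worst, result.returncode)
    return worst

if __name__ == "__main__":
    sys.exit(run_isolated("compile/test_*.py"))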
@@ -1,46 +0,0 @@
group: Quantization
depends_on:
  - image-build
steps:
  - label: Quantization
    timeout_in_minutes: 90
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
      - tests/quantization
    commands:
      # temporary install here since we need nightly, will move to requirements/test.in
      # after torchao 0.12 release, and pin a working version of torchao nightly here

      # since torchao nightly is only compatible with torch nightly currently
      # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
      # we can only upgrade after this is resolved
      # TODO(jerryzh168): resolve the above comment
      - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
      - uv pip install --system conch-triton-kernels
      - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

  - label: Quantized MoE Test (B200)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/"
    gpu: b200
    source_file_dependencies:
      - tests/quantization/test_blackwell_moe.py
      - vllm/model_executor/models/deepseek_v2.py
      - vllm/model_executor/models/gpt_oss.py
      - vllm/model_executor/models/llama4.py
      - vllm/model_executor/layers/fused_moe
      - vllm/model_executor/layers/quantization/compressed_tensors
      - vllm/model_executor/layers/quantization/modelopt.py
      - vllm/model_executor/layers/quantization/mxfp4.py
      - vllm/v1/attention/backends/flashinfer.py
    commands:
      - pytest -s -v tests/quantization/test_blackwell_moe.py

  - label: Quantized Models Test
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/model_executor/layers/quantization
      - tests/models/quantization
    commands:
      - pytest -v -s models/quantization
@@ -1,14 +0,0 @@
group: Samplers
depends_on:
  - image-build
steps:
  - label: Samplers Test
    timeout_in_minutes: 75
    source_file_dependencies:
      - vllm/model_executor/layers
      - vllm/sampling_metadata.py
      - tests/samplers
      - tests/conftest.py
    commands:
      - pytest -v -s samplers
      - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
@@ -1,13 +0,0 @@
group: Tool use
depends_on:
  - image-build
steps:
  - label: OpenAI-Compatible Tool Use
    timeout_in_minutes: 35
    mirror_hardwares: [amdexperimental]
    fast_check: false
    source_file_dependencies:
      - vllm/
      - tests/tool_use
    commands:
      - pytest -v -s tool_use
@@ -1,25 +0,0 @@
group: Weight Loading
depends_on:
  - image-build
steps:
  - label: Weight Loading Multiple GPU # 33min
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/weight_loading
    commands:
      - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

  - label: Weight Loading Multiple GPU - Large Models # optional
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    gpu: a100
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/weight_loading
    commands:
      - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
8 .github/CODEOWNERS vendored
@@ -9,7 +9,6 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/model_executor/layers/batch_invariant.py @yewentao256
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@@ -36,9 +35,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC
 
-# Model runner V2
-/vllm/v1/worker/gpu @WoosukKwon
-
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin
 /tests/distributed/test_multi_node_assignment.py @youkaichao
@@ -60,7 +56,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
-/tests/v1/determinism @yewentao256
 
 # Transformers modeling backend
 /vllm/model_executor/models/transformers @hmellor
@@ -146,10 +141,9 @@ mkdocs.yaml @hmellor
 /requirements/kv_connectors.txt @NickLucche
 
 # Pooling models
-/examples/pooling @noooop
+/examples/*/pooling/ @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
-/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
48 .github/mergify.yml vendored
@@ -14,52 +14,6 @@ pull_request_rules:
       comment:
         message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 
-  - name: comment-pre-commit-failure
-    description: Comment on PR when pre-commit check fails
-    conditions:
-      - status-failure=pre-commit
-      - -closed
-      - -draft
-    actions:
-      comment:
-        message: |
-          Hi @{{author}}, the pre-commit checks have failed. Please run:
-
-          ```bash
-          uv pip install pre-commit
-          pre-commit install
-          pre-commit run --all-files
-          ```
-
-          Then, commit the changes and push to your branch.
-
-          For future commits, `pre-commit` will run automatically on changed files before each commit.
-
-          > [!TIP]
-          > <details>
-          > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
-          > <br/>
-          > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
-          >
-          > ```bash
-          > # For mypy (substitute "3.10" with the failing version if needed)
-          > pre-commit run --hook-stage manual mypy-3.10
-          > # For markdownlint
-          > pre-commit run --hook-stage manual markdownlint
-          > ```
-          > </details>
-
-  - name: comment-dco-failure
-    description: Comment on PR when DCO check fails
-    conditions:
-      - status-failure=dco
-      - -closed
-      - -draft
-    actions:
-      comment:
-        message: |
-          Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
-
   - name: label-ci-build
     description: Automatically apply ci/build label
     conditions:
@@ -186,7 +140,7 @@ pull_request_rules:
       - files~=^tests/entrypoints/test_context.py
       - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
+      - files~=^vllm/entrypoints/harmony_utils.py
       - files~=^vllm/entrypoints/tool_server.py
       - files~=^vllm/entrypoints/tool.py
       - files~=^vllm/entrypoints/context.py
4 .github/workflows/cleanup_pr_body.yml vendored
@@ -13,10 +13,10 @@ jobs:
 
     steps:
       - name: Checkout repository
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
       - name: Set up Python
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: '3.12'
 
25 .github/workflows/issue_autolabel.yml vendored
@@ -105,31 +105,6 @@ jobs:
             }
           ],
         },
-        cpu: {
-          // Keyword search - matches whole words only (with word boundaries)
-          keywords: [
-            {
-              term: "CPU Backend",
-              searchIn: "title"
-            },
-            {
-              term: "x86",
-              searchIn: "title"
-            },
-            {
-              term: "ARM",
-              searchIn: "title"
-            },
-            {
-              term: "Apple Silicon",
-              searchIn: "title"
-            },
-            {
-              term: "IBM Z",
-              searchIn: "title"
-            },
-          ],
-        },
         // Add more label configurations here as needed
         // example: {
         //   keywords: [...],
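The removed cpu rule labels issues whose titles contain terms such as "x86" or "ARM" as whole words only. A small Python sketch of word-boundary matching of that kind; the workflow's JavaScript may implement the check differently:

# Sketch: whole-word keyword search using regex word boundaries.
import re

def title_matches(title: str, term: str) -> bool:
    return re.search(rf"\b{re.escape(term)}\b", title, flags=re.IGNORECASE) is not None

print(title_matches("[Bug] crash on ARM build", "ARM"))  # True
print(title_matches("harmless title", "ARM"))            # False ("harmless" is not a whole-word match)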
9 .github/workflows/macos-smoke-test.yml vendored
@@ -9,10 +9,10 @@ on:
 jobs:
   macos-m1-smoke-test:
     runs-on: macos-latest
-    timeout-minutes: 30
+    timeout-minutes: 20
 
     steps:
-      - uses: actions/checkout@v6.0.1
+      - uses: actions/checkout@v4
 
       - uses: astral-sh/setup-uv@v7
         with:
@@ -37,14 +37,15 @@ jobs:
       - name: Verify installation
         run: |
           python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
 
       - name: Smoke test vllm serve
+        timeout-minutes: 10
         run: |
           # Start server in background
           vllm serve Qwen/Qwen3-0.6B \
-            --max-model-len=2K \
+            --max-model-len=2048 \
             --load-format=dummy \
-            --hf-overrides '{"num_hidden_layers": 2}' \
             --enforce-eager \
             --port 8000 &
 
4 .github/workflows/pre-commit.yml vendored
@@ -16,8 +16,8 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-      - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: "3.12"
       - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
4 .github/workflows/stale.yml vendored
@@ -7,15 +7,13 @@ on:
 
 jobs:
   close-issues-and-pull-requests:
-    # Prevents triggering on forks or other repos
-    if: github.repository == 'vllm-project/vllm'
     permissions:
       issues: write
       pull-requests: write
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
3 .gitignore vendored
@@ -4,9 +4,6 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
 
-# OpenAI triton kernels copied from source
-vllm/third_party/triton_kernels/*
-
 # triton jit
 .triton
 
126
CMakeLists.txt
126
CMakeLists.txt
@@ -136,7 +136,7 @@ elseif(HIP_FOUND)
|
|||||||
|
|
||||||
# ROCm 5.X and 6.X
|
# ROCm 5.X and 6.X
|
||||||
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||||
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
|
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||||
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||||
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||||
endif()
|
endif()
|
||||||
@@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||||
set(CUTLASS_REVISION "v4.2.1")
|
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
|
||||||
|
|
||||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||||
@@ -354,17 +354,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||||
# are not supported by Machete yet.
|
# are not supported by Machete yet.
|
||||||
|
# 9.0 for latest bf16 atomicAdd PTX
|
||||||
# marlin arches for fp16 output
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
|
|
||||||
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
# marlin arches for fp8 input
|
|
||||||
# - sm80 doesn't support fp8 computation
|
|
||||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
|
||||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
|
||||||
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
|
||||||
|
|
||||||
if (MARLIN_ARCHS)
|
if (MARLIN_ARCHS)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -374,18 +365,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set(MARLIN_GEN_SCRIPT
|
set(MARLIN_GEN_SCRIPT
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
|
||||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
|
||||||
|
|
||||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
||||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
||||||
|
|
||||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
||||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
||||||
execute_process(
|
execute_process(
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
COMMAND ${CMAKE_COMMAND} -E env
|
||||||
PYTHONPATH=$ENV{PYTHONPATH}
|
PYTHONPATH=$PYTHONPATH
|
||||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
||||||
RESULT_VARIABLE marlin_generation_result
|
RESULT_VARIABLE marlin_generation_result
|
||||||
OUTPUT_VARIABLE marlin_generation_result
|
OUTPUT_VARIABLE marlin_generation_result
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||||
@@ -398,15 +387,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"\nCheck the log for details: "
|
"\nCheck the log for details: "
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
||||||
else()
|
else()
|
||||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
||||||
CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
|
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
||||||
message(STATUS "Marlin generation completed successfully.")
|
message(STATUS "Marlin generation completed successfully.")
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||||
@@ -414,34 +403,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||||
|
|
||||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
|
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
|
||||||
|
|
||||||
if (MARLIN_FP8_ARCHS)
|
|
||||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
|
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set(MARLIN_SRCS
|
set(MARLIN_SRCS
|
||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||||
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
|
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
@@ -637,15 +604,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
|
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${FP4_ARCHS}")
|
CUDA_ARCHS "${FP4_ARCHS}")
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
|
|
||||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
||||||
@@ -822,7 +786,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
 execute_process(
 COMMAND ${CMAKE_COMMAND} -E env
-PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
+PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
 ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
 RESULT_VARIABLE machete_generation_result
 OUTPUT_VARIABLE machete_generation_output
@@ -874,10 +838,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
 set(SRCS
-"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
-"csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
-"csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
-)
+"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")

 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
@@ -947,6 +908,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
 "csrc/moe/torch_bindings.cpp"
 "csrc/moe/moe_align_sum_kernels.cu"
+"csrc/moe/moe_lora_align_sum_kernels.cu"
 "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -976,15 +938,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 CUDA_ARCHS "${CUDA_ARCHS}")

 list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-# moe marlin arches
-# note that we always set `use_atomic_add=False` for moe marlin now,
-# so we don't need 9.0 for bf16 atomicAdd PTX
-cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
-# moe marlin arches for fp8 input
-# - sm80 doesn't support fp8 computation
-# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
-# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
-cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+# 9.0 for latest bf16 atomicAdd PTX
+cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
 if (MARLIN_MOE_ARCHS)

 #
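The removed comments above spell out why FP8 MoE Marlin kernels are only generated for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0) builds. As a purely illustrative sketch (the helper below is hypothetical Python, not vLLM's actual CMake function), the arch gating amounts to intersecting a requested arch list with the build's CUDA_ARCHS:

```python
# Hypothetical sketch of the arch filtering done by cuda_archs_loose_intersection
# in the CMake above: keep only the requested arches whose base version also
# appears in the build's CUDA_ARCHS list.
def loose_intersection(requested: list[str], cuda_archs: list[str]) -> list[str]:
    available = {arch.split("+")[0] for arch in cuda_archs}
    return [arch for arch in requested if arch.split("+")[0] in available]

# FP8 MoE Marlin is only requested for SM89 and 12.0-class builds.
print(loose_intersection(["8.9", "12.0"], ["8.0+PTX", "8.9", "9.0a"]))  # ['8.9']
```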
@@ -994,18 +949,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 set(MOE_MARLIN_GEN_SCRIPT
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
 file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
-list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
-set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

-message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
-message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
+message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")

-if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
-OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
+if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
+OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
 execute_process(
 COMMAND ${CMAKE_COMMAND} -E env
-PYTHONPATH=$ENV{PYTHONPATH}
-${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
+PYTHONPATH=$PYTHONPATH
+${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
 RESULT_VARIABLE moe_marlin_generation_result
 OUTPUT_VARIABLE moe_marlin_generation_output
 OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
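The hunk above folds the target arch list into the cached hash of the kernel-generation script, so changing CUDA_ARCHS also triggers regeneration. A minimal Python sketch of that cache key, assuming illustrative file paths and helper names that are not part of vLLM:

```python
# Illustrative sketch of the cache-skip logic: hash the generator script
# together with the arch list and regenerate only when that combined key changes.
import hashlib
from pathlib import Path

def generation_key(script: Path, cuda_archs: list[str]) -> str:
    digest = hashlib.md5(script.read_bytes()).hexdigest()
    return f"{digest}(ARCH:{','.join(cuda_archs)})"

def should_regenerate(script: Path, cuda_archs: list[str], stamp: Path) -> bool:
    key = generation_key(script, cuda_archs)
    if stamp.exists() and stamp.read_text() == key:
        return False  # same script and same archs: skip generation
    stamp.write_text(key)
    return True
```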
@@ -1018,7 +971,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 "\nCheck the log for details: "
 "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
 else()
-set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
 CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
 message(STATUS "Marlin MOE generation completed successfully.")
 endif()
@@ -1026,28 +979,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
 endif()

-file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
-list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
+file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
 set_gencode_flags_for_srcs(
-SRCS "${MARLIN_MOE_SRC}"
+SRCS "${MOE_WNAA16_MARLIN_SRC}"
 CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-set_source_files_properties(${MARLIN_MOE_SRC}
+set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
 PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
 endif()
-list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})

-if (MARLIN_MOE_FP8_ARCHS)
-file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
-set_gencode_flags_for_srcs(
-SRCS "${MARLIN_MOE_FP8_SRC}"
-CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-set_source_files_properties(${MARLIN_MOE_FP8_SRC}
-PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-endif()
-list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
-endif()
+list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

 message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
 else()
@@ -1089,11 +1030,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 WITH_SOABI)
 endif()

-# For CUDA and HIP builds also build the triton_kernels external package.
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
-include(cmake/external_projects/triton_kernels.cmake)
-endif()
-
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
 include(cmake/external_projects/flashmla.cmake)
@@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

-- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
 - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
@@ -137,19 +136,16 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
-- Arm
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
 - Google Cloud
-- IBM
 - Intel
 - Lambda Lab
 - Nebius
 - Novita AI
 - NVIDIA
-- Red Hat
 - Replicate
 - Roblox
 - RunPod
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
 ```

-### 2. Maximize Throughput with a Latency Requirement
+#### 2. Maximize Throughput with a Latency Requirement

 - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
 - **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=500
 ```

-### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+#### 3. Maximize Throughput with Prefix Caching and Latency Requirements

 - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
 - **Configuration**:
@@ -18,11 +18,6 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
 MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
 NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
 NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
-HOSTNAME=$(hostname)
-if [[ -z "$HOSTNAME" ]]; then
-echo "Error: Failed to determine hostname." >&2
-exit 1
-fi

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
@@ -87,7 +82,6 @@ start_server() {
 "$MODEL"
 "--disable-log-requests"
 "--port" "8004"
-"--host" "$HOSTNAME"
 "--gpu-memory-utilization" "$gpu_memory_utilization"
 "--max-num-seqs" "$max_num_seqs"
 "--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -102,9 +96,8 @@ start_server() {
 # This correctly passes each element as a separate argument.
 if [[ -n "$profile_dir" ]]; then
 # Start server with profiling enabled
-local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
-VLLM_SERVER_DEV_MODE=1 \
-vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 else
 # Start server without profiling
 VLLM_SERVER_DEV_MODE=1 \
@@ -119,7 +112,7 @@ start_server() {
 # since that we should always have permission to send signal to the server process.
 kill -0 $server_pid 2> /dev/null || break

-RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
+RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
 STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
 if [[ "$STATUS_CODE" -eq 200 ]]; then
 server_started=1
@@ -179,7 +172,6 @@ run_benchmark() {
 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 --num-prompts 1000 \
 --random-prefix-len $prefix_len \
---host "$HOSTNAME" \
 --port 8004 &> "$bm_log"
 throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -195,7 +187,7 @@ run_benchmark() {
 request_rate=$((${throughput%.*} + 1))
 while ((request_rate > 0)); do
 # clear prefix cache
-curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
+curl -X POST http://0.0.0.0:8004/reset_prefix_cache
 sleep 5
 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
 vllm bench serve \
@@ -211,7 +203,6 @@ run_benchmark() {
 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 --num-prompts 100 \
 --random-prefix-len $prefix_len \
---host "$HOSTNAME" \
 --port 8004 &> "$bm_log"
 throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -312,7 +303,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 --num-prompts 100 \
 --random-prefix-len $prefix_len \
---host "$HOSTNAME" \
 --port 8004 \
 --profile &> "$bm_log"
 else
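For reference, the benchmark loop in this script starts from the measured request throughput and walks the request rate down until the P99 end-to-end latency fits the configured budget. A simplified Python sketch of that search follows; the `run_benchmark` callable is a stand-in, not part of the repository:

```python
# Simplified sketch of auto_tune.sh's request-rate search: step down from the
# measured throughput until P99 end-to-end latency fits MAX_LATENCY_ALLOWED_MS.
from collections.abc import Callable

def tune_request_rate(
    run_benchmark: Callable[[int], tuple[float, float]],  # rate -> (throughput, p99_e2el_ms)
    start_rate: int,
    max_latency_ms: float,
) -> int | None:
    rate = start_rate
    while rate > 0:
        _, p99_e2el_ms = run_benchmark(rate)
        if p99_e2el_ms <= max_latency_ms:
            return rate  # highest request rate that still meets the latency goal
        rate -= 1
    return None
```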
@@ -620,7 +620,7 @@ def get_tokenizer(
 kwargs["use_fast"] = False
 if tokenizer_mode == "mistral":
 try:
-from vllm.tokenizers.mistral import MistralTokenizer
+from vllm.transformers_utils.tokenizer import MistralTokenizer
 except ImportError as e:
 raise ImportError(
 "MistralTokenizer requires vllm package.\n"
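A hedged usage sketch of the function being patched here. The import location differs between the two sides of this diff, so the example mirrors the try/except fallback the benchmark scripts below use; the model name is only an example.

```python
# Usage sketch only; the import path depends on the vLLM version, hence the fallback.
try:
    from vllm.tokenizers import get_tokenizer
except ImportError:
    from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer("mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral")
print(len(tokenizer.encode("Hello from a Mistral tokenizer")))
```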
@@ -1,120 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.

This focuses on a single test payload shaped like the prefix-cache hash input:
    (32-byte bytes object, 32-int tuple)

Usage:
    python benchmarks/hash_micro_benchmark.py --iterations 20000
"""

from __future__ import annotations

import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable

from vllm.utils.hashing import sha256, xxhash


def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
    """Generate a deterministic test payload."""
    random.seed(seed)
    bytes_data = bytes(random.getrandbits(8) for _ in range(32))
    int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
    return (bytes_data, int_tuple)


def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
    """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
    times: list[float] = []

    # Warm-up to avoid first-run noise.
    for _ in range(200):
        func(data)

    for _ in range(iterations):
        start = time.perf_counter()
        func(data)
        end = time.perf_counter()
        times.append(end - start)

    avg = statistics.mean(times)
    std = statistics.stdev(times) if len(times) > 1 else 0.0
    return avg, std


def _run_benchmarks(
    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
    data: tuple,
    iterations: int,
):
    """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
    for name, func in benchmarks:
        try:
            avg, std = _benchmark_func(func, data, iterations)
        except ModuleNotFoundError as exc:
            print(f"Skipping {name}: {exc}")
            continue
        yield name, avg, std


def builtin_hash(data: tuple) -> int:
    """Wrapper for Python's built-in hash()."""
    return hash(data)


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()

    data = _generate_test_data(args.seed)
    benchmarks = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        ("built-in hash()", builtin_hash),
    )

    print("=" * 60)
    print("HASH FUNCTION MICRO BENCHMARK")
    print("=" * 60)
    print("Test data: (32-byte bytes object, 32-int tuple)")
    print(f"Iterations: {args.iterations:,}")
    print("=" * 60)

    results = list(_run_benchmarks(benchmarks, data, args.iterations))
    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)

    print("\nResults:")
    for name, avg, std in results:
        print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")

    if builtin_entry:
        _, builtin_avg, _ = builtin_entry
        print("\n" + "=" * 60)
        print("SUMMARY (relative to built-in hash())")
        print("=" * 60)
        for name, avg, _ in results:
            if name == "built-in hash()":
                continue
            speed_ratio = avg / builtin_avg
            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
    else:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")


if __name__ == "__main__":
    main()
@@ -32,11 +32,12 @@ def benchmark_propose(args):

 model_config = ModelConfig(
 model="facebook/opt-125m",
+task="generate",
 max_model_len=args.num_token + args.num_spec_token,
 tokenizer="facebook/opt-125m",
 tokenizer_mode="auto",
 dtype="auto",
-seed=0,
+seed=None,
 trust_remote_code=False,
 )
 proposer = NgramProposer(
@@ -107,10 +108,7 @@ def benchmark_batched_propose(args):
 device_config=DeviceConfig(device=current_platform.device_type),
 parallel_config=ParallelConfig(),
 load_config=LoadConfig(),
-scheduler_config=SchedulerConfig(
-max_model_len=model_config.max_model_len,
-is_encoder_decoder=model_config.is_encoder_decoder,
-),
+scheduler_config=SchedulerConfig(),
 )

 # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
@@ -1,110 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
Simple benchmark to compare prefix-cache block hashing algorithms.

Example:
    python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
"""

from __future__ import annotations

import argparse
import random
import statistics
import sys
import time
from collections.abc import Callable, Iterable, Sequence

from vllm.utils.hashing import get_hash_fn_by_name
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash

SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")


def _generate_blocks(
    num_blocks: int, block_size: int, vocab_size: int, seed: int
) -> list[list[int]]:
    rng = random.Random(seed)
    return [
        [rng.randrange(vocab_size) for _ in range(block_size)]
        for _ in range(num_blocks)
    ]


def _hash_all_blocks(
    hash_fn: Callable[[object], bytes],
    blocks: Iterable[Sequence[int]],
) -> float:
    parent_hash: BlockHash | None = None
    start = time.perf_counter()
    for block in blocks:
        parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
    end = time.perf_counter()
    return end - start


def _benchmark(
    hash_algo: str,
    blocks: list[list[int]],
    trials: int,
) -> tuple[float, float, float] | None:
    try:
        hash_fn = get_hash_fn_by_name(hash_algo)
        init_none_hash(hash_fn)
        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
    except ModuleNotFoundError as exc:
        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
        return None

    avg = statistics.mean(timings)
    best = min(timings)
    # throughput: tokens / second
    tokens_hashed = len(blocks) * len(blocks[0])
    throughput = tokens_hashed / best
    return avg, best, throughput


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
    parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
    parser.add_argument(
        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    parser.add_argument(
        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
    )
    parser.add_argument(
        "--algorithms",
        nargs="+",
        default=SUPPORTED_ALGOS,
        choices=SUPPORTED_ALGOS,
        help="Hash algorithms to benchmark.",
    )
    args = parser.parse_args()

    blocks = _generate_blocks(
        args.num_blocks, args.block_size, args.vocab_size, args.seed
    )
    print(
        f"Benchmarking {len(args.algorithms)} algorithms on "
        f"{args.num_blocks} blocks (block size={args.block_size})."
    )

    for algo in args.algorithms:
        result = _benchmark(algo, blocks, args.trials)
        if result is None:
            continue

        avg, best, throughput = result
        print(
            f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
            f"throughput: {throughput / 1e6:.2f}M tokens/s"
        )


if __name__ == "__main__":
    main()
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
-from vllm.tokenizers import get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
 from backend_request_func import get_tokenizer

@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

 try:
-from vllm.tokenizers import get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
 from backend_request_func import get_tokenizer

@@ -574,7 +574,7 @@ async def benchmark(
 )
 print(
 "{:<40} {:<10.2f}".format(
-"Total token throughput (tok/s):", metrics.total_token_throughput
+"Total Token throughput (tok/s):", metrics.total_token_throughput
 )
 )

@@ -963,7 +963,8 @@ def create_argument_parser():
 parser.add_argument(
 "--profile",
 action="store_true",
-help="Use vLLM Profiling. --profiler-config must be provided on the server.",
+help="Use Torch Profiler. The endpoint must be launched with "
+"VLLM_TORCH_PROFILER_DIR to enable profiler.",
 )
 parser.add_argument(
 "--result-dir",
@@ -5,12 +5,11 @@ import argparse
 import asyncio
 import logging
 import os
-import time
-import uuid
-from urllib.parse import urlparse

 import aiohttp
 from quart import Quart, Response, make_response, request
+from rate_limiter import RateLimiter
+from request_queue import RequestQueue

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -25,8 +24,26 @@ def parse_args():
 parser.add_argument(
 "--timeout",
 type=float,
-default=6 * 60 * 60,
-help="Timeout for backend service requests in seconds (default: 21600)",
+default=300,
+help="Timeout for backend service requests in seconds (default: 300)",
+)
+parser.add_argument(
+"--max-concurrent",
+type=int,
+default=100,
+help="Maximum concurrent requests to backend services (default: 100)",
+)
+parser.add_argument(
+"--queue-size",
+type=int,
+default=500,
+help="Maximum number of requests in the queue (default: 500)",
+)
+parser.add_argument(
+"--rate-limit",
+type=int,
+default=40,
+help="Maximum requests per second (default: 40)",
 )
 parser.add_argument(
 "--port",
@@ -37,32 +54,14 @@ def parse_args():
 parser.add_argument(
 "--prefill-url",
 type=str,
-default="http://localhost:8100",
-help="Prefill service base URL (protocol + host[:port])",
+default="http://localhost:8100/v1/completions",
+help="Prefill service endpoint URL",
 )
 parser.add_argument(
 "--decode-url",
 type=str,
-default="http://localhost:8200",
-help="Decode service base URL (protocol + host[:port])",
+default="http://localhost:8200/v1/completions",
+help="Decode service endpoint URL",
 )
-parser.add_argument(
-"--kv-host",
-type=str,
-default="localhost",
-help="Hostname or IP used by KV transfer (default: localhost)",
-)
-parser.add_argument(
-"--prefill-kv-port",
-type=int,
-default=14579,
-help="Prefill KV port (default: 14579)",
-)
-parser.add_argument(
-"--decode-kv-port",
-type=int,
-default=14580,
-help="Decode KV port (default: 14580)",
-)

 return parser.parse_args()
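The argparse changes above are easier to follow with the proxy's request flow in mind: a request is first sent to the prefill service with max_tokens forced to 1 (only the KV cache matters), and the real completion is then streamed from the decode service, as the handler code further down describes. A schematic sketch of that two-stage flow, assuming illustrative helper names that are not the proxy's actual code:

```python
# Illustrative sketch of the prefill-then-decode proxy flow.
import aiohttp

async def prefill_then_decode(prefill_url: str, decode_url: str, request_data: dict):
    prefill_request = dict(request_data, max_tokens=1)
    async with aiohttp.ClientSession() as session:
        # Prefill stage: the response body is ignored, only the KV cache matters.
        async with session.post(prefill_url, json=prefill_request) as resp:
            await resp.read()
        # Decode stage: stream the completion chunks back to the caller.
        async with session.post(decode_url, json=request_data) as resp:
            async for chunk in resp.content.iter_chunked(1024):
                yield chunk
```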
@@ -74,129 +73,70 @@ def main():
|
|||||||
|
|
||||||
# Initialize configuration using command line parameters
|
# Initialize configuration using command line parameters
|
||||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
|
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
|
||||||
|
MAX_CONCURRENT_REQUESTS = args.max_concurrent
|
||||||
|
REQUEST_QUEUE_SIZE = args.queue_size
|
||||||
|
RATE_LIMIT = args.rate_limit
|
||||||
PREFILL_SERVICE_URL = args.prefill_url
|
PREFILL_SERVICE_URL = args.prefill_url
|
||||||
DECODE_SERVICE_URL = args.decode_url
|
DECODE_SERVICE_URL = args.decode_url
|
||||||
PORT = args.port
|
PORT = args.port
|
||||||
|
|
||||||
PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
|
|
||||||
DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"Proxy resolved KV addresses -> prefill: %s, decode: %s",
|
|
||||||
PREFILL_KV_ADDR,
|
|
||||||
DECODE_KV_ADDR,
|
|
||||||
)
|
|
||||||
|
|
||||||
app = Quart(__name__)
|
app = Quart(__name__)
|
||||||
|
|
||||||
# Attach the configuration object to the application instance so helper
|
# Initialize the rate limiter and request queue
|
||||||
# coroutines can read the resolved backend URLs and timeouts without using
|
rate_limiter = RateLimiter(RATE_LIMIT)
|
||||||
# globals.
|
request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
|
||||||
|
|
||||||
|
# Attach the configuration object to the application instance
|
||||||
app.config.update(
|
app.config.update(
|
||||||
{
|
{
|
||||||
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
|
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
|
||||||
|
"rate_limiter": rate_limiter,
|
||||||
|
"request_queue": request_queue,
|
||||||
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
|
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
|
||||||
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
|
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
|
||||||
"PREFILL_KV_ADDR": PREFILL_KV_ADDR,
|
|
||||||
"DECODE_KV_ADDR": DECODE_KV_ADDR,
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
def _normalize_base_url(url: str) -> str:
|
# Start queue processing on app startup
|
||||||
"""Remove any trailing slash so path joins behave predictably."""
|
@app.before_serving
|
||||||
return url.rstrip("/")
|
async def startup():
|
||||||
|
"""Start request processing task when app starts serving"""
|
||||||
|
asyncio.create_task(request_queue.process())
|
||||||
|
|
||||||
def _get_host_port(url: str) -> str:
|
async def forward_request(url, data):
|
||||||
"""Return the hostname:port portion for logging and KV headers."""
|
"""Forward request to backend service with rate limiting and error handling"""
|
||||||
parsed = urlparse(url)
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
host = parsed.hostname or "localhost"
|
|
||||||
port = parsed.port
|
|
||||||
if port is None:
|
|
||||||
port = 80 if parsed.scheme == "http" else 443
|
|
||||||
return f"{host}:{port}"
|
|
||||||
|
|
||||||
PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
|
# Use rate limiter as context manager
|
||||||
DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
|
async with (
|
||||||
KV_TARGET = _get_host_port(DECODE_SERVICE_URL)
|
rate_limiter,
|
||||||
|
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
||||||
def _build_headers(request_id: str) -> dict[str, str]:
|
):
|
||||||
"""Construct the headers expected by vLLM's P2P disagg connector."""
|
try:
|
||||||
headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
|
async with session.post(
|
||||||
api_key = os.environ.get("OPENAI_API_KEY")
|
url=url, json=data, headers=headers
|
||||||
if api_key:
|
) as response:
|
||||||
headers["Authorization"] = f"Bearer {api_key}"
|
if response.status == 200:
|
||||||
return headers
|
# Stream response chunks
|
||||||
|
async for chunk_bytes in response.content.iter_chunked(1024):
|
||||||
async def _run_prefill(
|
yield chunk_bytes
|
||||||
request_path: str,
|
else:
|
||||||
payload: dict,
|
# Handle backend service errors
|
||||||
headers: dict[str, str],
|
error_text = await response.text()
|
||||||
request_id: str,
|
logger.error(
|
||||||
):
|
"Backend service error: %s - %s",
|
||||||
url = f"{PREFILL_BASE}{request_path}"
|
response.status,
|
||||||
start_ts = time.perf_counter()
|
error_text,
|
||||||
logger.info("[prefill] start request_id=%s url=%s", request_id, url)
|
)
|
||||||
try:
|
yield b'{"error": "Backend service error"}'
|
||||||
async with (
|
except aiohttp.ClientError as e:
|
||||||
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
# Handle connection errors
|
||||||
session.post(url=url, json=payload, headers=headers) as resp,
|
logger.error("Connection error to %s: %s", url, str(e))
|
||||||
):
|
yield b'{"error": "Service unavailable"}'
|
||||||
if resp.status != 200:
|
except asyncio.TimeoutError:
|
||||||
error_text = await resp.text()
|
# Handle timeout errors
|
||||||
raise RuntimeError(
|
logger.error("Timeout connecting to %s", url)
|
||||||
f"Prefill backend error {resp.status}: {error_text}"
|
yield b'{"error": "Service timeout"}'
|
||||||
)
|
|
||||||
await resp.read()
|
|
||||||
logger.info(
|
|
||||||
"[prefill] done request_id=%s status=%s elapsed=%.2fs",
|
|
||||||
request_id,
|
|
||||||
resp.status,
|
|
||||||
time.perf_counter() - start_ts,
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError as exc:
|
|
||||||
raise RuntimeError(f"Prefill service timeout at {url}") from exc
|
|
||||||
except aiohttp.ClientError as exc:
|
|
||||||
raise RuntimeError(f"Prefill service unavailable at {url}") from exc
|
|
||||||
|
|
||||||
async def _stream_decode(
|
|
||||||
request_path: str,
|
|
||||||
payload: dict,
|
|
||||||
headers: dict[str, str],
|
|
||||||
request_id: str,
|
|
||||||
):
|
|
||||||
url = f"{DECODE_BASE}{request_path}"
|
|
||||||
# Stream tokens from the decode service once the prefill stage has
|
|
||||||
# materialized KV caches on the target workers.
|
|
||||||
logger.info("[decode] start request_id=%s url=%s", request_id, url)
|
|
||||||
try:
|
|
||||||
async with (
|
|
||||||
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
|
||||||
session.post(url=url, json=payload, headers=headers) as resp,
|
|
||||||
):
|
|
||||||
if resp.status != 200:
|
|
||||||
error_text = await resp.text()
|
|
||||||
logger.error(
|
|
||||||
"Decode backend error %s - %s", resp.status, error_text
|
|
||||||
)
|
|
||||||
err_msg = (
|
|
||||||
'{"error": "Decode backend error ' + str(resp.status) + '"}'
|
|
||||||
)
|
|
||||||
yield err_msg.encode()
|
|
||||||
return
|
|
||||||
logger.info(
|
|
||||||
"[decode] streaming response request_id=%s status=%s",
|
|
||||||
request_id,
|
|
||||||
resp.status,
|
|
||||||
)
|
|
||||||
async for chunk_bytes in resp.content.iter_chunked(1024):
|
|
||||||
yield chunk_bytes
|
|
||||||
logger.info("[decode] finished streaming request_id=%s", request_id)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
logger.error("Decode service timeout at %s", url)
|
|
||||||
yield b'{"error": "Decode service timeout"}'
|
|
||||||
except aiohttp.ClientError as exc:
|
|
||||||
logger.error("Decode service error at %s: %s", url, exc)
|
|
||||||
yield b'{"error": "Decode service unavailable"}'
|
|
||||||
|
|
||||||
async def process_request():
|
async def process_request():
|
||||||
"""Process a single request through prefill and decode stages"""
|
"""Process a single request through prefill and decode stages"""
|
||||||
@@ -206,27 +146,13 @@ def main():
|
|||||||
# Create prefill request (max_tokens=1)
|
# Create prefill request (max_tokens=1)
|
||||||
prefill_request = original_request_data.copy()
|
prefill_request = original_request_data.copy()
|
||||||
prefill_request["max_tokens"] = 1
|
prefill_request["max_tokens"] = 1
|
||||||
if "max_completion_tokens" in prefill_request:
|
|
||||||
prefill_request["max_completion_tokens"] = 1
|
|
||||||
|
|
||||||
# Execute prefill stage
|
# Execute prefill stage
|
||||||
# The request id encodes both KV socket addresses so the backend can
|
async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
|
||||||
# shuttle tensors directly via NCCL once the prefill response
|
continue
|
||||||
# completes.
|
|
||||||
request_id = (
|
|
||||||
f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
|
|
||||||
f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
|
|
||||||
)
|
|
||||||
|
|
||||||
headers = _build_headers(request_id)
|
|
||||||
await _run_prefill(request.path, prefill_request, headers, request_id)
|
|
||||||
|
|
||||||
# Execute decode stage and stream response
|
# Execute decode stage and stream response
|
||||||
# Pass the unmodified user request so the decode phase can continue
|
generator = forward_request(DECODE_SERVICE_URL, original_request_data)
|
||||||
# sampling with the already-populated KV cache.
|
|
||||||
generator = _stream_decode(
|
|
||||||
request.path, original_request_data, headers, request_id
|
|
||||||
)
|
|
||||||
response = await make_response(generator)
|
response = await make_response(generator)
|
||||||
response.timeout = None # Disable timeout for streaming response
|
response.timeout = None # Disable timeout for streaming response
|
||||||
return response
|
return response
|
||||||
@@ -242,10 +168,23 @@ def main():
|
|||||||
@app.route("/v1/completions", methods=["POST"])
|
@app.route("/v1/completions", methods=["POST"])
|
||||||
async def handle_request():
|
async def handle_request():
|
||||||
"""Handle incoming API requests with concurrency and rate limiting"""
|
"""Handle incoming API requests with concurrency and rate limiting"""
|
||||||
|
# Create task for request processing
|
||||||
|
task = asyncio.create_task(process_request())
|
||||||
|
|
||||||
|
# Enqueue request or reject if queue is full
|
||||||
|
if not await request_queue.enqueue(task):
|
||||||
|
return Response(
|
||||||
|
response=b'{"error": "Server busy, try again later"}',
|
||||||
|
status=503,
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return await process_request()
|
# Return the response from the processing task
|
||||||
|
return await task
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
logger.warning("Request cancelled")
|
# Handle task cancellation (timeout or queue full)
|
||||||
|
logger.warning("Request cancelled due to timeout or queue full")
|
||||||
return Response(
|
return Response(
|
||||||
response=b'{"error": "Request cancelled"}',
|
response=b'{"error": "Request cancelled"}',
|
||||||
status=503,
|
status=503,
|
||||||
|
|||||||
@@ -14,9 +14,6 @@ from tqdm import tqdm
|
|||||||
|
|
||||||
import vllm._custom_ops as ops
|
import vllm._custom_ops as ops
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
|
||||||
per_token_group_quant_fp8,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -25,7 +22,6 @@ class bench_params_t:
|
|||||||
hidden_size: int
|
hidden_size: int
|
||||||
add_residual: bool
|
add_residual: bool
|
||||||
dtype: torch.dtype
|
dtype: torch.dtype
|
||||||
group_size: list[int]
|
|
||||||
|
|
||||||
def description(self):
|
def description(self):
|
||||||
return (
|
return (
|
||||||
@@ -33,7 +29,6 @@ class bench_params_t:
|
|||||||
f"x D {self.hidden_size} "
|
f"x D {self.hidden_size} "
|
||||||
f"x R {self.add_residual} "
|
f"x R {self.add_residual} "
|
||||||
f"x DT {self.dtype}"
|
f"x DT {self.dtype}"
|
||||||
f"x GS {self.group_size}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -43,11 +38,10 @@ def get_bench_params() -> list[bench_params_t]:
|
|||||||
HIDDEN_SIZES = list(range(1024, 8129, 1024))
|
HIDDEN_SIZES = list(range(1024, 8129, 1024))
|
||||||
ADD_RESIDUAL = [True, False]
|
ADD_RESIDUAL = [True, False]
|
||||||
DTYPES = [torch.bfloat16, torch.float]
|
DTYPES = [torch.bfloat16, torch.float]
|
||||||
GROUP_SIZES = [[1, 64], [1, 128]]
|
|
||||||
|
|
||||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
|
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
|
||||||
bench_params = list(
|
bench_params = list(
|
||||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
|
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
|
||||||
)
|
)
|
||||||
return bench_params
|
return bench_params
|
||||||
|
|
||||||
@@ -58,7 +52,6 @@ def unfused_int8_impl(
|
|||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: torch.Tensor | None,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
torch_out = None
|
torch_out = None
|
||||||
@@ -76,7 +69,6 @@ def unfused_fp8_impl(
|
|||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: torch.Tensor | None,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
torch_out = None
|
torch_out = None
|
||||||
@@ -89,63 +81,23 @@ def unfused_fp8_impl(
|
|||||||
torch_out, _ = ops.scaled_fp8_quant(torch_out)
|
torch_out, _ = ops.scaled_fp8_quant(torch_out)
|
||||||
|
|
||||||
|
|
||||||
def unfused_groupwise_fp8_impl(
|
|
||||||
rms_norm_layer: RMSNorm,
|
|
||||||
x: torch.Tensor,
|
|
||||||
residual: torch.Tensor | None,
|
|
||||||
quant_dtype: torch.dtype,
|
|
||||||
group_size: list[int],
|
|
||||||
):
|
|
||||||
# Norm
|
|
||||||
torch_out = None
|
|
||||||
if residual is None:
|
|
||||||
torch_out = rms_norm_layer.forward_cuda(x, residual)
|
|
||||||
else:
|
|
||||||
torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
|
|
||||||
|
|
||||||
# Quant
|
|
||||||
torch_out, _ = per_token_group_quant_fp8(
|
|
||||||
torch_out, group_size=group_size[1], use_ue8m0=False
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def fused_impl(
|
def fused_impl(
|
||||||
rms_norm_layer: RMSNorm, # this stores the weights
|
rms_norm_layer: RMSNorm, # this stores the weights
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: torch.Tensor | None,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
):
|
):
|
||||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||||
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
|
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def fused_groupwise_impl(
|
|
||||||
rms_norm_layer: RMSNorm, # this stores the weights
|
|
||||||
x: torch.Tensor,
|
|
||||||
residual: torch.Tensor | None,
|
|
||||||
quant_dtype: torch.dtype,
|
|
||||||
group_size: list[int],
|
|
||||||
):
|
|
||||||
out, _ = ops.rms_norm_per_block_quant(
|
|
||||||
x,
|
|
||||||
rms_norm_layer.weight,
|
|
||||||
1e-6,
|
|
||||||
quant_dtype,
|
|
||||||
group_size,
|
|
||||||
residual=residual,
|
|
||||||
is_scale_transposed=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Bench functions
|
# Bench functions
|
||||||
def bench_fn(
|
def bench_fn(
|
||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor,
|
residual: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
fn: Callable,
|
fn: Callable,
|
||||||
@@ -158,11 +110,10 @@ def bench_fn(
|
|||||||
"x": x,
|
"x": x,
|
||||||
"residual": residual,
|
"residual": residual,
|
||||||
"quant_dtype": quant_dtype,
|
"quant_dtype": quant_dtype,
|
||||||
"group_size": group_size,
|
|
||||||
"fn": fn,
|
"fn": fn,
|
||||||
}
|
}
|
||||||
return TBenchmark.Timer(
|
return TBenchmark.Timer(
|
||||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
|
stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
@@ -196,7 +147,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.int8,
|
torch.int8,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
unfused_int8_impl,
|
unfused_int8_impl,
|
||||||
@@ -211,7 +161,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.float8_e4m3fn,
|
torch.float8_e4m3fn,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
unfused_fp8_impl,
|
unfused_fp8_impl,
|
||||||
@@ -226,7 +175,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.int8,
|
torch.int8,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
fused_impl,
|
fused_impl,
|
||||||
@@ -241,7 +189,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.float8_e4m3fn,
|
torch.float8_e4m3fn,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
fused_impl,
|
fused_impl,
|
||||||
@@ -249,36 +196,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# unfused groupwise fp8 impl.
|
|
||||||
timers.append(
|
|
||||||
bench_fn(
|
|
||||||
layer,
|
|
||||||
x,
|
|
||||||
residual,
|
|
||||||
torch.float8_e4m3fn,
|
|
||||||
params.group_size,
|
|
||||||
label,
|
|
||||||
sub_label,
|
|
||||||
unfused_groupwise_fp8_impl,
|
|
||||||
"unfused_groupwise_fp8_impl",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# fused groupwise fp8 impl.
|
|
||||||
timers.append(
|
|
||||||
bench_fn(
|
|
||||||
layer,
|
|
||||||
x,
|
|
||||||
residual,
|
|
||||||
torch.float8_e4m3fn,
|
|
||||||
params.group_size,
|
|
||||||
label,
|
|
||||||
sub_label,
|
|
||||||
fused_groupwise_impl,
|
|
||||||
"fused_groupwise_fp8_impl",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
print_timers(timers)
|
print_timers(timers)
|
||||||
|
|
||||||
return timers
|
return timers
|
||||||
|
|||||||
@@ -1,244 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from enum import Enum
|
|
||||||
from itertools import product
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.utils.benchmark as TBenchmark
|
|
||||||
from torch.utils.benchmark import Measurement as TMeasurement
|
|
||||||
|
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
|
||||||
_per_token_group_quant_fp8_colmajor,
|
|
||||||
silu_mul_per_token_group_quant_fp8_colmajor,
|
|
||||||
)
|
|
||||||
from vllm.triton_utils import triton
|
|
||||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
|
||||||
|
|
||||||
from .utils import ArgPool, Bench, CudaGraphBenchParams
|
|
||||||
|
|
||||||
GROUP_SIZE = 128
|
|
||||||
FLOAT8_T = torch.float8_e4m3fn
|
|
||||||
|
|
||||||
|
|
||||||
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
|
|
||||||
print(
|
|
||||||
f"Note : The timings reported above is for {cuda_graph_nops} "
|
|
||||||
"consecutive invocations of the benchmarking functions. "
|
|
||||||
f"Please divide by {cuda_graph_nops} for single invocation "
|
|
||||||
"timings."
|
|
||||||
)
|
|
||||||
compare = TBenchmark.Compare(timers)
|
|
||||||
compare.print()
|
|
||||||
|
|
||||||
|
|
||||||
class ImplType(Enum):
|
|
||||||
SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
|
|
||||||
REFERENCE = 2
|
|
||||||
|
|
||||||
def get_impl(self):
|
|
||||||
if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
|
||||||
return silu_mul_per_token_group_quant_fp8_colmajor
|
|
||||||
elif self == ImplType.REFERENCE:
|
|
||||||
return reference
|
|
||||||
raise ValueError(f"Unrecognized ImplType {self}")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class BenchmarkTensors:
|
|
||||||
input: torch.Tensor
|
|
||||||
output: torch.Tensor
|
|
||||||
|
|
||||||
# Reference act output tensor
|
|
||||||
ref_act_out: torch.Tensor
|
|
||||||
ref_quant_out: torch.Tensor
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def make(T: int, N: int) -> "BenchmarkTensors":
|
|
||||||
assert T % GROUP_SIZE == 0
|
|
||||||
assert N % (GROUP_SIZE * 2) == 0
|
|
||||||
|
|
||||||
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
|
|
||||||
|
|
||||||
# silu_mul_per_token_group_quant_fp8_colmajor output.
|
|
||||||
output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
|
|
||||||
FLOAT8_T
|
|
||||||
)
|
|
||||||
|
|
||||||
# reference output.
|
|
||||||
ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
|
|
||||||
ref_quant_out = torch.empty(
|
|
||||||
(T, N // 2), dtype=torch.bfloat16, device="cuda"
|
|
||||||
).to(FLOAT8_T)
|
|
||||||
|
|
||||||
return BenchmarkTensors(
|
|
||||||
input=input,
|
|
||||||
output=output,
|
|
||||||
ref_act_out=ref_act_out,
|
|
||||||
ref_quant_out=ref_quant_out,
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def T(self):
|
|
||||||
return self.input.size(0)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def N(self):
|
|
||||||
return self.input.size(1)
|
|
||||||
|
|
||||||
def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
|
|
||||||
if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
|
||||||
return {
|
|
||||||
"input": self.input,
|
|
||||||
"output": self.output,
|
|
||||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
|
||||||
}
|
|
||||||
elif impl_type == ImplType.REFERENCE:
|
|
||||||
return {
|
|
||||||
"input": self.input,
|
|
||||||
"act_out": self.ref_act_out,
|
|
||||||
"quant_out": self.ref_quant_out,
|
|
||||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
|
||||||
}
|
|
||||||
raise ValueError(f"Unrecognized impl_type {impl_type}")
|
|
||||||
|
|
||||||
|
|
||||||
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
|
|
||||||
"""
|
|
||||||
Reference triton quant kernel from,
|
|
||||||
vllm.model_executor.layers.quantization.utils.fp8_utils
|
|
||||||
"""
|
|
||||||
assert quant_out.size() == x.size()
|
|
||||||
# Allocate the scale tensor column-major format.
|
|
||||||
shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
|
|
||||||
x_q = quant_out
|
|
||||||
x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
|
|
||||||
|
|
||||||
M = x.numel() // GROUP_SIZE
|
|
||||||
N = GROUP_SIZE
|
|
||||||
BLOCK = triton.next_power_of_2(N)
|
|
||||||
# heuristics for number of warps
|
|
||||||
num_warps = min(max(BLOCK // 256, 1), 8)
|
|
||||||
num_stages = 1
|
|
||||||
|
|
||||||
finfo = torch.finfo(FLOAT8_T)
|
|
||||||
fp8_min = finfo.min
|
|
||||||
fp8_max = finfo.max
|
|
||||||
|
|
||||||
_per_token_group_quant_fp8_colmajor[(M,)](
|
|
||||||
x,
|
|
||||||
x_q,
|
|
||||||
x_s,
|
|
||||||
GROUP_SIZE,
|
|
||||||
x.shape[1],
|
|
||||||
x.stride(0),
|
|
||||||
x_s.stride(1),
|
|
||||||
eps=1e-10,
|
|
||||||
fp8_min=fp8_min,
|
|
||||||
fp8_max=fp8_max,
|
|
||||||
use_ue8m0=use_ue8m0,
|
|
||||||
BLOCK=BLOCK,
|
|
||||||
num_warps=num_warps,
|
|
||||||
num_stages=num_stages,
|
|
||||||
)
|
|
||||||
return x_q, x_s
|
|
||||||
|
|
||||||
|
|
||||||
def reference(
|
|
||||||
input: torch.Tensor,
|
|
||||||
act_out: torch.Tensor,
|
|
||||||
quant_out: torch.Tensor,
|
|
||||||
use_ue8m0: bool,
|
|
||||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
|
||||||
torch.ops._C.silu_and_mul(act_out, input)
|
|
||||||
return reference_quant(act_out, quant_out, use_ue8m0)
|
|
||||||
|
|
||||||
|
|
||||||
def bench_impl(
|
|
||||||
bench_tensors: list[BenchmarkTensors], impl_type: ImplType
|
|
||||||
) -> TMeasurement:
|
|
||||||
T = bench_tensors[0].T
|
|
||||||
N = bench_tensors[0].N
|
|
||||||
|
|
||||||
arg_pool_size = len(bench_tensors)
|
|
||||||
kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
|
|
||||||
|
|
||||||
# warmup
|
|
||||||
for kwargs in kwargs_list:
|
|
||||||
impl_type.get_impl()(**kwargs)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
|
||||||
for _kwargs in kwargs_list:
|
|
||||||
for k, v in _kwargs.items():
|
|
||||||
kwargs[k].values.append(v)
|
|
||||||
|
|
||||||
cuda_graph_params = None
|
|
||||||
cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
|
|
||||||
timer = None
|
|
||||||
with Bench(
|
|
||||||
cuda_graph_params,
|
|
||||||
"silu-mul-quant",
|
|
||||||
f"num_tokens={T}, N={N}",
|
|
||||||
impl_type.name,
|
|
||||||
impl_type.get_impl(),
|
|
||||||
**kwargs,
|
|
||||||
) as bench:
|
|
||||||
timer = bench.run()
|
|
||||||
return timer
|
|
||||||
|
|
||||||
|
|
||||||
def test_correctness(T: int, N: int):
|
|
||||||
print(f"Testing num_tokens={T}, N={N} ...")
|
|
||||||
|
|
||||||
bench_tensor = BenchmarkTensors.make(T, N)
|
|
||||||
|
|
||||||
def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
|
|
||||||
return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
|
|
||||||
|
|
||||||
# reference output
|
|
||||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
|
||||||
|
|
||||||
# test ouptut
|
|
||||||
out_q, out_s = output_from_impl(
|
|
||||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
|
||||||
)
|
|
||||||
|
|
||||||
torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
|
|
||||||
torch.testing.assert_close(ref_out_s, out_s)
|
|
||||||
|
|
||||||
|
|
||||||
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
|
|
||||||
timers = []
|
|
||||||
for N, T in product(Ns, Ts):
|
|
||||||
test_correctness(T, N)
|
|
||||||
|
|
||||||
bench_tensors: list[BenchmarkTensors] = [
|
|
||||||
BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
|
|
||||||
]
|
|
||||||
|
|
||||||
silu_mul_quant_timer = bench_impl(
|
|
||||||
bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
|
||||||
)
|
|
||||||
timers.append(silu_mul_quant_timer)
|
|
||||||
reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
|
|
||||||
timers.append(reference_timer)
|
|
||||||
|
|
||||||
print_timers(
|
|
||||||
[silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
|
|
||||||
)
|
|
||||||
|
|
||||||
print_timers(timers, cuda_graph_nops=arg_pool_size)
|
|
||||||
|
|
||||||
return timers
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
|
|
||||||
N = [2048, 4096, 8192]
|
|
||||||
|
|
||||||
print(f"T = {T}, N = {N}")
|
|
||||||
run(T, N, arg_pool_size=8)
|
|
||||||
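For orientation, the operation benchmarked above is silu(x1) * x2 followed by per-token-group FP8 quantization. The snippet below is a hypothetical, pure-PyTorch restatement of that math written for this comparison only (it assumes GROUP_SIZE = 128 and e4m3 FP8, ignores the column-major scale layout and the UE8M0 scale option, and is not the vLLM Triton/CUDA kernel):

# Hedged sketch: what silu-mul + per-token-group FP8 quant computes, in plain PyTorch.
import torch

GROUP_SIZE = 128                      # assumed group size
FLOAT8_T = torch.float8_e4m3fn        # assumed FP8 format


def silu_mul_group_quant_ref(x: torch.Tensor, eps: float = 1e-10):
    # x: [T, N]; split into gate/up halves and apply silu(x1) * x2
    x1, x2 = x.chunk(2, dim=-1)
    y = torch.nn.functional.silu(x1.float()) * x2.float()      # [T, N//2]
    # group the last dimension and compute one scale per group
    g = y.view(*y.shape[:-1], -1, GROUP_SIZE)                   # [T, G, GROUP_SIZE]
    amax = g.abs().amax(dim=-1, keepdim=True).clamp_min(eps)
    scale = amax / torch.finfo(FLOAT8_T).max
    q = (g / scale).clamp(torch.finfo(FLOAT8_T).min,
                          torch.finfo(FLOAT8_T).max).to(FLOAT8_T)
    return q.view_as(y), scale.squeeze(-1)                      # quantized values, per-group scales


if __name__ == "__main__":
    x = torch.randn(256, 2048, dtype=torch.bfloat16, device="cuda")
    q, s = silu_mul_group_quant_ref(x)
    print(q.shape, s.shape)  # [256, 1024], [256, 8]

This is only meant to make the quantization step explicit; the benchmark itself exercises the fused kernel and the Triton reference imported above.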
@@ -255,8 +255,8 @@ def bench_run(
     torch.cuda.synchronize()
 
     # Timing
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies = []
     for _ in range(num_iters):
@@ -237,7 +237,6 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
         b_q_weight=w_q,
         b_bias=None,
         b_scales=w_s,
-        a_scales=None,
         global_scale=None,
         b_zeros=w_zp,
         g_idx=g_idx,
@@ -263,7 +263,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -273,7 +273,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -1,150 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.

This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""

import time
from collections.abc import Callable

import torch

# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128
QK_NOPE_HEAD_DIM = 128
PE_DIM = 64


def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Original torch.cat approach with expand."""
    return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)


def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Optimized direct copy approach (avoids expand + cat overhead)."""
    k = torch.empty(
        (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
        dtype=k_nope.dtype,
        device=k_nope.device,
    )
    k[..., : k_nope.shape[-1]] = k_nope
    k[..., k_nope.shape[-1] :] = k_pe
    return k


def benchmark_method(
    method: Callable,
    k_nope: torch.Tensor,
    k_pe: torch.Tensor,
    num_warmup: int = 10,
    num_iters: int = 100,
) -> float:
    """Benchmark a concatenation method and return mean latency in ms."""
    # Warmup
    for _ in range(num_warmup):
        _ = method(k_nope, k_pe)
    torch.cuda.synchronize()

    # Benchmark
    start = time.perf_counter()
    for _ in range(num_iters):
        _ = method(k_nope, k_pe)
    torch.cuda.synchronize()
    end = time.perf_counter()

    return (end - start) / num_iters * 1000  # Convert to ms


@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
    """Run benchmark for a specific dtype."""
    torch.set_default_device("cuda")

    # Batch sizes to test (powers of 2 from 32 to 65536)
    batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]

    print("=" * 80)
    print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
    print("=" * 80)
    print(
        f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
        f"k_pe=[B, 1, {PE_DIM}]"
    )
    print(f"dtype: {dtype_name}")
    print()
    print(
        f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
        f"{'Speedup':>8} | {'Reduction':>10}"
    )
    print("-" * 70)

    results = []
    for batch_size in batch_sizes:
        # Create input tensors (generate in float32 then convert for FP8 compatibility)
        k_nope = torch.randn(
            batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)
        k_pe = torch.randn(
            batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)

        # Benchmark both methods
        cat_time = benchmark_method(cat_method, k_nope, k_pe)
        direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)

        speedup = cat_time / direct_time
        reduction = (1 - direct_time / cat_time) * 100

        results.append((batch_size, cat_time, direct_time, speedup, reduction))

        print(
            f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
            f"{speedup:>7.2f}x | {reduction:>9.1f}%"
        )

    print("=" * 80)

    # Summary statistics
    speedups = [r[3] for r in results]
    print("\nSpeedup summary:")
    print(f"  Min:  {min(speedups):.2f}x")
    print(f"  Max:  {max(speedups):.2f}x")
    print(f"  Mean: {sum(speedups) / len(speedups):.2f}x")

    # Find crossover point
    crossover_batch = None
    for batch_size, _, _, speedup, _ in results:
        if speedup >= 1.0:
            crossover_batch = batch_size
            break

    print("\nConclusion:")
    if crossover_batch:
        print(f"  - Direct copy becomes beneficial at batch size >= {crossover_batch}")
    # Filter for large batches (>= 512 which is typical for prefill)
    large_batch_speedups = [r[3] for r in results if r[0] >= 512]
    if large_batch_speedups:
        avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
        print(f"  - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
    print("  - MLA prefill typically uses large batches, so optimization is effective")

    return results


@torch.inference_mode()
def main():
    # Test bfloat16
    print("\n")
    run_benchmark(torch.bfloat16, "bfloat16")

    # Test float8_e4m3fn
    print("\n")
    run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")


if __name__ == "__main__":
    main()
@@ -185,8 +185,8 @@ def benchmark_config(
     graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -24,15 +24,12 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
 topk_range = [1, 2, 8]
-ep_size_range = [1, 8]
-configs = list(
-    itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
-)
+configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
 
 
 @triton.testing.perf_report(
     triton.testing.Benchmark(
-        x_names=["num_tokens", "num_experts", "topk", "ep_size"],
+        x_names=["num_tokens", "num_experts", "topk"],
         x_vals=configs,
         line_arg="provider",
         line_vals=["vllm"],
@@ -41,26 +38,16 @@ configs = list(
         args={},
     )
 )
-def benchmark(num_tokens, num_experts, topk, ep_size, provider):
+def benchmark(num_tokens, num_experts, topk, provider):
     """Benchmark function for Triton."""
     block_size = 256
-    torch.cuda.manual_seed_all(0)
     topk_ids = get_topk_ids(num_tokens, num_experts, topk)
 
-    e_map = None
-    if ep_size != 1:
-        local_e = num_experts // ep_size
-        e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
-        e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
-        e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
-
     quantiles = [0.5, 0.2, 0.8]
 
     if provider == "vllm":
         ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size(
-                topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
-            ),
+            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
             quantiles=quantiles,
         )
 
@@ -105,8 +105,8 @@ def benchmark_permute(
     graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -241,8 +241,8 @@ def benchmark_unpermute(
     graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -6,7 +6,7 @@
 #
 # The CSV file (named with current date/time) contains these columns:
 # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
+# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
 # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
 # speedup
 #
@@ -86,8 +86,9 @@ def benchmark_mrope(
     num_heads: int,
     num_kv_heads: int,
     max_position: int = 8192,
+    rope_theta: float = 10000,
     is_neox_style: bool = True,
-    rope_parameters: dict[str, Any] | None = None,
+    rope_scaling: dict[str, Any] = None,
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 0,
     warmup_iter: int = 10,
@@ -99,9 +100,11 @@ def benchmark_mrope(
     # the parameters to compute the q k v size based on tp_size
     mrope_helper_class = get_rope(
         head_size=head_dim,
+        rotary_dim=head_dim,
         max_position=max_position,
+        base=rope_theta,
         is_neox_style=is_neox_style,
-        rope_parameters=rope_parameters,
+        rope_scaling=rope_scaling,
         dtype=dtype,
     ).to(device=device)
 
@@ -200,8 +203,9 @@ def benchmark_mrope(
             num_kv_heads,
             head_dim,
             max_position,
+            rope_theta,
             is_neox_style,
-            str(rope_parameters),
+            str(rope_scaling),
             str(dtype).split(".")[-1],
             torch_stats["mean"],
             torch_stats["median"],
@@ -251,8 +255,9 @@ if __name__ == "__main__":
         "num_kv_heads",
         "head_dim",
         "max_position",
+        "rope_theta",
         "is_neox_style",
-        "rope_parameters",
+        "rope_scaling",
        "dtype",
         "torch_mean",
         "torch_median",
@@ -298,7 +303,7 @@ if __name__ == "__main__":
     q_size = num_heads * head_dim
     kv_size = num_kv_heads * head_dim
     is_neox_style = True
-    rope_parameters = config.rope_parameters
+    rope_theta = config.rope_theta
     max_position = config.max_position_embeddings
 
     for num_tokens in num_tokens_list:
@@ -310,8 +315,9 @@ if __name__ == "__main__":
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            max_position=max_position,
+           rope_theta=rope_theta,
            is_neox_style=is_neox_style,
-           rope_parameters=rope_parameters,
+           rope_scaling=config.rope_scaling,
            dtype=getattr(torch, args.dtype),
            seed=args.seed,
            warmup_iter=args.warmup_iter,
@@ -30,8 +30,8 @@ def _time_cuda(
     fn()
     torch.cuda.synchronize()
 
-    start = torch.Event(enable_timing=True)
-    end = torch.Event(enable_timing=True)
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
 
     start.record()
     for _ in range(bench_iters):
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
     def benchmark(batch_size, seq_len, num_heads, provider):
         dtype = torch.bfloat16
         max_position = 8192
-        rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
-        rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
+        base = 10000
+        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
         rope = rope.to(dtype=dtype, device=device)
         cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
 
@@ -253,8 +253,8 @@ def benchmark(
     )
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     # Benchmark
     latencies: list[float] = []
@@ -127,8 +127,8 @@ def benchmark_decode(
 
     def time_fn(fn, warmup=10, trials=20):
         torch.cuda.synchronize()
-        start = torch.Event(enable_timing=True)
-        end = torch.Event(enable_timing=True)
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
         times = []
         for i in range(warmup):
             fn()
@@ -139,8 +139,8 @@ def benchmark_prefill(
 
     def time_fn(fn, warmup=10, trials=20):
         torch.cuda.synchronize()
-        start = torch.Event(enable_timing=True)
-        end = torch.Event(enable_timing=True)
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
         times = []
         for i in range(warmup):
             fn()
@@ -183,8 +183,8 @@ def benchmark_config(
     run()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -2,7 +2,7 @@
 
 This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
 
-Currently, this just includes dense GEMMs and only works on Hopper GPUs.
+Currently this just includes dense GEMMs and only works on Hopper GPUs.
 
 ## Setup
 
@@ -55,10 +55,6 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
 ----------------------------------------------------------------------------------------------------
 ```
 
-If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
-and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
-benchmark-only runtime so the reported throughput stays comparable).
-
 ### JSON configuration file for synthetic conversations generation
 
 The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
@@ -1076,7 +1076,6 @@ def process_statistics(
     verbose: bool,
     gen_conv_args: GenConvArgs | None = None,
     excel_output: bool = False,
-    warmup_runtime_sec: float | None = None,
 ) -> None:
     if len(client_metrics) == 0:
         logger.info("No samples to process")
@@ -1170,13 +1169,8 @@ def process_statistics(
     # Convert milliseconds to seconds
     runtime_sec = runtime_sec / 1000.0
     requests_per_sec = float(len(df)) / runtime_sec
-    params = {
-        "runtime_sec": runtime_sec,
-        "requests_per_sec": requests_per_sec,
-    }
-    if warmup_runtime_sec is not None:
-        params["warmup_runtime_sec"] = warmup_runtime_sec
-        params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
+    params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
 
     # Generate a summary of relevant metrics (and drop irrelevant data)
     df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1558,8 +1552,6 @@ async def main() -> None:
         url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
     )
 
-    warmup_runtime_sec: float | None = None
-
     # Warm-up step
     if args.warmup_step:
         # Only send a single user prompt from every conversation.
@@ -1574,56 +1566,26 @@ async def main() -> None:
         # all clients should finish their work before exiting
         warmup_bench_args = bench_args._replace(early_stop=False)
 
-        logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
-        warmup_start_ns = time.perf_counter_ns()
+        logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
         conversations, _ = await main_mp(
             warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
         )
-        warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
-        logger.info(
-            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
-            Color.PURPLE,
-            warmup_runtime_sec,
-            warmup_runtime_sec * 1000,
-            Color.RESET,
-        )
-        logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
+        logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
 
     # Run the benchmark
-    benchmark_start_ns = time.perf_counter_ns()
+    start_time = time.perf_counter_ns()
     client_convs, client_metrics = await main_mp(
         client_args, req_args, bench_args, tokenizer, conversations
     )
-    benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
+    total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
 
     # Calculate requests per second
-    requests_per_sec = len(client_metrics) / benchmark_runtime_sec
-    benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
+    total_runtime_sec = total_runtime_ms / 1000.0
+    rps = len(client_metrics) / total_runtime_sec
     logger.info(
-        "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
-        "requests per second: %.3f%s",
-        Color.GREEN,
-        benchmark_runtime_sec,
-        benchmark_runtime_ms,
-        requests_per_sec,
-        Color.RESET,
+        f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
+        f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
     )
-    if warmup_runtime_sec is not None:
-        total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
-        logger.info(
-            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
-            Color.GREEN,
-            warmup_runtime_sec,
-            warmup_runtime_sec * 1000,
-            Color.RESET,
-        )
-        logger.info(
-            "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
-            Color.GREEN,
-            total_runtime_sec,
-            total_runtime_sec * 1000,
-            Color.RESET,
-        )
 
     # Benchmark parameters
     params = {
@@ -1648,7 +1610,6 @@ async def main() -> None:
         verbose=args.verbose,
         gen_conv_args=gen_conv_args,
         excel_output=args.excel_output,
-        warmup_runtime_sec=warmup_runtime_sec,
     )
 
     if args.output_file is not None:
@@ -251,6 +251,17 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
   endif()
 
   # Build ACL with CMake
+  set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
+  set(CMAKE_BUILD_TYPE "Release")
+  set(ARM_COMPUTE_ARCH "armv8.2-a")
+  set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
+  set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
+  set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+  set(ARM_COMPUTE_ENABLE_OPENMP "ON")
+  set(ARM_COMPUTE_ENABLE_WERROR "OFF")
+  set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
+  set(ARM_COMPUTE_BUILD_TESTING "OFF")
 
   set(_cmake_config_cmd
     ${CMAKE_COMMAND} -G Ninja -B build
     -DARM_COMPUTE_BUILD_SHARED_LIB=OFF
@@ -364,7 +375,6 @@ set(VLLM_EXT_SRC
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     set(VLLM_EXT_SRC
         "csrc/cpu/shm.cpp"
-        "csrc/cpu/cpu_wna16.cpp"
         ${VLLM_EXT_SRC})
     if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
         set(VLLM_EXT_SRC
@@ -1,53 +0,0 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels

set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")

# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
  message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
  FetchContent_Declare(
    triton_kernels
    SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
  )

else()
  set(TRITON_GIT "https://github.com/triton-lang/triton.git")
  message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
  FetchContent_Declare(
    triton_kernels
    # TODO (varun) : Fetch just the triton_kernels directory from Triton
    GIT_REPOSITORY https://github.com/triton-lang/triton.git
    GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
    GIT_PROGRESS TRUE
    SOURCE_SUBDIR python/triton_kernels/triton_kernels
  )
endif()

# Fetch content
FetchContent_MakeAvailable(triton_kernels)

if (NOT triton_kernels_SOURCE_DIR)
  message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
endif()

if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
else()
  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
endif()

message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")

add_custom_target(triton_kernels)

# Ensure the vllm/third_party directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")

## Copy .py files to install directory.
install(DIRECTORY
  ${TRITON_KERNELS_PYTHON_DIR}
  DESTINATION
  vllm/third_party/triton_kernels/
  COMPONENT triton_kernels
  FILES_MATCHING PATTERN "*.py")
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
+    GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -140,21 +140,16 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
   run_python(_VLLM_TORCH_GOMP_PATH
     "
 import os, glob
-import torch
-torch_pkg = os.path.dirname(torch.__file__)
-site_root = os.path.dirname(torch_pkg)
-
-# Search both torch.libs and torch/lib
-roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
-candidates = []
-for root in roots:
-    if not os.path.isdir(root):
-        continue
-    candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
-
-print(candidates[0] if candidates else '')
+try:
+    import torch
+    torch_pkg = os.path.dirname(torch.__file__)
+    site_root = os.path.dirname(torch_pkg)
+    torch_libs = os.path.join(site_root, 'torch.libs')
+    print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
+except:
+    print('')
 "
-    "failed to probe for libgomp")
+    "failed to probe torch.libs for libgomp")
 
   if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
     return()
@@ -500,13 +495,7 @@ function (define_extension_target MOD_NAME)
     set(SOABI_KEYWORD "")
   endif()
 
-  run_python(IS_FREETHREADED_PYTHON
-    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
-    "Failed to determine whether interpreter is free-threaded")
-
-  # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
-  # so avoid using the stable ABI under free-threading only.
-  if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
+  if (ARG_USE_SABI)
     Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
   else()
     Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
@@ -16,8 +16,7 @@ __global__ void merge_attn_states_kernel(
     scalar_t* output, float* output_lse, const scalar_t* prefix_output,
     const float* prefix_lse, const scalar_t* suffix_output,
     const float* suffix_lse, const uint num_tokens, const uint num_heads,
-    const uint head_size, const uint prefix_head_stride,
-    const uint output_head_stride) {
+    const uint head_size) {
   using pack_128b_t = uint4;
   const uint pack_size = 16 / sizeof(scalar_t);
   const uint threads_per_head = head_size / pack_size;
@@ -35,13 +34,11 @@ __global__ void merge_attn_states_kernel(
   const uint head_idx = token_head_idx % num_heads;
 
   const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
-  const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
-                               head_idx * prefix_head_stride;
-  const uint dst_head_offset = token_idx * num_heads * output_head_stride +
-                               head_idx * output_head_stride;
-  const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
-  const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
-  scalar_t* output_head_ptr = output + dst_head_offset;
+  const uint head_offset =
+      token_idx * num_heads * head_size + head_idx * head_size;
+  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
+  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
+  scalar_t* output_head_ptr = output + head_offset;
 
   float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
   float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -143,7 +140,7 @@ __global__ void merge_attn_states_kernel(
         reinterpret_cast<float*>(prefix_lse.data_ptr()),                \
         reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),          \
         reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,    \
-        num_heads, head_size, prefix_head_stride, output_head_stride);  \
+        num_heads, head_size);                                           \
   }
 
 /*@brief Merges the attention states from prefix and suffix
@@ -169,11 +166,17 @@ void merge_attn_states_launcher(torch::Tensor& output,
   const uint num_tokens = output.size(0);
   const uint num_heads = output.size(1);
   const uint head_size = output.size(2);
-  const uint prefix_head_stride = prefix_output.stride(1);
-  const uint output_head_stride = output.stride(1);
   const uint pack_size = 16 / sizeof(scalar_t);
   TORCH_CHECK(head_size % pack_size == 0,
               "headsize must be multiple of pack_size:", pack_size);
+  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
+              "output heads must be contiguous in memory");
+  TORCH_CHECK(
+      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
+      "prefix_output heads must be contiguous in memory");
+  TORCH_CHECK(
+      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
+      "suffix_output heads must be contiguous in memory");
   float* output_lse_ptr = nullptr;
   if (output_lse.has_value()) {
     output_lse_ptr = output_lse.value().data_ptr<float>();
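For context on the kernel above: merging prefix and suffix attention states is the standard log-sum-exp rescaling used when attention is computed in two chunks. The sketch below is an illustrative pure-PyTorch restatement of that formula (my own reference, not the CUDA kernel; shapes follow the comments in the code above, with LSE tensors laid out as [heads, tokens]):

# Hedged reference for merge_attn_states: combine two partial attention outputs
# using their log-sum-exp values so the result equals attention over both chunks.
import torch

def merge_attn_states_ref(p_out, p_lse, s_out, s_lse):
    # p_out, s_out: [tokens, heads, head_size]; p_lse, s_lse: [heads, tokens]
    max_lse = torch.maximum(p_lse, s_lse)
    p = torch.exp(p_lse - max_lse)
    s = torch.exp(s_lse - max_lse)
    denom = p + s
    # broadcast per-(head, token) weights over head_size: -> [tokens, heads, 1]
    p_w = (p / denom).transpose(0, 1).unsqueeze(-1)
    s_w = (s / denom).transpose(0, 1).unsqueeze(-1)
    out = p_out * p_w + s_out * s_w
    out_lse = max_lse + torch.log(denom)
    return out, out_lse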
21  csrc/cache.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <torch/all.h>
-#include <c10/util/Optional.h>
 
 #include <map>
 #include <vector>
@@ -42,12 +41,11 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                  const double scale, const std::string& kv_cache_dtype);
 
 void gather_and_maybe_dequant_cache(
     torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
     torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
     torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
-    int64_t num_tokens, const std::string& kv_cache_dtype,
+    int64_t batch_size, const std::string& kv_cache_dtype,
     torch::Tensor const& scale,
     std::optional<torch::Tensor> seq_starts = std::nullopt);
 
@@ -59,15 +57,6 @@ void cp_gather_cache(
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
     int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
 
-// Gather and upconvert FP8 KV cache to BF16 workspace
-void cp_gather_and_upconvert_fp8_kv_cache(
-    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
-    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
-    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& seq_lens,          // [BATCH]
-    torch::Tensor const& workspace_starts,  // [BATCH]
-    int64_t batch_size);
-
 // Indexer K quantization and cache function
 void indexer_k_quant_and_cache(
     torch::Tensor& k,  // [num_tokens, head_dim]
@@ -2,7 +2,6 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include <c10/cuda/CUDAException.h>
|
#include <c10/cuda/CUDAException.h>
|
||||||
#include <c10/util/Optional.h>
|
|
||||||
|
|
||||||
#include "cuda_utils.h"
|
#include "cuda_utils.h"
|
||||||
#include "cuda_compat.h"
|
#include "cuda_compat.h"
|
||||||
@@ -515,8 +514,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
const int quant_block_size, // quantization block size
|
const int quant_block_size, // quantization block size
|
||||||
const int cache_block_size, // cache block size
|
const int cache_block_size, // cache block size
|
||||||
const int cache_stride, // stride for each token in kv_cache
|
const int cache_stride, // stride for each token in kv_cache
|
||||||
|
const bool use_ue8m0 // use ue8m0 scale format
|
||||||
const bool use_ue8m0 // use ue8m0 scale format
|
|
||||||
) {
|
) {
|
||||||
constexpr int VEC_SIZE = 4;
|
constexpr int VEC_SIZE = 4;
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
@@ -554,11 +552,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
__syncwarp();
|
__syncwarp();
|
||||||
#endif
|
#endif
|
||||||
#if defined(__gfx942__)
|
|
||||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
|
||||||
#else
|
|
||||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
||||||
#endif
|
|
||||||
if (use_ue8m0) {
|
if (use_ue8m0) {
|
||||||
scale = exp2f(ceilf(log2f(scale)));
|
scale = exp2f(ceilf(log2f(scale)));
|
||||||
}
|
}
|
||||||
@@ -907,80 +901,87 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
|||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// grid is launched with dimensions (batch, num_splits)
|
// grid is launched with dimensions (batch, num_splits)
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
int ENTRY_SIZE, int CTA_SIZE>
|
|
||||||
__global__ void gather_and_maybe_dequant_cache(
|
__global__ void gather_and_maybe_dequant_cache(
|
||||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||||
// ENTRIES...]
|
// ENTRIES...]
|
||||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||||
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
|
const int32_t block_size, const int32_t entry_size,
|
||||||
const int32_t num_tokens, const int32_t block_size,
|
|
||||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
||||||
const float* __restrict__ scale,
|
const float* __restrict__ scale,
|
||||||
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
||||||
// batch
|
// batch
|
||||||
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
|
|
||||||
using ltype = vllm::vec_n_t<cache_t, vec_size>;
|
|
||||||
using stype = vllm::vec_n_t<scalar_t, vec_size>;
|
|
||||||
// We are adding this for code readability which will be optimized out when
|
|
||||||
// build in release.
|
|
||||||
assert(CTA_SIZE == blockDim.x);
|
|
||||||
|
|
||||||
#pragma unroll
|
const int64_t bid = blockIdx.x; // Batch ID
|
||||||
for (int token_id = blockIdx.x; token_id < num_tokens;
|
const int32_t num_splits = gridDim.y;
|
||||||
token_id += gridDim.x) {
|
const int32_t split = blockIdx.y;
|
||||||
int64_t batch_id = token_to_seq[token_id];
|
const int32_t seq_start = cu_seq_lens[bid];
|
||||||
int64_t batch_start = cu_seq_lens[batch_id];
|
const int32_t seq_end = cu_seq_lens[bid + 1];
|
||||||
int64_t batch_end = cu_seq_lens[batch_id + 1];
|
const int32_t seq_len = seq_end - seq_start;
|
||||||
int32_t batch_offset = token_id - batch_start;
|
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
|
||||||
|
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
|
||||||
|
|
||||||
if (token_id >= batch_end) return;
|
const int32_t split_start = split * split_blocks;
|
||||||
int32_t offset = 0;
|
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
|
||||||
if (seq_starts != nullptr) {
|
|
||||||
offset = seq_starts[batch_id];
|
|
||||||
}
|
|
||||||
batch_offset += offset;
|
|
||||||
int32_t block_table_id = batch_offset / block_size;
|
|
||||||
int32_t slot_id = batch_offset % block_size;
|
|
||||||
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
|
|
||||||
int32_t block_id = block_table[block_table_offset];
|
|
||||||
int64_t cache_offset =
|
|
||||||
block_id * cache_block_stride + slot_id * cache_entry_stride;
|
|
||||||
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
|
|
||||||
scalar_t* dst_ = dst + token_id * dst_entry_stride;
|
|
||||||
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
|
|
||||||
|
|
||||||
#pragma unroll
|
const bool is_active_split = (split_start < tot_blocks);
|
||||||
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
|
const bool is_last_split = (split_end == tot_blocks);
|
||||||
|
|
||||||
|
if (!is_active_split) return;
|
||||||
|
|
||||||
|
int32_t full_blocks_end = split_end;
|
||||||
|
int32_t partial_block_size = 0;
|
||||||
|
|
||||||
|
// Adjust the pointer for the block_table for this batch.
|
||||||
|
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
|
||||||
|
// page_size)
|
||||||
|
const int32_t batch_offset = bid * block_table_stride;
|
||||||
|
int32_t offset = 0;
|
||||||
|
if (seq_starts != nullptr) {
|
||||||
|
offset = seq_starts[bid] / block_size;
|
||||||
|
}
|
||||||
|
const int32_t* batch_block_table = block_table + batch_offset + offset;
|
||||||
|
|
||||||
|
// Adjust dst pointer based on the cumulative sequence lengths.
|
||||||
|
dst += seq_start * dst_entry_stride;
|
||||||
|
|
||||||
|
if (is_last_split) {
|
||||||
|
partial_block_size = seq_len % block_size;
|
||||||
|
if (partial_block_size) full_blocks_end -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto copy_entry = [&](const cache_t* __restrict__ _src,
|
||||||
|
scalar_t* __restrict__ _dst) {
|
||||||
|
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
reinterpret_cast<stype*>(dst_)[idx] =
|
_dst[i] = static_cast<scalar_t>(_src[i]);
|
||||||
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
|
|
||||||
} else {
|
} else {
|
||||||
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
|
_dst[i] =
|
||||||
stype store_val;
|
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < vec_size; ++j) {
|
|
||||||
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
|
|
||||||
loaded_val.val[j], *scale);
|
|
||||||
}
|
|
||||||
reinterpret_cast<stype*>(dst_)[idx] = store_val;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// process tail
|
};
|
||||||
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
|
|
||||||
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
|
for (int pid = split_start; pid < full_blocks_end; ++pid) {
|
||||||
src_ = src_ + ENTRY_SIZE - tail_cnt;
|
auto block_id = batch_block_table[pid];
|
||||||
#pragma unroll
|
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||||
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
|
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
for (int eid = 0; eid < block_size; ++eid) {
|
||||||
dst_[idx] = static_cast<scalar_t>(src_[idx]);
|
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||||
} else {
|
block_dst_ptr + eid * dst_entry_stride);
|
||||||
dst_[idx] =
|
}
|
||||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
|
}
|
||||||
}
|
|
||||||
|
if (partial_block_size) {
|
||||||
|
auto block_id = batch_block_table[full_blocks_end];
|
||||||
|
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||||
|
auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
|
||||||
|
for (int eid = 0; eid < partial_block_size; ++eid) {
|
||||||
|
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||||
|
block_dst_ptr + eid * dst_entry_stride);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -991,38 +992,34 @@ __global__ void gather_and_maybe_dequant_cache(
 // SCALAR_T is the data type of the destination tensor.
 // CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                        \
-  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576,      \
-                                       thread_block_size>                     \
-      <<<grid, block, 0, stream>>>(                                           \
-          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
-          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                        \
-          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),   \
-          token_to_seq.data_ptr<int32_t>(), num_tokens, block_size,           \
-          block_table_stride, cache_block_stride, cache_entry_stride,         \
-          dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
-          seq_starts_ptr);
+  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE>           \
+      <<<grid, block, 0, stream>>>(                                           \
+          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
+          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                        \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),   \
+          block_size, entry_size, block_table_stride, cache_block_stride,     \
+          cache_entry_stride, dst_entry_stride,                               \
+          reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);

 // Gather sequences from the cache into the destination tensor.
 // - cu_seq_lens contains the cumulative sequence lengths for each batch
 // - block_table contains the cache block indices for each sequence
-// - token_to_seq contains the back mapping from token_id to batch_id
 // - Optionally, seq_starts (if provided) offsets the starting block index by
 //   (seq_starts[bid] / page_size)
 void gather_and_maybe_dequant_cache(
     torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
     torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
     torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
-    int64_t num_tokens, const std::string& kv_cache_dtype,
+    int64_t batch_size, const std::string& kv_cache_dtype,
     torch::Tensor const& scale,
     std::optional<torch::Tensor> seq_starts = std::nullopt) {
   at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   int32_t block_size = src_cache.size(1);
-  int32_t head_dim = dst.size(-1);
+  int32_t entry_size = src_cache.flatten(2, -1).size(2);

   TORCH_CHECK(block_table.dtype() == torch::kInt32,
               "block_table must be int32");
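The comments above describe how the wrapper's tensors are interpreted: `cu_seq_lens` gives each sequence's row range in `dst`, `block_table` lists the cache blocks belonging to each sequence, and `seq_starts`, if present, shifts the starting block index by `seq_starts[bid] / page_size`. The following is a rough host-side C++ sketch of that addressing under that reading; it is not vLLM's implementation, `page_size` is taken to equal the cache block size, and all names besides those quoted from the diff are illustrative.

```cpp
#include <cstdint>
#include <utility>

// Illustrative sketch: map a destination row `tok` (0 <= tok < cu_seq_lens[batch_size])
// back to a (cache block id, entry within block) pair, following the comments in the
// diff: cu_seq_lens gives per-sequence row ranges, block_table holds the block ids of
// each sequence, and seq_starts (optional) shifts the starting block index by whole
// blocks, i.e. by seq_starts[bid] / block_size.
std::pair<int32_t, int32_t> locate_cache_entry(
    int64_t tok, const int32_t* cu_seq_lens, int64_t batch_size,
    const int32_t* block_table, int64_t block_table_stride, int32_t block_size,
    const int32_t* seq_starts /* may be nullptr */) {
  // Find the sequence this row belongs to (linear scan for clarity).
  int64_t bid = 0;
  while (bid + 1 < batch_size && tok >= cu_seq_lens[bid + 1]) ++bid;

  const int64_t local = tok - cu_seq_lens[bid];       // row within the sequence
  const int64_t block_offset =
      seq_starts ? seq_starts[bid] / block_size : 0;  // whole-block shift
  const int32_t block_id =
      block_table[bid * block_table_stride + block_offset + local / block_size];
  const int32_t entry_in_block = static_cast<int32_t>(local % block_size);
  return {block_id, entry_in_block};
}
```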
@@ -1032,9 +1029,6 @@ void gather_and_maybe_dequant_cache(
     TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                 "seq_starts must be int32");
   }
-  TORCH_CHECK(head_dim == 576,
-              "gather_and_maybe_dequant_cache only support the head_dim to 576 "
-              "for better performance")

   TORCH_CHECK(src_cache.device() == dst.device(),
               "src_cache and dst must be on the same device");
@@ -1052,9 +1046,10 @@ void gather_and_maybe_dequant_cache(
   int64_t cache_entry_stride = src_cache.stride(1);
   int64_t dst_entry_stride = dst.stride(0);

-  constexpr int32_t thread_block_size = 64;
-  dim3 grid(num_tokens);
-  dim3 block(thread_block_size);
+  // Decide on the number of splits based on the batch size.
+  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+  dim3 grid(batch_size, num_splits);
+  dim3 block(1024);

   const int32_t* seq_starts_ptr =
       seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
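The replacement launch configuration splits each sequence over `num_splits` thread blocks, using fewer splits when the batch itself is large. Here is a small host-side C++ sketch of that heuristic and of the token range a given (batch, split) block would cover, mirroring the `ceil_div` arithmetic visible in the removed kernel further down; the function names are illustrative.

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>

// Same batch-size heuristic as in the diff: large batches already expose enough
// parallelism across sequences, so each sequence gets fewer splits.
inline int pick_num_splits(int64_t batch_size) {
  return batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
}

// Token range [start, end) handled by a given (split, num_splits) pair for a
// sequence of `seq_len` tokens; an empty range means the split is inactive.
inline std::pair<int32_t, int32_t> split_range(int32_t seq_len, int32_t split,
                                               int32_t num_splits) {
  const int32_t split_slots = (seq_len + num_splits - 1) / num_splits;  // ceil_div
  const int32_t start = split * split_slots;
  const int32_t end = std::min((split + 1) * split_slots, seq_len);
  return {start, std::max(start, end)};
}
```

Fewer splits per sequence are presumably acceptable at large batch sizes because the batch dimension of the grid already supplies enough thread blocks to keep the GPU busy.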
@@ -1063,82 +1058,6 @@ void gather_and_maybe_dequant_cache(
 }

 namespace vllm {
-
-// Gather and upconvert FP8 KV cache tokens to BF16 workspace
-// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
-__global__ void cp_gather_and_upconvert_fp8_kv_cache(
-    const uint8_t* __restrict__ src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
-    __nv_bfloat16* __restrict__ dst,               // [TOT_TOKENS, 576]
-    const int32_t* __restrict__ block_table,       // [BATCH, BLOCK_INDICES]
-    const int32_t* __restrict__ seq_lens,          // [BATCH]
-    const int32_t* __restrict__ workspace_starts,  // [BATCH]
-    const int32_t block_size, const int32_t head_dim,
-    const int64_t block_table_stride, const int64_t cache_block_stride,
-    const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
-  const int64_t bid = blockIdx.x;  // Batch ID
-  const int32_t num_splits = gridDim.y;
-  const int32_t split = blockIdx.y;
-  const int32_t seq_start = workspace_starts[bid];
-  const int32_t seq_len = seq_lens[bid];
-  const int32_t tot_slots = seq_len;
-  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
-
-  const int32_t split_start = split * split_slots;
-  const int32_t split_end = min((split + 1) * split_slots, tot_slots);
-
-  const bool is_active_split = (split_start < tot_slots);
-
-  if (!is_active_split) return;
-
-  // Adjust the pointer for the block_table for this batch
-  const int32_t batch_offset = bid * block_table_stride;
-  int32_t offset = split_start;
-  int32_t offset_div = offset / block_size;
-  offset = offset % block_size;
-  const int32_t* batch_block_table = block_table + batch_offset;
-
-  // Adjust dst pointer based on the cumulative sequence lengths
-  dst += seq_start * dst_entry_stride;
-
-  const int tid = threadIdx.x;
-
-  // Process each token in this split
-  for (int pid = split_start; pid < split_end; ++pid) {
-    auto block_id = batch_block_table[offset_div];
-    const uint8_t* token_ptr =
-        src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
-    __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
-
-    // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
-    const uint8_t* no_pe_ptr = token_ptr;
-    const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
-    const __nv_bfloat16* rope_ptr =
-        reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
-
-    // Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
-    if (tid < 512) {
-      // FP8 dequantization
-      const int tile = tid >> 7;  // each tile is 128 elements
-      const float scale = scales_ptr[tile];
-      const uint8_t val = no_pe_ptr[tid];
-      dst_ptr[tid] =
-          fp8::scaled_convert<__nv_bfloat16, uint8_t,
-                              vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
-    } else if (tid < 576) {
-      // Rope copy (64 bf16 elements)
-      const int rope_idx = tid - 512;
-      dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
-    }
-
-    // Move to next token
-    offset += 1;
-    if (offset == block_size) {
-      offset_div += 1;
-      offset = 0;
-    }
-  }
-}
-
 template <typename scalar_t>
 // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
 // block_size.
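The kernel removed above assumes a fixed 656-byte cache entry per token: 512 one-byte FP8 values, then four float32 scales (16 bytes, one scale per 128-element tile), then 64 bf16 RoPE values (128 bytes). The following C++ sketch captures just that layout arithmetic; the FP8 numeric conversion itself is delegated to `fp8::scaled_convert` in the real kernel and is not reproduced here, and the bf16 values are exposed only as raw 16-bit words.

```cpp
#include <cstdint>

// Byte layout of one 656-byte MLA cache entry as read by the removed kernel.
struct Fp8MlaEntryView {
  const uint8_t* fp8_vals;    // 512 FP8 (e4m3) bytes, output elements 0..511
  const float* scales;        // 4 float32 scales, one per 128-element tile
  const uint16_t* rope_bf16;  // 64 bf16 values (raw bits), output elements 512..575
};

inline Fp8MlaEntryView view_entry(const uint8_t* token_ptr) {
  Fp8MlaEntryView v;
  v.fp8_vals = token_ptr;                                                 // offset 0
  v.scales = reinterpret_cast<const float*>(token_ptr + 512);             // offset 512
  v.rope_bf16 = reinterpret_cast<const uint16_t*>(token_ptr + 512 + 16);  // offset 528
  return v;
}

// Which of the four scales applies to output element `i` (0..511):
// one scale per 128-element tile, matching `tile = tid >> 7` in the kernel.
inline int scale_tile(int i) { return i >> 7; }
```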
@@ -1280,57 +1199,6 @@ void cp_gather_cache(
   }
 }

-void cp_gather_and_upconvert_fp8_kv_cache(
-    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
-    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
-    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& seq_lens,          // [BATCH]
-    torch::Tensor const& workspace_starts,  // [BATCH]
-    int64_t batch_size) {
-  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  int32_t block_size = src_cache.size(1);
-  int32_t head_dim = dst.size(1);
-
-  TORCH_CHECK(block_table.dtype() == torch::kInt32,
-              "block_table must be int32");
-  TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
-  TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
-              "workspace_starts must be int32");
-
-  TORCH_CHECK(src_cache.device() == dst.device(),
-              "src_cache and dst must be on the same device");
-  TORCH_CHECK(src_cache.device() == block_table.device(),
-              "src_cache and block_table must be on the same device");
-  TORCH_CHECK(src_cache.device() == seq_lens.device(),
-              "src_cache and seq_lens must be on the same device");
-  TORCH_CHECK(src_cache.device() == workspace_starts.device(),
-              "src_cache and workspace_starts must be on the same device");
-
-  TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
-  TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
-  TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
-
-  int64_t block_table_stride = block_table.stride(0);
-  int64_t cache_block_stride = src_cache.stride(0);
-  int64_t cache_entry_stride = src_cache.stride(1);
-  int64_t dst_entry_stride = dst.stride(0);
-
-  // Decide on the number of splits based on the batch size
-  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
-  dim3 grid(batch_size, num_splits);
-  dim3 block(576);
-
-  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
-      src_cache.data_ptr<uint8_t>(),
-      reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
-      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
-      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
-      block_table_stride, cache_block_stride, cache_entry_stride,
-      dst_entry_stride);
-}
-
 // Macro to dispatch the kernel based on the data type.
 #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
   vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
Some files were not shown because too many files have changed in this diff.