Compare commits
1162 Commits
v0.15.1rc0
...
v0.17.0rc0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
097eb544e9 | ||
|
|
7cdba98edf | ||
|
|
3c85cd9d74 | ||
|
|
edba15045a | ||
|
|
e379396167 | ||
|
|
6e9f21e8a2 | ||
|
|
c1d963403c | ||
|
|
77e6dcbbfa | ||
|
|
70c73df69e | ||
|
|
9a9d442464 | ||
|
|
f7da9cdffc | ||
|
|
f22ff2958c | ||
|
|
d15c3b90fc | ||
|
|
97286a20ed | ||
|
|
12b38c0f45 | ||
|
|
467886a0c4 | ||
|
|
a9b8b13e5c | ||
|
|
e7213003cb | ||
|
|
3a8eef5869 | ||
|
|
97995f6376 | ||
|
|
881a6b011b | ||
|
|
8e1fd5baf0 | ||
|
|
ae88468bcc | ||
|
|
e05cb3b93e | ||
|
|
28ef9ba399 | ||
|
|
fb7fdc49c4 | ||
|
|
ea463978bb | ||
|
|
440f0e7dc6 | ||
|
|
fd4a90f337 | ||
|
|
ad9d09e2b8 | ||
|
|
4beebfd146 | ||
|
|
b8401cde0e | ||
|
|
5dfc5abe94 | ||
|
|
8fa68a8ce4 | ||
|
|
35a6f0bfe2 | ||
|
|
3a6cbf16e2 | ||
|
|
f44d1ddc8c | ||
|
|
48a54c1e0d | ||
|
|
8b9e8b7454 | ||
|
|
c21d0039ec | ||
|
|
7d8bbe6f42 | ||
|
|
25e02647c2 | ||
|
|
a0a5178ab4 | ||
|
|
8ea8ba275e | ||
|
|
4f85bae9d6 | ||
|
|
0a7165fd71 | ||
|
|
6521ccf286 | ||
|
|
8ebd872f50 | ||
|
|
168ee03e1c | ||
|
|
9dd656f0ea | ||
|
|
c8b678e53e | ||
|
|
18c29c746b | ||
|
|
96fc09503a | ||
|
|
1b82b433fc | ||
|
|
9319044ee9 | ||
|
|
c42dc402c1 | ||
|
|
fa6a6be519 | ||
|
|
cad21918e3 | ||
|
|
53700bf49b | ||
|
|
a13d8c03c9 | ||
|
|
9433acb8df | ||
|
|
d1a6e96d9e | ||
|
|
2a9e3347e9 | ||
|
|
cc0d565f40 | ||
|
|
358e4d5ba7 | ||
|
|
792a74b973 | ||
|
|
4034c3d32e | ||
|
|
7560d674c9 | ||
|
|
d9c7730877 | ||
|
|
ada4f4fadd | ||
|
|
7e9149d9a9 | ||
|
|
87c98b0236 | ||
|
|
de7dd634b9 | ||
|
|
9a87b0578f | ||
|
|
510bc9e1df | ||
|
|
cbd361fd46 | ||
|
|
c212202d93 | ||
|
|
ec27b36b4b | ||
|
|
3fd1d4ec2c | ||
|
|
cb21972a97 | ||
|
|
c34963f138 | ||
|
|
f26650d649 | ||
|
|
92f5d0f070 | ||
|
|
a60985b07e | ||
|
|
8b5014d3dd | ||
|
|
57a96e26c9 | ||
|
|
e82fbeec7b | ||
|
|
6290470843 | ||
|
|
72f4d16262 | ||
|
|
5a435507d8 | ||
|
|
59d7af9c6c | ||
|
|
bbf81f9a92 | ||
|
|
da543d1abe | ||
|
|
87d319c52f | ||
|
|
a9ec392c86 | ||
|
|
afd089f231 | ||
|
|
3ecd0bf9fc | ||
|
|
e3eb146f7a | ||
|
|
95a395dbec | ||
|
|
e94b263bd6 | ||
|
|
e113a30113 | ||
|
|
1dafb29f91 | ||
|
|
49b9ae32e9 | ||
|
|
63d7972f13 | ||
|
|
c68e69f144 | ||
|
|
7e08c22b8c | ||
|
|
8e75d88554 | ||
|
|
0892d1ab1f | ||
|
|
7600642eae | ||
|
|
1e69c04887 | ||
|
|
4292e3b807 | ||
|
|
24d6ea8afd | ||
|
|
57c86c0741 | ||
|
|
06254d4cbb | ||
|
|
f5d1281c9d | ||
|
|
94029ffaf0 | ||
|
|
88e8525f2e | ||
|
|
b2d8b422b2 | ||
|
|
1d5ab5d603 | ||
|
|
7b346ba8ed | ||
|
|
dea268336f | ||
|
|
90805ff464 | ||
|
|
2562e0271e | ||
|
|
fd68cd132b | ||
|
|
0edf101d2b | ||
|
|
d5b6f3ba36 | ||
|
|
1a014a0a93 | ||
|
|
86ac7bcf84 | ||
|
|
405f28d38d | ||
|
|
5323672bc2 | ||
|
|
a201ad72d8 | ||
|
|
e3691988d0 | ||
|
|
9fa6c68fa6 | ||
|
|
2ce6f3cf67 | ||
|
|
1f3dbd95fd | ||
|
|
1d532f9d8f | ||
|
|
234a65b781 | ||
|
|
2decec9856 | ||
|
|
29b35477b0 | ||
|
|
b1d9f5372d | ||
|
|
fd6de37fca | ||
|
|
c8aca0c9e1 | ||
|
|
b602e4f299 | ||
|
|
157722da75 | ||
|
|
1d897ff04f | ||
|
|
905d76b51d | ||
|
|
9098ce690c | ||
|
|
876312f0b5 | ||
|
|
5de98abc12 | ||
|
|
9251ed5c4f | ||
|
|
e8249378e4 | ||
|
|
6d4f9d3ad5 | ||
|
|
fbe3f0120a | ||
|
|
66c1751d13 | ||
|
|
6467b635b6 | ||
|
|
9c3fe9936b | ||
|
|
b66a74649e | ||
|
|
07bdabef03 | ||
|
|
a572baff5e | ||
|
|
516cf26698 | ||
|
|
487e5c51f7 | ||
|
|
1a8c71674e | ||
|
|
062b789632 | ||
|
|
a532c83849 | ||
|
|
1e5ad9b74f | ||
|
|
cabdaa7619 | ||
|
|
06be53563b | ||
|
|
c29ee9c326 | ||
|
|
d43048ce05 | ||
|
|
4fec53cfcb | ||
|
|
38c498b8e3 | ||
|
|
56a6371706 | ||
|
|
6283021142 | ||
|
|
01923eec70 | ||
|
|
31fb6f43da | ||
|
|
eb19955c37 | ||
|
|
0f2f24c8b2 | ||
|
|
d0105b84f0 | ||
|
|
832a780f3a | ||
|
|
98217b09f9 | ||
|
|
967572dd5f | ||
|
|
3d66502e1b | ||
|
|
c66aa48e99 | ||
|
|
b6d5a17298 | ||
|
|
5e58bdc711 | ||
|
|
a1f53addb1 | ||
|
|
05970c772c | ||
|
|
d940607629 | ||
|
|
99c7892c5b | ||
|
|
ec8f943db1 | ||
|
|
f2ad952f40 | ||
|
|
9e2cabdf9c | ||
|
|
ec8ab9d254 | ||
|
|
05972ea7e5 | ||
|
|
111d869069 | ||
|
|
7fea7250a4 | ||
|
|
845ee348ef | ||
|
|
ec13e549d3 | ||
|
|
c6ca51598a | ||
|
|
c0615a296d | ||
|
|
01914445b0 | ||
|
|
5281713e11 | ||
|
|
32693db8ce | ||
|
|
e03ddcfbd4 | ||
|
|
02acd16861 | ||
|
|
ab87f85231 | ||
|
|
3827c8c55a | ||
|
|
ade81f17fe | ||
|
|
6042e66cd5 | ||
|
|
9f9a675b23 | ||
|
|
a07c4c5939 | ||
|
|
d3a51da92a | ||
|
|
186ea22efe | ||
|
|
4a9c07a0a2 | ||
|
|
9d37941017 | ||
|
|
4171ff6dd9 | ||
|
|
13025e71e8 | ||
|
|
71dfce6aa6 | ||
|
|
2aa4140402 | ||
|
|
86c3b5a808 | ||
|
|
160424a937 | ||
|
|
9511a3f8ee | ||
|
|
de527e1cec | ||
|
|
1976356ee6 | ||
|
|
cbf8f7028c | ||
|
|
6831650c40 | ||
|
|
ed42507f6d | ||
|
|
9571e99945 | ||
|
|
c97234c08b | ||
|
|
b188bab441 | ||
|
|
15d76f74e2 | ||
|
|
8fd6975479 | ||
|
|
5d18bf8b32 | ||
|
|
0788ff0a15 | ||
|
|
d72b0be33c | ||
|
|
42489e43c2 | ||
|
|
af5e6afa0a | ||
|
|
ee59a7c615 | ||
|
|
709eadbb0b | ||
|
|
90fc7f9109 | ||
|
|
675ec59aa9 | ||
|
|
80e60a6133 | ||
|
|
26e722f906 | ||
|
|
2c619e5e3f | ||
|
|
8a685be8d9 | ||
|
|
2465071510 | ||
|
|
cd43673668 | ||
|
|
35d44b4557 | ||
|
|
8ad54a991b | ||
|
|
92510edc32 | ||
|
|
a6c137521c | ||
|
|
4572a06afe | ||
|
|
5cc29cfb8b | ||
|
|
8fae54faff | ||
|
|
f7967577f5 | ||
|
|
af770b8e7b | ||
|
|
2ff3e436ad | ||
|
|
c2c4c4611a | ||
|
|
f38f8c9742 | ||
|
|
ec1d30c0f6 | ||
|
|
e3b2324ec4 | ||
|
|
dbf0da817a | ||
|
|
3bbb2046ff | ||
|
|
576fe50333 | ||
|
|
a0e50a4260 | ||
|
|
9fa5b25a23 | ||
|
|
ea97750414 | ||
|
|
067c5d9ad1 | ||
|
|
f5972a872f | ||
|
|
a9e15e040d | ||
|
|
542ca66357 | ||
|
|
fc8456c336 | ||
|
|
9ce8fad2a9 | ||
|
|
c38b8d5a31 | ||
|
|
60da0e1544 | ||
|
|
9609b1f18d | ||
|
|
a0c7081695 | ||
|
|
34ce0ffd1f | ||
|
|
0de5333989 | ||
|
|
a87cc50859 | ||
|
|
761e63e541 | ||
|
|
d12d201409 | ||
|
|
b3ad37c5db | ||
|
|
14561fabfd | ||
|
|
c77f3e1207 | ||
|
|
012dee9233 | ||
|
|
f1c664545b | ||
|
|
c870eb9e0f | ||
|
|
6af03f2394 | ||
|
|
1a6cf39dec | ||
|
|
f91808ae0d | ||
|
|
33a0d43c71 | ||
|
|
80d93fd6da | ||
|
|
ec85340531 | ||
|
|
2ff4e51152 | ||
|
|
95642441d0 | ||
|
|
a7c9f7b7ec | ||
|
|
a4bd661fb3 | ||
|
|
3ef9fd0f98 | ||
|
|
22a97e6613 | ||
|
|
596ed1f02e | ||
|
|
b8d8b7e934 | ||
|
|
28c5e69ba0 | ||
|
|
864167d376 | ||
|
|
a2ba6a5244 | ||
|
|
c4f38696f7 | ||
|
|
a7f341c323 | ||
|
|
d13ece38d7 | ||
|
|
5cc7c4452e | ||
|
|
b95bb6927f | ||
|
|
392645454b | ||
|
|
1e8438a89a | ||
|
|
8435b2e049 | ||
|
|
b1b5e045df | ||
|
|
5f68464f92 | ||
|
|
aa08a30fc9 | ||
|
|
7f40e9e516 | ||
|
|
103e614b14 | ||
|
|
54e2f83d0a | ||
|
|
e631f8e78e | ||
|
|
e97c46a92d | ||
|
|
7291d1b288 | ||
|
|
987506bca6 | ||
|
|
c645e9a214 | ||
|
|
944ffb5968 | ||
|
|
2bcf71b9c0 | ||
|
|
b7892a3bef | ||
|
|
682566b18e | ||
|
|
b9c2a565cc | ||
|
|
dd8c3a7fb2 | ||
|
|
a8a47c17b6 | ||
|
|
40f88d8318 | ||
|
|
2cbf9656ce | ||
|
|
30132cd144 | ||
|
|
cbd95a2dd1 | ||
|
|
970861ac0c | ||
|
|
d24bdd7c4b | ||
|
|
d403c1da1c | ||
|
|
b71fbd06e2 | ||
|
|
74d90b1ce4 | ||
|
|
a4047d4ea9 | ||
|
|
965fe45935 | ||
|
|
98b0205c3c | ||
|
|
272b535ab3 | ||
|
|
f74f1572ca | ||
|
|
bebfe55b1c | ||
|
|
820d7815eb | ||
|
|
ab6f3487a6 | ||
|
|
8dc8a99b56 | ||
|
|
2aab2bb543 | ||
|
|
54254f7a61 | ||
|
|
cf93c1a128 | ||
|
|
89358f0d35 | ||
|
|
a0fe7ea2f0 | ||
|
|
991d6bff38 | ||
|
|
5719a4e4e6 | ||
|
|
11be2c74dc | ||
|
|
7a5adad480 | ||
|
|
59c6233297 | ||
|
|
d38cd3dde5 | ||
|
|
ded333fb9b | ||
|
|
9d7577b2bd | ||
|
|
e739c29ea4 | ||
|
|
a55caf6ae9 | ||
|
|
0e22cd618b | ||
|
|
ea5f903f80 | ||
|
|
0632ed8778 | ||
|
|
aaefc58ee0 | ||
|
|
f24b2de3d3 | ||
|
|
fac1507f03 | ||
|
|
f863994084 | ||
|
|
e4a5d8c653 | ||
|
|
a6d0299c75 | ||
|
|
6ce80f7071 | ||
|
|
1fe462168c | ||
|
|
ed31a020ee | ||
|
|
f9ac19204f | ||
|
|
59965affbd | ||
|
|
b1c4f0b265 | ||
|
|
8de7c636cc | ||
|
|
059779231f | ||
|
|
ea37530b47 | ||
|
|
f5432e35a3 | ||
|
|
07cab212f0 | ||
|
|
0c1dc42748 | ||
|
|
676f82ae81 | ||
|
|
81bfc21a6a | ||
|
|
4e2c7caf2d | ||
|
|
d9e62c03eb | ||
|
|
a1a2d79442 | ||
|
|
ac900c89bb | ||
|
|
76df6072ff | ||
|
|
16f24e8797 | ||
|
|
40b2f1c3d9 | ||
|
|
648951a9c3 | ||
|
|
f72061a19a | ||
|
|
662205d34e | ||
|
|
4fb8beefaa | ||
|
|
304319c4ed | ||
|
|
c683d11c94 | ||
|
|
3eff45d793 | ||
|
|
4685a630a2 | ||
|
|
ee1d25f199 | ||
|
|
6fff24f30f | ||
|
|
23210a911e | ||
|
|
1391378861 | ||
|
|
f6220f9877 | ||
|
|
2df2bb27b0 | ||
|
|
f75b61a9e9 | ||
|
|
7f51e93864 | ||
|
|
4611af1663 | ||
|
|
ad5aa6bd9f | ||
|
|
9681068cf9 | ||
|
|
b6101d384d | ||
|
|
5fcb0cdd68 | ||
|
|
c878b43b64 | ||
|
|
2b84ac669c | ||
|
|
11d3976b88 | ||
|
|
40da9625a1 | ||
|
|
8d9babd4de | ||
|
|
e99ba957ec | ||
|
|
64ac1395e8 | ||
|
|
61cf087680 | ||
|
|
847a57cd12 | ||
|
|
fcd6ac97ed | ||
|
|
95be2a7f22 | ||
|
|
0e60c925cf | ||
|
|
d7ff22204a | ||
|
|
c0bd8b13da | ||
|
|
caeb887bf6 | ||
|
|
6b3166a7c7 | ||
|
|
25e2e136ef | ||
|
|
6874638bc4 | ||
|
|
e24663c5a9 | ||
|
|
c50e105a88 | ||
|
|
a766b30349 | ||
|
|
1faa8cb73c | ||
|
|
e89a91d927 | ||
|
|
909b147197 | ||
|
|
a88b3be7c4 | ||
|
|
a49ea5a58f | ||
|
|
30ebe0dc3c | ||
|
|
cef65f0715 | ||
|
|
6f3b2047ab | ||
|
|
02e8f26cea | ||
|
|
4a00a511bb | ||
|
|
a0d8d944e2 | ||
|
|
df3f537a66 | ||
|
|
7743152957 | ||
|
|
ab33d2a629 | ||
|
|
be3af2d29e | ||
|
|
c656ba3b4d | ||
|
|
dc5fa77a4e | ||
|
|
1e4a084c8e | ||
|
|
7967e854da | ||
|
|
6bd6d0c3c1 | ||
|
|
8e962fef5f | ||
|
|
574fe75245 | ||
|
|
c61a98f529 | ||
|
|
28bffe9466 | ||
|
|
ad65177a19 | ||
|
|
d44a5b6c47 | ||
|
|
1d65283e95 | ||
|
|
c464b57374 | ||
|
|
c5c38e152a | ||
|
|
d00df624f3 | ||
|
|
9752da9d9c | ||
|
|
04925b2202 | ||
|
|
d74278fb67 | ||
|
|
b68fd899d1 | ||
|
|
0b5f9b7204 | ||
|
|
9a8853f781 | ||
|
|
387a1898d9 | ||
|
|
3b30e61507 | ||
|
|
824f9e8f3c | ||
|
|
6cc403e67d | ||
|
|
72d5951d02 | ||
|
|
a3205beffb | ||
|
|
6930becd45 | ||
|
|
03a8770a6d | ||
|
|
bc56a1d56e | ||
|
|
ec7d9e6745 | ||
|
|
3bb4e4311c | ||
|
|
08f8c198ae | ||
|
|
a21cedf4ff | ||
|
|
3ef74cde5d | ||
|
|
cd81cdb399 | ||
|
|
1e828573b4 | ||
|
|
a5ccc85c8c | ||
|
|
b5475d0534 | ||
|
|
9521002f0a | ||
|
|
ec17bdd894 | ||
|
|
bb59c90248 | ||
|
|
5bff999d12 | ||
|
|
bb85929aa6 | ||
|
|
5653021094 | ||
|
|
974d829b05 | ||
|
|
91ac5d9bfd | ||
|
|
23d825aba1 | ||
|
|
f07a128413 | ||
|
|
71cd89264f | ||
|
|
19fab44152 | ||
|
|
79c7e09235 | ||
|
|
79f3fab05a | ||
|
|
604b9eaec5 | ||
|
|
50dbd6c9e6 | ||
|
|
98bcc6ca59 | ||
|
|
f13e86d8dd | ||
|
|
9ca768c740 | ||
|
|
d5fe3f702c | ||
|
|
73391a1baa | ||
|
|
b3c14229b0 | ||
|
|
2f186635cb | ||
|
|
342a7cda2d | ||
|
|
d1ea65d0a1 | ||
|
|
de42abb366 | ||
|
|
60ca7981bc | ||
|
|
0ef5b9147b | ||
|
|
ed242652d7 | ||
|
|
b37b679770 | ||
|
|
a0638d052d | ||
|
|
c027541eaf | ||
|
|
fd267bc7b7 | ||
|
|
bfaa559305 | ||
|
|
87789c8364 | ||
|
|
bcd65c1f6a | ||
|
|
59d53066d8 | ||
|
|
4a9952ec1b | ||
|
|
1dae7b7843 | ||
|
|
5885e330ef | ||
|
|
071d863e20 | ||
|
|
0916e7960b | ||
|
|
3d2a026fd0 | ||
|
|
dddbff4624 | ||
|
|
47e9b63e1a | ||
|
|
934acddef9 | ||
|
|
742d214d6e | ||
|
|
4137c5dfa7 | ||
|
|
7a8a46ddcb | ||
|
|
bcf0731aa0 | ||
|
|
ec090c2429 | ||
|
|
eea3024f43 | ||
|
|
2f308214c0 | ||
|
|
1b4e8e53f8 | ||
|
|
dcf6ee8592 | ||
|
|
372b2e762a | ||
|
|
6afa587d31 | ||
|
|
94ed6cf6ea | ||
|
|
bf37812ca7 | ||
|
|
b86bf4417e | ||
|
|
de13dd781f | ||
|
|
62788f99a4 | ||
|
|
ea5ff3a1f6 | ||
|
|
04ea31baab | ||
|
|
6f019e6e0a | ||
|
|
d707678dfb | ||
|
|
fc22cae4ac | ||
|
|
96161fe978 | ||
|
|
4453ba8d9e | ||
|
|
aa181c923b | ||
|
|
be7370daf3 | ||
|
|
9ea1f598ce | ||
|
|
f120bd42d3 | ||
|
|
fac4e96940 | ||
|
|
6d4e27ce29 | ||
|
|
4c078fa546 | ||
|
|
6c0baee610 | ||
|
|
1100a97621 | ||
|
|
766e167821 | ||
|
|
becbe24808 | ||
|
|
679ca5d8d3 | ||
|
|
f2c47886fd | ||
|
|
334c715e0f | ||
|
|
7b5a8b4a9d | ||
|
|
dea63512bb | ||
|
|
8a798be929 | ||
|
|
fb455ed547 | ||
|
|
f5897613fb | ||
|
|
55a1a9563a | ||
|
|
386bfe5d08 | ||
|
|
e9cd691132 | ||
|
|
80f2ba6ea6 | ||
|
|
136b0bfa59 | ||
|
|
b96f7314b4 | ||
|
|
ced2a92f40 | ||
|
|
e1d97c38f8 | ||
|
|
ec12d39d44 | ||
|
|
ff1f83b056 | ||
|
|
83b47f67b1 | ||
|
|
fb7b30c716 | ||
|
|
31d992d215 | ||
|
|
5aff2699bd | ||
|
|
527ca32197 | ||
|
|
5458eb835d | ||
|
|
144d9b7cc8 | ||
|
|
83e26c834e | ||
|
|
5001211369 | ||
|
|
11c7ace340 | ||
|
|
be7f3d5d20 | ||
|
|
0ab06100f4 | ||
|
|
ffb3d553cc | ||
|
|
fa7e0bfacf | ||
|
|
48134a2c22 | ||
|
|
64f570ab56 | ||
|
|
fd618871b4 | ||
|
|
67a42b5a44 | ||
|
|
c7914d30f9 | ||
|
|
1b8756562e | ||
|
|
275e0d2a99 | ||
|
|
0f5e55e7a8 | ||
|
|
1e9204bff3 | ||
|
|
05339a7b20 | ||
|
|
40b8f55358 | ||
|
|
5045d5c983 | ||
|
|
e09546cf05 | ||
|
|
786806dd44 | ||
|
|
79504027ef | ||
|
|
addac0e653 | ||
|
|
675a22ed66 | ||
|
|
cb9574eb85 | ||
|
|
21dfb842d7 | ||
|
|
d1b837f0ae | ||
|
|
0b20469c62 | ||
|
|
d7982daff5 | ||
|
|
9b17c57460 | ||
|
|
1b3540e6c6 | ||
|
|
7a048ee65f | ||
|
|
c9a1923bb4 | ||
|
|
b482f71e9f | ||
|
|
1485396abb | ||
|
|
5ee5c86eeb | ||
|
|
b5dcb372e4 | ||
|
|
066c6da6a0 | ||
|
|
e30cedd44b | ||
|
|
3bcd494ef4 | ||
|
|
0e725a7d22 | ||
|
|
ba0511fd80 | ||
|
|
4a1550d22d | ||
|
|
d1481ba783 | ||
|
|
dc6de33c3d | ||
|
|
c4b9e6778f | ||
|
|
341eed3d30 | ||
|
|
6f2f59f2b3 | ||
|
|
bb2fc8b5e7 | ||
|
|
67132945bb | ||
|
|
f0ca0671c7 | ||
|
|
578977bb5e | ||
|
|
9615575afc | ||
|
|
4293c00b84 | ||
|
|
506ad7d7c1 | ||
|
|
fdd6f2ad58 | ||
|
|
33bcd3dc3b | ||
|
|
1f5febb4b8 | ||
|
|
ae871ca923 | ||
|
|
a2443de5fa | ||
|
|
f84a2a8f31 | ||
|
|
000214c4bb | ||
|
|
c5a66d1697 | ||
|
|
afdce12c89 | ||
|
|
82e11973cc | ||
|
|
b129136c7a | ||
|
|
599e4335a4 | ||
|
|
a1946570d8 | ||
|
|
d0bc520569 | ||
|
|
748625cdaf | ||
|
|
61413973e8 | ||
|
|
94de871546 | ||
|
|
e042d7e685 | ||
|
|
ae4e280602 | ||
|
|
cbea11c9f0 | ||
|
|
2c32558a3c | ||
|
|
5f970120f0 | ||
|
|
998e2d91f8 | ||
|
|
e1060a71a1 | ||
|
|
97fa8f6590 | ||
|
|
dab1de9f38 | ||
|
|
8d48d0a9d9 | ||
|
|
9608844f96 | ||
|
|
f69b903b4c | ||
|
|
81e217fe6b | ||
|
|
ab97bcf662 | ||
|
|
25e48a3aae | ||
|
|
8a5e0e2b2b | ||
|
|
4cde2e0159 | ||
|
|
047a457fa4 | ||
|
|
e94ec59733 | ||
|
|
13397841ab | ||
|
|
c60f8e3b49 | ||
|
|
5e75a14a66 | ||
|
|
e7e52781ff | ||
|
|
bb9f97308d | ||
|
|
4d39650961 | ||
|
|
8fd31f6245 | ||
|
|
eadb4e868b | ||
|
|
285bab4752 | ||
|
|
995bbf38f1 | ||
|
|
d4f123cc48 | ||
|
|
cb62e86f83 | ||
|
|
781ddf7868 | ||
|
|
64a9c2528b | ||
|
|
d0d97e2974 | ||
|
|
9562912cea | ||
|
|
9bdb06b436 | ||
|
|
caad9f1e01 | ||
|
|
1d5922fade | ||
|
|
3025b3cebb | ||
|
|
978a37c823 | ||
|
|
5a5c43511a | ||
|
|
d9bede0314 | ||
|
|
22b64948f6 | ||
|
|
7c233dbb36 | ||
|
|
a75a5b54c7 | ||
|
|
f97ca67176 | ||
|
|
084aa19f02 | ||
|
|
1ecfabe525 | ||
|
|
4df841fe75 | ||
|
|
a263aa6140 | ||
|
|
179ae7da8f | ||
|
|
c4df59ad43 | ||
|
|
785cf28fff | ||
|
|
a96197f564 | ||
|
|
ab10d79855 | ||
|
|
7fcb705b80 | ||
|
|
b956cdf818 | ||
|
|
ed17f54c8b | ||
|
|
860981d8d8 | ||
|
|
52181baaea | ||
|
|
de3869bb4d | ||
|
|
ce9b3cd3e9 | ||
|
|
db4ede9743 | ||
|
|
2cb2340f7a | ||
|
|
4df44c16ba | ||
|
|
81fe69cae5 | ||
|
|
dd6a6e1190 | ||
|
|
edb359cce4 | ||
|
|
6ed5eda300 | ||
|
|
11a4c9d30d | ||
|
|
15a0b9e570 | ||
|
|
c490d8cc73 | ||
|
|
48312e579a | ||
|
|
bc32444b23 | ||
|
|
18e8545297 | ||
|
|
6f7adc533a | ||
|
|
40218a82ba | ||
|
|
1c3b22058f | ||
|
|
3920cafdd6 | ||
|
|
ec28784fdc | ||
|
|
55aeec04f5 | ||
|
|
906077181b | ||
|
|
89a385d79f | ||
|
|
4a2d00eafd | ||
|
|
207c3a0c20 | ||
|
|
ae2e93f89b | ||
|
|
9e9acce577 | ||
|
|
fe5438200b | ||
|
|
77c09e1130 | ||
|
|
16786da735 | ||
|
|
aaa2efbe98 | ||
|
|
aca5967416 | ||
|
|
67a746e87f | ||
|
|
7bec435130 | ||
|
|
5c52644b10 | ||
|
|
2ce9fe4ad0 | ||
|
|
cd8b405bd0 | ||
|
|
4707f7ebb4 | ||
|
|
c39ee9ee2b | ||
|
|
350ca72c04 | ||
|
|
1fb0495a72 | ||
|
|
85ee1d962b | ||
|
|
51a7bda625 | ||
|
|
6e7b1c4b59 | ||
|
|
2991dd3d22 | ||
|
|
ac32e66cf9 | ||
|
|
f79d9dce16 | ||
|
|
ba5cbbf107 | ||
|
|
233b26ab35 | ||
|
|
791a94bed0 | ||
|
|
e969a169ef | ||
|
|
6d8d34be6d | ||
|
|
1363e3d6d5 | ||
|
|
965525667b | ||
|
|
6550815c3a | ||
|
|
7439e4f41b | ||
|
|
ac04dd374f | ||
|
|
035a6cb09a | ||
|
|
a32cb49b60 | ||
|
|
20d7454c9b | ||
|
|
5819ca8944 | ||
|
|
79028d4388 | ||
|
|
325ab6b0a8 | ||
|
|
91a07ff618 | ||
|
|
d5c4800112 | ||
|
|
42d5d705f9 | ||
|
|
116880a5a0 | ||
|
|
4145e50d85 | ||
|
|
20f5d185a6 | ||
|
|
1887acca9e | ||
|
|
92e7562a99 | ||
|
|
87d0d17ab5 | ||
|
|
a57c8228ff | ||
|
|
1ee95841bd | ||
|
|
7d8c6804e2 | ||
|
|
af3162d3aa | ||
|
|
5b2a9422f0 | ||
|
|
c1858b7ec8 | ||
|
|
82914d2ae8 | ||
|
|
81a90e5277 | ||
|
|
1c3a221d3b | ||
|
|
7bd42e609d | ||
|
|
a2522839d8 | ||
|
|
59a5cb387a | ||
|
|
8322d4e47f | ||
|
|
3e472e81f9 | ||
|
|
038914b7c8 | ||
|
|
d2f4a71cd5 | ||
|
|
2abd97592f | ||
|
|
6abb0454ad | ||
|
|
db6f71d4c9 | ||
|
|
fd03538bf9 | ||
|
|
1f70313e59 | ||
|
|
07daee132b | ||
|
|
9595afda18 | ||
|
|
c1395f72cd | ||
|
|
007b183d74 | ||
|
|
add9f1fbd9 | ||
|
|
e3bf79ffa0 | ||
|
|
fb1270f1f8 | ||
|
|
72bb24e2db | ||
|
|
a7be77beef | ||
|
|
bbe0574d8e | ||
|
|
4d9513537d | ||
|
|
439afa4eea | ||
|
|
fa4e0fb028 | ||
|
|
ce498a6d61 | ||
|
|
9f14c9224d | ||
|
|
535de06cb1 | ||
|
|
4292c90a2a | ||
|
|
6e98f6d8b6 | ||
|
|
2f6d17cb2f | ||
|
|
192ad4648b | ||
|
|
0e92298622 | ||
|
|
87d9a26166 | ||
|
|
80f921ba4b | ||
|
|
711edaf0d0 | ||
|
|
1d367a738e | ||
|
|
32a02c7ca2 | ||
|
|
f67ee8b859 | ||
|
|
e57ef99b40 | ||
|
|
f8516a1ab9 | ||
|
|
824058076c | ||
|
|
8e32690869 | ||
|
|
a208439537 | ||
|
|
bcd2f74c0d | ||
|
|
f79f777803 | ||
|
|
4c8d1bf361 | ||
|
|
061da6bcf7 | ||
|
|
4403e3ed4c | ||
|
|
08e094997e | ||
|
|
d88a1df699 | ||
|
|
90d74ebaa4 | ||
|
|
45f8fd6f97 | ||
|
|
5e1e0a0fbd | ||
|
|
eb5ed20743 | ||
|
|
2647163674 | ||
|
|
9fb27dd3b3 | ||
|
|
4dffc5e044 | ||
|
|
e1bf04b6c2 | ||
|
|
02080179a3 | ||
|
|
1b8fe6f7c4 | ||
|
|
52ee21021a | ||
|
|
655efb3e69 | ||
|
|
bd8da29a66 | ||
|
|
2a99c5a6c8 | ||
|
|
3f7662d650 | ||
|
|
a372f3f40a | ||
|
|
61e632aea1 | ||
|
|
b1bb18de8d | ||
|
|
2267cb1cfd | ||
|
|
0d6ccf68fa | ||
|
|
18e7cbbb15 | ||
|
|
f0d5251715 | ||
|
|
5c4f2dd6ef | ||
|
|
f3d8a34671 | ||
|
|
4bc913aeec | ||
|
|
fbb3cf6981 | ||
|
|
2df2b3499d | ||
|
|
2a8d84e66d | ||
|
|
a3acfa1071 | ||
|
|
be8168ff88 | ||
|
|
f6af34626d | ||
|
|
ceab70c89d | ||
|
|
52683ccbe1 | ||
|
|
e346e2d056 | ||
|
|
83449a5ff0 | ||
|
|
dad2d6a590 | ||
|
|
32e84fa1ff | ||
|
|
fd9c83d0e0 | ||
|
|
b95cc5014d | ||
|
|
61397891ce | ||
|
|
ef248ff740 | ||
|
|
e10604480b | ||
|
|
bf001da4bf | ||
|
|
a0a984ac2e | ||
|
|
f1cb9b5544 | ||
|
|
4c4b6f7a97 | ||
|
|
10546f925a | ||
|
|
e69c990c21 | ||
|
|
5eac9a1b34 | ||
|
|
1b60b45d0d | ||
|
|
4b3803d180 | ||
|
|
5019c59dd2 | ||
|
|
089cd4f002 | ||
|
|
0130223bd9 | ||
|
|
5d1aef3004 | ||
|
|
ffe1fc7a28 | ||
|
|
8b7346d5f1 | ||
|
|
6141ebe0dd | ||
|
|
199e3cb476 | ||
|
|
9f8cb81b44 | ||
|
|
d7e17aaacd | ||
|
|
528e9b1490 | ||
|
|
d95b4be47a | ||
|
|
4061dcf4c5 | ||
|
|
0aca8b8c62 | ||
|
|
9eb58f8cf1 | ||
|
|
b10d05b8a8 | ||
|
|
b398e5c819 | ||
|
|
78061ef584 | ||
|
|
528b3076af | ||
|
|
a502831d36 | ||
|
|
ba871fb788 | ||
|
|
ab374786c7 | ||
|
|
808dd87b30 | ||
|
|
beb8899482 | ||
|
|
ce88756b96 | ||
|
|
a3154a6092 | ||
|
|
7c036432fc | ||
|
|
318b120766 | ||
|
|
c3b40dc3e7 | ||
|
|
a01ef3fa51 | ||
|
|
7320ca3942 | ||
|
|
cf0a99f84d | ||
|
|
e535d90deb | ||
|
|
0b225fb7b2 | ||
|
|
46b4a02794 | ||
|
|
8869cd8ec1 | ||
|
|
cd86fff38f | ||
|
|
b5f8c3092d | ||
|
|
21997f45b1 | ||
|
|
672023877b | ||
|
|
754a8ca942 | ||
|
|
302ecf64ff | ||
|
|
b6bb2842cf | ||
|
|
79b6ec6aab | ||
|
|
d6416fdde9 | ||
|
|
0fb3157267 | ||
|
|
a358e4dffe | ||
|
|
079781177a | ||
|
|
63c0889416 | ||
|
|
1e86c802d4 | ||
|
|
fedf64332e | ||
|
|
2238a12c13 | ||
|
|
ce0afe2451 | ||
|
|
88c3e114d8 | ||
|
|
92924b2ddd | ||
|
|
27cb2f678f | ||
|
|
22d9a056d5 | ||
|
|
13b842f271 | ||
|
|
15f40b20aa | ||
|
|
793af538a3 | ||
|
|
6f5e7cda57 | ||
|
|
68feb76a6f | ||
|
|
4cb59dea6a | ||
|
|
608b556507 | ||
|
|
f0a1c8453a | ||
|
|
8980001c93 | ||
|
|
527bcd14d4 | ||
|
|
f68e3ea4e1 | ||
|
|
d5c41db35b | ||
|
|
1618e25492 | ||
|
|
f3888aca83 | ||
|
|
f0bca83ee4 | ||
|
|
73419abfae | ||
|
|
e77f162cf5 | ||
|
|
8ecd213c0b | ||
|
|
5b55c0bea7 | ||
|
|
15e0bb9c42 | ||
|
|
6c64c41b4a | ||
|
|
a2ef06e1b3 | ||
|
|
0a3c71e7e5 | ||
|
|
29fba76781 | ||
|
|
9df152bbf6 | ||
|
|
876a16f4fb | ||
|
|
aaa901ad55 | ||
|
|
010ec0c30e | ||
|
|
64a40a7ab4 | ||
|
|
31aedfe7d6 | ||
|
|
67ebaff528 | ||
|
|
2b465570e6 | ||
|
|
9ca66ecc10 | ||
|
|
c3a9752b0c | ||
|
|
f451b4558b | ||
|
|
3f96fcf646 | ||
|
|
6c1f9e4c18 | ||
|
|
67239c4c42 | ||
|
|
8ece60768f | ||
|
|
fd0e377244 | ||
|
|
f857a03f6b | ||
|
|
74898a7015 | ||
|
|
8f5d51203b | ||
|
|
ae5b7aff2b | ||
|
|
a11bc12d53 | ||
|
|
58cb55e4de | ||
|
|
cf896ae0e3 | ||
|
|
c5113f60f2 | ||
|
|
174f16700b | ||
|
|
8e2ad97ad0 | ||
|
|
10152d2194 | ||
|
|
1a7894dbdf | ||
|
|
c87eac18f7 | ||
|
|
f45870b53f | ||
|
|
ba45bedfd1 | ||
|
|
9432ed8c7e | ||
|
|
726d89720c | ||
|
|
d334dd26c4 | ||
|
|
070c811d6f | ||
|
|
8bfc8d5600 | ||
|
|
ec51831a22 | ||
|
|
80b918f2bd | ||
|
|
c46b0cd0af | ||
|
|
133765760b | ||
|
|
bfb9bdaf3f | ||
|
|
2284461d02 | ||
|
|
8e2a469b3b | ||
|
|
23591e631e | ||
|
|
0493d897c4 | ||
|
|
8c8ebeb941 | ||
|
|
831453fcef | ||
|
|
5a66c9cc76 | ||
|
|
5e73e4900c | ||
|
|
c6e7404cc5 | ||
|
|
17b17c0684 | ||
|
|
8bb6271c77 | ||
|
|
8b3f0a99dd | ||
|
|
8311f083bd | ||
|
|
40c35038d2 | ||
|
|
a5aa4d5c0f | ||
|
|
615e8033e5 | ||
|
|
d09135fbd0 | ||
|
|
8688c3d460 | ||
|
|
5400014d55 | ||
|
|
3a92c6f3b5 | ||
|
|
e01ff5c070 | ||
|
|
fb946a7f89 | ||
|
|
a650ad1588 | ||
|
|
d697581a7c | ||
|
|
5eeba80c74 | ||
|
|
08b1195e62 | ||
|
|
3bba2edb0f | ||
|
|
53fc166402 | ||
|
|
31b25f6516 | ||
|
|
abb34ac43a | ||
|
|
2515bbd027 | ||
|
|
c487a8eef4 | ||
|
|
9e138cb01d | ||
|
|
f9d03599ef | ||
|
|
39037d258e | ||
|
|
51550179fc | ||
|
|
07ea184f00 | ||
|
|
a663b218ae | ||
|
|
1bd47d6e5a | ||
|
|
141cd43967 | ||
|
|
6bf3b46d78 | ||
|
|
77c4f45c6c | ||
|
|
ca1969186d | ||
|
|
ab597c869a | ||
|
|
4197168ea5 | ||
|
|
59bcc5b6f2 | ||
|
|
3e440786af | ||
|
|
8bdd3979d8 | ||
|
|
c4e744dbd4 | ||
|
|
8ebf372e9d | ||
|
|
f210f0b7b1 | ||
|
|
392c5af4fe | ||
|
|
af9b69f977 | ||
|
|
8e5e40daf4 | ||
|
|
2e8de86777 | ||
|
|
247d1a32ea | ||
|
|
ecb4f82209 | ||
|
|
5914090765 | ||
|
|
f1acbd68c5 | ||
|
|
9581185d51 | ||
|
|
2dd359f953 | ||
|
|
22ad649501 | ||
|
|
36d450e3b8 | ||
|
|
a2b877df6c | ||
|
|
35fb0b8613 | ||
|
|
2eb673a088 | ||
|
|
a97b5e206d | ||
|
|
911b51b69f | ||
|
|
604e3b87e8 | ||
|
|
706f123b23 | ||
|
|
fb7abfc1d0 | ||
|
|
5d3d6e44e8 | ||
|
|
46ec6d71c7 | ||
|
|
e82fa448c4 | ||
|
|
d9aa39a3bb | ||
|
|
3a6d5cbefd | ||
|
|
f5d7049cc1 | ||
|
|
3c3c547ce0 | ||
|
|
1cbccb6dba | ||
|
|
bd92089d33 | ||
|
|
a6760f1525 | ||
|
|
66e601ef79 | ||
|
|
0cd259b2d8 | ||
|
|
83fb2d09e8 | ||
|
|
f3a5ee705f | ||
|
|
7cbbca9aaa | ||
|
|
5ec44056f7 | ||
|
|
492a7983dd | ||
|
|
a608b4c6c2 | ||
|
|
1f3a2c2944 | ||
|
|
7227d06156 | ||
|
|
14385c80fc | ||
|
|
76139d0801 | ||
|
|
da8d0c441a | ||
|
|
58996f3589 | ||
|
|
b539f988e1 | ||
|
|
6c00645712 | ||
|
|
b781eeaa15 | ||
|
|
e0b005d9cf | ||
|
|
3b8f0fe59e | ||
|
|
c831911be2 | ||
|
|
157caf511b | ||
|
|
0b53bec60b | ||
|
|
c568581ff3 | ||
|
|
2d7053438a | ||
|
|
5a93b9162b | ||
|
|
6d86fde09c | ||
|
|
510ed1e8d3 | ||
|
|
8caffd92df | ||
|
|
58a05b0ca1 | ||
|
|
6ee7f18f33 | ||
|
|
8f987883cb | ||
|
|
ebe0ba91db | ||
|
|
43a013c3a2 | ||
|
|
c25dbee40d | ||
|
|
19ab0f7ce5 | ||
|
|
67fe677c53 | ||
|
|
d56afd45fd | ||
|
|
a2393ed496 | ||
|
|
be6931ee27 | ||
|
|
9ef3b718d9 | ||
|
|
bb17e8f11c | ||
|
|
dcd80206b7 | ||
|
|
f4a0921c9c | ||
|
|
208c56256f | ||
|
|
9ac818a551 | ||
|
|
6ca2c91b96 | ||
|
|
e33192b269 | ||
|
|
61274bdef5 | ||
|
|
b40db4dfec |
@@ -1,7 +1,8 @@
|
|||||||
name: vllm_ci
|
name: vllm_ci
|
||||||
job_dirs:
|
job_dirs:
|
||||||
- ".buildkite/test_areas"
|
|
||||||
- ".buildkite/image_build"
|
- ".buildkite/image_build"
|
||||||
|
- ".buildkite/test_areas"
|
||||||
|
- ".buildkite/hardware_tests"
|
||||||
run_all_patterns:
|
run_all_patterns:
|
||||||
- "docker/Dockerfile"
|
- "docker/Dockerfile"
|
||||||
- "CMakeLists.txt"
|
- "CMakeLists.txt"
|
||||||
|
|||||||
30
.buildkite/hardware_tests/amd.yaml
Normal file
30
.buildkite/hardware_tests/amd.yaml
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
group: Hardware - AMD Build
|
||||||
|
steps:
|
||||||
|
- label: "AMD: :docker: build image"
|
||||||
|
key: image-build-amd
|
||||||
|
depends_on: []
|
||||||
|
device: amd_cpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- >
|
||||||
|
docker build
|
||||||
|
--build-arg max_jobs=16
|
||||||
|
--build-arg REMOTE_VLLM=1
|
||||||
|
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
|
||||||
|
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
||||||
|
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
|
-f docker/Dockerfile.rocm
|
||||||
|
--target test
|
||||||
|
--no-cache
|
||||||
|
--progress plain .
|
||||||
|
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
retry:
|
||||||
|
automatic:
|
||||||
|
- exit_status: -1 # Agent was lost
|
||||||
|
limit: 1
|
||||||
|
- exit_status: -10 # Agent was lost
|
||||||
|
limit: 1
|
||||||
|
- exit_status: 1 # Machine occasionally fail
|
||||||
|
limit: 1
|
||||||
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
group: Hardware
|
||||||
|
depends_on: ~
|
||||||
|
steps:
|
||||||
|
- label: "Ascend NPU Test"
|
||||||
|
soft_fail: true
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
no_plugin: true
|
||||||
|
device: ascend_npu
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh
|
||||||
100
.buildkite/hardware_tests/cpu.yaml
Normal file
100
.buildkite/hardware_tests/cpu.yaml
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
group: CPU
|
||||||
|
depends_on: []
|
||||||
|
steps:
|
||||||
|
- label: CPU-Kernel Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/
|
||||||
|
- cmake/cpu_extension.cmake
|
||||||
|
- CMakeLists.txt
|
||||||
|
- vllm/_custom_ops.py
|
||||||
|
- tests/kernels/attention/test_cpu_attn.py
|
||||||
|
- tests/kernels/moe/test_cpu_fused_moe.py
|
||||||
|
- tests/kernels/test_onednn.py
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||||
|
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||||
|
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||||
|
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||||
|
|
||||||
|
- label: CPU-Language Generation and Pooling Model Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/
|
||||||
|
- vllm/
|
||||||
|
- tests/models/language/generation/
|
||||||
|
- tests/models/language/pooling/
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
|
||||||
|
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||||
|
pytest -x -v -s tests/models/language/pooling -m cpu_model"
|
||||||
|
|
||||||
|
- label: CPU-Quantization Model Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/
|
||||||
|
- vllm/model_executor/layers/quantization/cpu_wna16.py
|
||||||
|
- vllm/model_executor/layers/quantization/gptq_marlin.py
|
||||||
|
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
|
||||||
|
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
|
||||||
|
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
|
||||||
|
- tests/quantization/test_compressed_tensors.py
|
||||||
|
- tests/quantization/test_cpu_wna16.py
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||||
|
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
|
||||||
|
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
|
||||||
|
|
||||||
|
- label: CPU-Distributed Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/shm.cpp
|
||||||
|
- vllm/v1/worker/cpu_worker.py
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- vllm/v1/worker/cpu_model_runner.py
|
||||||
|
- vllm/v1/worker/gpu_model_runner.py
|
||||||
|
- vllm/platforms/cpu.py
|
||||||
|
- vllm/distributed/parallel_state.py
|
||||||
|
- vllm/distributed/device_communicators/cpu_communicator.py
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
|
||||||
|
|
||||||
|
- label: CPU-Multi-Modal Model Tests %N
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
# - vllm/
|
||||||
|
- vllm/model_executor/layers/rotary_embedding
|
||||||
|
- tests/models/multimodal/generation/
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
|
||||||
|
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
|
- label: "Arm CPU Test"
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: arm_cpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
|
||||||
10
.buildkite/hardware_tests/gh200.yaml
Normal file
10
.buildkite/hardware_tests/gh200.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
group: Hardware
|
||||||
|
steps:
|
||||||
|
- label: "GH200 Test"
|
||||||
|
soft_fail: true
|
||||||
|
device: gh200
|
||||||
|
no_plugin: true
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
|
||||||
17
.buildkite/hardware_tests/intel.yaml
Normal file
17
.buildkite/hardware_tests/intel.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
group: Hardware
|
||||||
|
depends_on: ~
|
||||||
|
steps:
|
||||||
|
- label: "Intel HPU Test"
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_hpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
|
||||||
|
|
||||||
|
- label: "Intel GPU Test"
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_gpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
|
||||||
@@ -1,56 +1,255 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -e
|
set -euo pipefail
|
||||||
|
|
||||||
if [[ $# -lt 8 ]]; then
|
# replace invalid characters in Docker image tags and truncate to 128 chars
|
||||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
clean_docker_tag() {
|
||||||
exit 1
|
local input="$1"
|
||||||
|
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
|
||||||
|
}
|
||||||
|
|
||||||
|
print_usage_and_exit() {
|
||||||
|
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
print_instance_info() {
|
||||||
|
echo ""
|
||||||
|
echo "=== Debug: Instance Information ==="
|
||||||
|
# Get IMDSv2 token
|
||||||
|
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
|
||||||
|
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
|
||||||
|
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
|
||||||
|
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
|
||||||
|
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
|
||||||
|
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
|
||||||
|
echo "AMI ID: ${AMI_ID}"
|
||||||
|
echo "Instance Type: ${INSTANCE_TYPE}"
|
||||||
|
echo "Instance ID: ${INSTANCE_ID}"
|
||||||
|
echo "AZ: ${AZ}"
|
||||||
|
else
|
||||||
|
echo "Not running on EC2 or IMDS not available"
|
||||||
|
fi
|
||||||
|
# Check for warm cache AMI (marker file baked into custom AMI)
|
||||||
|
if [[ -f /etc/vllm-ami-info ]]; then
|
||||||
|
echo "Cache: warm (custom vLLM AMI)"
|
||||||
|
cat /etc/vllm-ami-info
|
||||||
|
else
|
||||||
|
echo "Cache: cold (standard AMI)"
|
||||||
|
fi
|
||||||
|
echo "==================================="
|
||||||
|
echo ""
|
||||||
|
}
|
||||||
|
|
||||||
|
setup_buildx_builder() {
|
||||||
|
echo "--- :buildkite: Setting up buildx builder"
|
||||||
|
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
|
||||||
|
# Custom AMI with standalone buildkitd - use remote driver for warm cache
|
||||||
|
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
|
||||||
|
echo "Using remote driver to connect to buildkitd (warm cache available)"
|
||||||
|
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
|
||||||
|
echo "Using existing baked-vllm-builder"
|
||||||
|
docker buildx use baked-vllm-builder
|
||||||
|
else
|
||||||
|
echo "Creating baked-vllm-builder with remote driver"
|
||||||
|
docker buildx create \
|
||||||
|
--name baked-vllm-builder \
|
||||||
|
--driver remote \
|
||||||
|
--use \
|
||||||
|
"unix://${BUILDKIT_SOCKET}"
|
||||||
|
fi
|
||||||
|
docker buildx inspect --bootstrap
|
||||||
|
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
|
||||||
|
# Existing builder available
|
||||||
|
echo "Using existing builder: ${BUILDER_NAME}"
|
||||||
|
docker buildx use "${BUILDER_NAME}"
|
||||||
|
docker buildx inspect --bootstrap
|
||||||
|
else
|
||||||
|
# No local buildkitd, no existing builder - create new docker-container builder
|
||||||
|
echo "No local buildkitd found, using docker-container driver"
|
||||||
|
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
|
||||||
|
docker buildx inspect --bootstrap
|
||||||
|
fi
|
||||||
|
|
||||||
|
# builder info
|
||||||
|
echo "Active builder:"
|
||||||
|
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
|
||||||
|
}
|
||||||
|
|
||||||
|
check_and_skip_if_image_exists() {
|
||||||
|
if [[ -n "${IMAGE_TAG:-}" ]]; then
|
||||||
|
echo "--- :mag: Checking if image exists"
|
||||||
|
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
|
||||||
|
echo "Image already exists: ${IMAGE_TAG}"
|
||||||
|
echo "Skipping build"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "Image not found, proceeding with build"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
ecr_login() {
|
||||||
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||||
|
}
|
||||||
|
|
||||||
|
prepare_cache_tags() {
|
||||||
|
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
|
||||||
|
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
|
||||||
|
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
|
||||||
|
|
||||||
|
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||||
|
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
||||||
|
cache="${MAIN_CACHE_ECR}:latest"
|
||||||
|
else
|
||||||
|
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
|
||||||
|
cache="${TEST_CACHE_ECR}:${clean_branch}"
|
||||||
|
fi
|
||||||
|
CACHE_TO="$cache"
|
||||||
|
CACHE_FROM="$cache"
|
||||||
|
CACHE_FROM_BASE_BRANCH="$cache"
|
||||||
|
else
|
||||||
|
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||||
|
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||||
|
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
|
||||||
|
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
|
||||||
|
else
|
||||||
|
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
|
||||||
|
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
|
||||||
|
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
|
||||||
|
}
|
||||||
|
|
||||||
|
resolve_parent_commit() {
|
||||||
|
if [[ -z "${PARENT_COMMIT:-}" ]]; then
|
||||||
|
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
|
||||||
|
if [[ -n "${PARENT_COMMIT}" ]]; then
|
||||||
|
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
|
||||||
|
export PARENT_COMMIT
|
||||||
|
else
|
||||||
|
echo "Could not determine parent commit (may be first commit in repo)"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
print_bake_config() {
|
||||||
|
echo "--- :page_facing_up: Resolved bake configuration"
|
||||||
|
# Write to a temp directory to avoid polluting the repo root (which is the
|
||||||
|
# Docker build context). Files left in the repo root get COPY'd into the
|
||||||
|
# image and can cause duplicate artifact uploads from downstream steps.
|
||||||
|
local bake_tmp
|
||||||
|
bake_tmp="$(mktemp -d)"
|
||||||
|
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
|
||||||
|
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
|
||||||
|
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
|
||||||
|
echo "--- :arrow_down: Uploading bake config to Buildkite"
|
||||||
|
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
|
||||||
|
}
|
||||||
|
|
||||||
|
#################################
|
||||||
|
# Main Script #
|
||||||
|
#################################
|
||||||
|
print_instance_info
|
||||||
|
|
||||||
|
if [[ $# -lt 5 ]]; then
|
||||||
|
print_usage_and_exit
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# input args
|
||||||
REGISTRY=$1
|
REGISTRY=$1
|
||||||
REPO=$2
|
REPO=$2
|
||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
BRANCH=$4
|
BRANCH=$4
|
||||||
VLLM_USE_PRECOMPILED=$5
|
IMAGE_TAG=$5
|
||||||
VLLM_MERGE_BASE_COMMIT=$6
|
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
|
||||||
CACHE_FROM=$7
|
|
||||||
CACHE_TO=$8
|
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# build config
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
TARGET="test-ci"
|
||||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
|
||||||
|
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
|
||||||
|
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
|
||||||
|
CI_HCL_PATH="/tmp/ci.hcl"
|
||||||
|
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
|
||||||
|
|
||||||
# docker buildx
|
prepare_cache_tags
|
||||||
docker buildx create --name vllm-builder --driver docker-container --use
|
ecr_login
|
||||||
docker buildx inspect --bootstrap
|
|
||||||
docker buildx ls
|
|
||||||
|
|
||||||
# skip build if image already exists
|
# Environment info (for docs and human readers)
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
|
||||||
echo "Image not found, proceeding with build..."
|
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
|
||||||
else
|
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
|
||||||
echo "Image found"
|
#
|
||||||
exit 0
|
# Build configuration (exported as environment variables for bake):
|
||||||
|
export BUILDKITE_COMMIT
|
||||||
|
export PARENT_COMMIT
|
||||||
|
export IMAGE_TAG
|
||||||
|
export IMAGE_TAG_LATEST
|
||||||
|
export CACHE_FROM
|
||||||
|
export CACHE_FROM_BASE_BRANCH
|
||||||
|
export CACHE_FROM_MAIN
|
||||||
|
export CACHE_TO
|
||||||
|
|
||||||
|
# print args
|
||||||
|
echo "--- :mag: Arguments"
|
||||||
|
echo "REGISTRY: ${REGISTRY}"
|
||||||
|
echo "REPO: ${REPO}"
|
||||||
|
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
|
||||||
|
echo "BRANCH: ${BRANCH}"
|
||||||
|
echo "IMAGE_TAG: ${IMAGE_TAG}"
|
||||||
|
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
|
||||||
|
|
||||||
|
# print build configuration
|
||||||
|
echo "--- :mag: Build configuration"
|
||||||
|
echo "TARGET: ${TARGET}"
|
||||||
|
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||||
|
echo "BUILDER_NAME: ${BUILDER_NAME}"
|
||||||
|
echo "CI_HCL_URL: ${CI_HCL_URL}"
|
||||||
|
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
|
||||||
|
|
||||||
|
echo "--- :mag: Cache tags"
|
||||||
|
echo "CACHE_TO: ${CACHE_TO}"
|
||||||
|
echo "CACHE_FROM: ${CACHE_FROM}"
|
||||||
|
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
|
||||||
|
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
|
||||||
|
|
||||||
|
check_and_skip_if_image_exists
|
||||||
|
|
||||||
|
echo "--- :docker: Setting up Docker buildx bake"
|
||||||
|
echo "Target: ${TARGET}"
|
||||||
|
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||||
|
echo "CI HCL path: ${CI_HCL_PATH}"
|
||||||
|
|
||||||
|
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
|
||||||
|
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
|
||||||
|
echo "Make sure you're running from the vLLM repository root"
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
echo "--- :arrow_down: Downloading ci.hcl"
|
||||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
|
||||||
else
|
echo "Downloaded to ${CI_HCL_PATH}"
|
||||||
merge_base_commit_build_args=""
|
|
||||||
|
if [[ ! -f "${CI_HCL_PATH}" ]]; then
|
||||||
|
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# build
|
setup_buildx_builder
|
||||||
docker buildx build --file docker/Dockerfile \
|
|
||||||
--build-arg max_jobs=16 \
|
resolve_parent_commit
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
export PARENT_COMMIT
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
print_bake_config
|
||||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
|
||||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
echo "--- :docker: Building ${TARGET}"
|
||||||
${merge_base_commit_build_args} \
|
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
|
||||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
|
||||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
echo "--- :white_check_mark: Build complete"
|
||||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
|
||||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
|
||||||
--push \
|
|
||||||
--target test \
|
|
||||||
--progress plain .
|
|
||||||
|
|||||||
@@ -3,8 +3,9 @@ steps:
|
|||||||
- label: ":docker: Build image"
|
- label: ":docker: Build image"
|
||||||
key: image-build
|
key: image-build
|
||||||
depends_on: []
|
depends_on: []
|
||||||
|
timeout_in_minutes: 600
|
||||||
commands:
|
commands:
|
||||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
|
||||||
retry:
|
retry:
|
||||||
automatic:
|
automatic:
|
||||||
- exit_status: -1 # Agent was lost
|
- exit_status: -1 # Agent was lost
|
||||||
@@ -40,7 +41,7 @@ steps:
|
|||||||
limit: 2
|
limit: 2
|
||||||
- exit_status: -10 # Agent was lost
|
- exit_status: -10 # Agent was lost
|
||||||
limit: 2
|
limit: 2
|
||||||
|
|
||||||
- label: ":docker: Build CPU arm64 image"
|
- label: ":docker: Build CPU arm64 image"
|
||||||
key: cpu-arm64-image-build
|
key: cpu-arm64-image-build
|
||||||
depends_on: []
|
depends_on: []
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ REPO=$2
|
|||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# authenticate with AWS ECR
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
|
||||||
# skip build if image already exists
|
# skip build if image already exists
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
|
||||||
echo "Image not found, proceeding with build..."
|
echo "Image not found, proceeding with build..."
|
||||||
else
|
else
|
||||||
echo "Image found"
|
echo "Image found"
|
||||||
@@ -24,13 +24,13 @@ fi
|
|||||||
# build
|
# build
|
||||||
docker build --file docker/Dockerfile.cpu \
|
docker build --file docker/Dockerfile.cpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||||
--build-arg VLLM_CPU_AMXBF16=true \
|
--build-arg VLLM_CPU_AMXBF16=true \
|
||||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
|
||||||
--target vllm-test \
|
--target vllm-test \
|
||||||
--progress plain .
|
--progress plain .
|
||||||
|
|
||||||
# push
|
# push
|
||||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ REPO=$2
|
|||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# authenticate with AWS ECR
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
|
||||||
# skip build if image already exists
|
# skip build if image already exists
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
|
||||||
echo "Image not found, proceeding with build..."
|
echo "Image not found, proceeding with build..."
|
||||||
else
|
else
|
||||||
echo "Image found"
|
echo "Image found"
|
||||||
@@ -24,10 +24,10 @@ fi
|
|||||||
# build
|
# build
|
||||||
docker build --file docker/Dockerfile.cpu \
|
docker build --file docker/Dockerfile.cpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
|
||||||
--target vllm-test \
|
--target vllm-test \
|
||||||
--progress plain .
|
--progress plain .
|
||||||
|
|
||||||
# push
|
# push
|
||||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ REPO=$2
|
|||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# authenticate with AWS ECR
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
|
||||||
# skip build if image already exists
|
# skip build if image already exists
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
|
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
|
||||||
echo "Image not found, proceeding with build..."
|
echo "Image not found, proceeding with build..."
|
||||||
else
|
else
|
||||||
echo "Image found"
|
echo "Image found"
|
||||||
@@ -25,10 +25,10 @@ fi
|
|||||||
docker build \
|
docker build \
|
||||||
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
|
||||||
--progress plain \
|
--progress plain \
|
||||||
https://github.com/vllm-project/vllm-gaudi.git
|
https://github.com/vllm-project/vllm-gaudi.git
|
||||||
|
|
||||||
# push
|
# push
|
||||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
|
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.695
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.447
|
||||||
|
limit: 1319
|
||||||
|
num_fewshot: 5
|
||||||
|
max_model_len: 262144
|
||||||
|
enforce_eager: false
|
||||||
|
apply_chat_template: true
|
||||||
|
fewshot_as_multiturn: true
|
||||||
|
trust_remote_code: true
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.7142
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.4579
|
||||||
|
env_vars:
|
||||||
|
VLLM_USE_FLASHINFER_MOE_FP8: "1"
|
||||||
|
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
|
||||||
|
limit: 1319
|
||||||
|
num_fewshot: 5
|
||||||
|
max_model_len: 262144
|
||||||
|
kv_cache_dtype: fp8
|
||||||
|
enforce_eager: false
|
||||||
|
apply_chat_template: true
|
||||||
|
fewshot_as_multiturn: true
|
||||||
|
trust_remote_code: true
|
||||||
@@ -1 +1,2 @@
|
|||||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
||||||
|
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
|
|||||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||||
Qwen2-57B-A14-Instruct.yaml
|
Qwen2-57B-A14-Instruct.yaml
|
||||||
DeepSeek-V2-Lite-Chat.yaml
|
DeepSeek-V2-Lite-Chat.yaml
|
||||||
|
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
|
|||||||
--tasks chartqa \
|
--tasks chartqa \
|
||||||
--batch_size auto \
|
--batch_size auto \
|
||||||
--apply_chat_template \
|
--apply_chat_template \
|
||||||
--limit $LIMIT
|
--limit "$LIMIT"
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
@@ -20,14 +20,11 @@ usage() {
|
|||||||
echo
|
echo
|
||||||
}
|
}
|
||||||
|
|
||||||
while getopts "m:b:l:f:t:" OPT; do
|
while getopts "m:l:f:t:" OPT; do
|
||||||
case ${OPT} in
|
case ${OPT} in
|
||||||
m )
|
m )
|
||||||
MODEL="$OPTARG"
|
MODEL="$OPTARG"
|
||||||
;;
|
;;
|
||||||
b )
|
|
||||||
BATCH_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
l )
|
l )
|
||||||
LIMIT="$OPTARG"
|
LIMIT="$OPTARG"
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -9,8 +9,10 @@ import json
|
|||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from importlib import util
|
from importlib import util
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import regex as re
|
||||||
|
|
||||||
pd.options.display.float_format = "{:.2f}".format
|
pd.options.display.float_format = "{:.2f}".format
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
plotly_found = util.find_spec("plotly.express") is not None
|
||||||
@@ -275,6 +277,131 @@ def _apply_two_decimals(
|
|||||||
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------
|
||||||
|
# Export helpers (Excel + CSV)
|
||||||
|
# -----------------------------
|
||||||
|
def _sanitize_sheet_name(name: str) -> str:
|
||||||
|
"""
|
||||||
|
Excel sheet constraints:
|
||||||
|
- max 31 chars
|
||||||
|
- cannot contain: : \ / ? * [ ]
|
||||||
|
- cannot be empty
|
||||||
|
"""
|
||||||
|
name = "sheet" if name is None else str(name)
|
||||||
|
name = re.sub(r"[:\\/?*\[\]]", "_", name)
|
||||||
|
name = name.strip().strip("'")
|
||||||
|
name = re.sub(r"\s+", " ", name)
|
||||||
|
if not name:
|
||||||
|
name = "sheet"
|
||||||
|
return name[:31]
|
||||||
|
|
||||||
|
|
||||||
|
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
|
||||||
|
d = dict(zip(group_cols, gkey_tuple))
|
||||||
|
model = d.get("Model", "model")
|
||||||
|
model_short = str(model).split("/")[-1]
|
||||||
|
ilen = d.get("Input Len", "")
|
||||||
|
olen = d.get("Output Len", "")
|
||||||
|
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
|
||||||
|
return _sanitize_sheet_name(f"{model_short}{lens}")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_tables_to_excel_sheet(
|
||||||
|
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
|
||||||
|
):
|
||||||
|
startrow = 0
|
||||||
|
for title, df in blocks:
|
||||||
|
pd.DataFrame([[title]]).to_excel(
|
||||||
|
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
|
||||||
|
)
|
||||||
|
startrow += 1
|
||||||
|
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
|
||||||
|
startrow += len(df) + 3
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_filename(s: str) -> str:
|
||||||
|
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
|
||||||
|
return s[:180] if len(s) > 180 else s
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------
|
||||||
|
# vLLM environment export helper
|
||||||
|
# -----------------------------
|
||||||
|
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
|
||||||
|
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- section headers as standalone lines (no ':' or '=')
|
||||||
|
- key-value lines like 'OS: Ubuntu ...'
|
||||||
|
- env var lines like 'HF_HOME=/data/hf'
|
||||||
|
"""
|
||||||
|
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
|
||||||
|
section = "General"
|
||||||
|
rows: list[dict] = []
|
||||||
|
|
||||||
|
def set_section(s: str):
|
||||||
|
nonlocal section
|
||||||
|
s = (s or "").strip()
|
||||||
|
if s:
|
||||||
|
section = s
|
||||||
|
|
||||||
|
for raw in lines:
|
||||||
|
stripped = raw.strip()
|
||||||
|
if not stripped:
|
||||||
|
continue
|
||||||
|
# divider lines like =====
|
||||||
|
if set(stripped) <= {"="}:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# section header heuristic: short standalone line
|
||||||
|
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
|
||||||
|
if stripped.lower().startswith("collecting environment information"):
|
||||||
|
continue
|
||||||
|
set_section(stripped)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# env var style: KEY=VALUE (and not a URL with :)
|
||||||
|
if "=" in stripped and ":" not in stripped:
|
||||||
|
k, v = stripped.split("=", 1)
|
||||||
|
k = k.strip()
|
||||||
|
v = v.strip()
|
||||||
|
if k:
|
||||||
|
rows.append({"Section": section, "Key": k, "Value": v})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# key: value
|
||||||
|
if ":" in stripped:
|
||||||
|
k, v = stripped.split(":", 1)
|
||||||
|
k = k.strip()
|
||||||
|
v = v.strip()
|
||||||
|
if k:
|
||||||
|
rows.append({"Section": section, "Key": k, "Value": v})
|
||||||
|
continue
|
||||||
|
|
||||||
|
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
|
||||||
|
|
||||||
|
|
||||||
|
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
|
||||||
|
"""Load vllm_env.txt next to the *original* input JSON file.
|
||||||
|
|
||||||
|
Note: when only one -f is provided, the script may split JSON into ./splits/...,
|
||||||
|
but vllm_env.txt typically lives next to the original benchmark_results.json.
|
||||||
|
"""
|
||||||
|
base_dir: Path | None = None
|
||||||
|
if getattr(args, "file", None):
|
||||||
|
base_dir = Path(args.file[0]).resolve().parent
|
||||||
|
elif files:
|
||||||
|
base_dir = Path(files[0]).resolve().parent
|
||||||
|
if base_dir is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
env_path = base_dir / "vllm_env.txt"
|
||||||
|
if not env_path.exists():
|
||||||
|
return None
|
||||||
|
df = _parse_vllm_env_txt(env_path)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Valid max concurrency summary helpers
|
# Valid max concurrency summary helpers
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
|
|
||||||
summary_df = pd.DataFrame(rows)
|
summary_df = pd.DataFrame(rows)
|
||||||
|
|
||||||
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
|
|
||||||
for c in summary_df.columns:
|
for c in summary_df.columns:
|
||||||
if c == "Configuration":
|
if c == "Configuration":
|
||||||
continue
|
continue
|
||||||
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
|
|
||||||
both_col = f"Max {conc_col} (Both)"
|
both_col = f"Max {conc_col} (Both)"
|
||||||
|
|
||||||
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
|
|
||||||
formatters = {}
|
formatters = {}
|
||||||
for c in summary_df.columns:
|
for c in summary_df.columns:
|
||||||
if c == "Configuration":
|
if c == "Configuration":
|
||||||
continue
|
continue
|
||||||
# default argument binds per-column formatter correctly
|
|
||||||
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
|
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
|
||||||
|
|
||||||
styler = summary_df.style.format(formatters)
|
styler = summary_df.style.format(formatters)
|
||||||
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
||||||
|
|
||||||
|
|
||||||
|
def build_valid_max_concurrency_summary_df(
|
||||||
|
tput_group_df: pd.DataFrame | None,
|
||||||
|
ttft_group_df: pd.DataFrame | None,
|
||||||
|
tpot_group_df: pd.DataFrame | None,
|
||||||
|
conc_col: str,
|
||||||
|
args,
|
||||||
|
) -> pd.DataFrame | None:
|
||||||
|
if ttft_group_df is None and tpot_group_df is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
ttft_cols = (
|
||||||
|
_config_value_columns(ttft_group_df, conc_col)
|
||||||
|
if ttft_group_df is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
tpot_cols = (
|
||||||
|
_config_value_columns(tpot_group_df, conc_col)
|
||||||
|
if tpot_group_df is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
tput_cols = (
|
||||||
|
_config_value_columns(tput_group_df, conc_col)
|
||||||
|
if tput_group_df is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
|
||||||
|
if ttft_group_df is not None and tpot_group_df is not None:
|
||||||
|
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
|
||||||
|
if tput_group_df is not None:
|
||||||
|
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
|
||||||
|
else:
|
||||||
|
cfg_cols = ttft_cols or tpot_cols
|
||||||
|
|
||||||
|
if not cfg_cols:
|
||||||
|
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for cfg in cfg_cols:
|
||||||
|
ttft_max = (
|
||||||
|
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
||||||
|
if ttft_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
tpot_max = (
|
||||||
|
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
||||||
|
if tpot_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
both = (
|
||||||
|
pd.NA
|
||||||
|
if (pd.isna(ttft_max) or pd.isna(tpot_max))
|
||||||
|
else min(ttft_max, tpot_max)
|
||||||
|
)
|
||||||
|
|
||||||
|
tput_at_both = (
|
||||||
|
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
|
||||||
|
if tput_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
ttft_at_both = (
|
||||||
|
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
|
||||||
|
if ttft_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
tpot_at_both = (
|
||||||
|
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
|
||||||
|
if tpot_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"Configuration": cfg,
|
||||||
|
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
||||||
|
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
||||||
|
f"Max {conc_col} (Both)": both,
|
||||||
|
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||||
|
"TTFT @ Both (ms)": ttft_at_both,
|
||||||
|
"TPOT @ Both (ms)": tpot_at_both,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
df = pd.DataFrame(rows)
|
||||||
|
for c in df.columns:
|
||||||
|
if c != "Configuration":
|
||||||
|
df[c] = pd.to_numeric(df[c], errors="coerce")
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Plot helper
|
# Plot helper
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
default=100.0,
|
default=100.0,
|
||||||
help="Reference limit for TPOT plots (ms)",
|
help="Reference limit for TPOT plots (ms)",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ---- NEW: export options ----
|
||||||
|
parser.add_argument(
|
||||||
|
"--excel-out",
|
||||||
|
type=str,
|
||||||
|
default="perf_comparison.xlsx",
|
||||||
|
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--csv-out-dir",
|
||||||
|
type=str,
|
||||||
|
default="",
|
||||||
|
help="If set, write per-group per-metric CSVs into this directory.",
|
||||||
|
)
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@@ -657,7 +885,6 @@ def maybe_write_plot(
|
|||||||
markers=True,
|
markers=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Ensure plot hover + y tick labels are also 2 decimals.
|
|
||||||
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
|
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
|
||||||
fig.update_yaxes(tickformat=".2f")
|
fig.update_yaxes(tickformat=".2f")
|
||||||
|
|
||||||
@@ -730,87 +957,151 @@ def write_report_group_first(
|
|||||||
for metric_label, (df, _) in metric_cache.items()
|
for metric_label, (df, _) in metric_cache.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
|
||||||
main_fh.write('<meta charset="utf-8">\n')
|
if csv_dir:
|
||||||
for gkey in group_keys:
|
csv_dir.mkdir(parents=True, exist_ok=True)
|
||||||
gkey_tuple = normalize_group_key(gkey)
|
|
||||||
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
|
||||||
sub_path = group_filename(gkey_tuple)
|
|
||||||
group_header = (
|
|
||||||
'<div style="font-size: 1.4em; font-weight: 700; '
|
|
||||||
'margin: 18px 0 10px 0;">'
|
|
||||||
f"{_html.escape(suffix)}"
|
|
||||||
"</div>\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
main_fh.write(group_header)
|
excel_path = args.excel_out or "perf_comparison.xlsx"
|
||||||
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
|
||||||
sub_fh.write('<meta charset="utf-8">\n')
|
# ---- Environment sheet (first) ----
|
||||||
sub_fh.write(group_header)
|
env_sheet = _sanitize_sheet_name("Environment")
|
||||||
tput_group_df = None
|
env_df = _load_env_df_for_inputs(args, files)
|
||||||
ttft_group_df = None
|
if env_df is None or env_df.empty:
|
||||||
tpot_group_df = None
|
pd.DataFrame(
|
||||||
conc_col = args.xaxis
|
[
|
||||||
|
{
|
||||||
|
"Section": "Environment",
|
||||||
|
"Key": "vllm_env.txt",
|
||||||
|
"Value": "NOT FOUND (or empty)",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
).to_excel(xw, sheet_name=env_sheet, index=False)
|
||||||
|
else:
|
||||||
|
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
|
||||||
|
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
||||||
|
main_fh.write('<meta charset="utf-8">\n')
|
||||||
|
for gkey in group_keys:
|
||||||
|
gkey_tuple = normalize_group_key(gkey)
|
||||||
|
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
||||||
|
sub_path = group_filename(gkey_tuple)
|
||||||
|
group_header = (
|
||||||
|
'<div style="font-size: 1.4em; font-weight: 700; '
|
||||||
|
'margin: 18px 0 10px 0;">'
|
||||||
|
f"{_html.escape(suffix)}"
|
||||||
|
"</div>\n"
|
||||||
|
)
|
||||||
|
|
||||||
for metric_label in plan.data_cols:
|
main_fh.write(group_header)
|
||||||
gb = metric_groupbys[metric_label]
|
|
||||||
df_sorted, raw_data_cols = metric_cache[metric_label]
|
|
||||||
|
|
||||||
try:
|
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
|
||||||
group_df = gb.get_group(gkey)
|
sheet_base = sheet
|
||||||
except KeyError:
|
dedup_i = 1
|
||||||
missing = (
|
while sheet in xw.sheets:
|
||||||
'<div style="font-size: 1.1em; font-weight: 600; '
|
dedup_i += 1
|
||||||
'margin: 10px 0;">'
|
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
|
||||||
f"{_html.escape(metric_label)} — missing for this group"
|
|
||||||
"</div>\n"
|
excel_blocks: list[tuple[str, pd.DataFrame]] = []
|
||||||
|
|
||||||
|
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
||||||
|
sub_fh.write('<meta charset="utf-8">\n')
|
||||||
|
sub_fh.write(group_header)
|
||||||
|
tput_group_df = None
|
||||||
|
ttft_group_df = None
|
||||||
|
tpot_group_df = None
|
||||||
|
conc_col = args.xaxis
|
||||||
|
|
||||||
|
for metric_label in plan.data_cols:
|
||||||
|
gb = metric_groupbys[metric_label]
|
||||||
|
df_sorted, raw_data_cols = metric_cache[metric_label]
|
||||||
|
|
||||||
|
try:
|
||||||
|
group_df = gb.get_group(gkey)
|
||||||
|
except KeyError:
|
||||||
|
missing = (
|
||||||
|
'<div style="font-size: 1.1em; font-weight: 600; '
|
||||||
|
'margin: 10px 0;">'
|
||||||
|
f"{_html.escape(metric_label)} — missing for this group"
|
||||||
|
"</div>\n"
|
||||||
|
)
|
||||||
|
main_fh.write(missing)
|
||||||
|
sub_fh.write(missing)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if conc_col not in group_df.columns:
|
||||||
|
conc_col = _find_concurrency_col(group_df)
|
||||||
|
|
||||||
|
mn = metric_label.lower().strip()
|
||||||
|
if "tok/s" in mn:
|
||||||
|
tput_group_df = group_df
|
||||||
|
elif "ttft" in mn:
|
||||||
|
ttft_group_df = group_df
|
||||||
|
elif mn in ("p99", "median") or "tpot" in mn:
|
||||||
|
tpot_group_df = group_df
|
||||||
|
|
||||||
|
display_group = group_df.drop(
|
||||||
|
columns=group_cols_canonical, errors="ignore"
|
||||||
)
|
)
|
||||||
|
|
||||||
main_fh.write(missing)
|
html = render_metric_table_html(
|
||||||
sub_fh.write(missing)
|
display_group, metric_label, suffix, args
|
||||||
continue
|
)
|
||||||
|
main_fh.write(html)
|
||||||
|
sub_fh.write(html)
|
||||||
|
|
||||||
if conc_col not in group_df.columns:
|
maybe_write_plot(
|
||||||
conc_col = _find_concurrency_col(group_df)
|
main_fh,
|
||||||
|
sub_fh,
|
||||||
|
group_df=group_df,
|
||||||
|
raw_data_cols=raw_data_cols,
|
||||||
|
metric_label=metric_label,
|
||||||
|
y_axis_col=y_axis_col,
|
||||||
|
args=args,
|
||||||
|
)
|
||||||
|
|
||||||
mn = metric_label.lower().strip()
|
excel_blocks.append(
|
||||||
if "tok/s" in mn:
|
(metric_label, display_group.reset_index(drop=True))
|
||||||
tput_group_df = group_df
|
)
|
||||||
elif "ttft" in mn:
|
if csv_dir:
|
||||||
ttft_group_df = group_df
|
fn = _safe_filename(
|
||||||
elif mn in ("p99", "median") or "tpot" in mn:
|
f"{sheet}__{metric_label}".replace(" ", "_").replace(
|
||||||
tpot_group_df = group_df
|
"/", "_"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||||
|
|
||||||
display_group = group_df.drop(
|
summary_html = build_valid_max_concurrency_summary_html(
|
||||||
columns=group_cols_canonical, errors="ignore"
|
tput_group_df=tput_group_df,
|
||||||
)
|
ttft_group_df=ttft_group_df,
|
||||||
|
tpot_group_df=tpot_group_df,
|
||||||
html = render_metric_table_html(
|
conc_col=conc_col,
|
||||||
display_group, metric_label, suffix, args
|
|
||||||
)
|
|
||||||
main_fh.write(html)
|
|
||||||
sub_fh.write(html)
|
|
||||||
|
|
||||||
maybe_write_plot(
|
|
||||||
main_fh,
|
|
||||||
sub_fh,
|
|
||||||
group_df=group_df,
|
|
||||||
raw_data_cols=raw_data_cols,
|
|
||||||
metric_label=metric_label,
|
|
||||||
y_axis_col=y_axis_col,
|
|
||||||
args=args,
|
args=args,
|
||||||
)
|
)
|
||||||
|
if summary_html:
|
||||||
|
main_fh.write(summary_html)
|
||||||
|
sub_fh.write(summary_html)
|
||||||
|
|
||||||
summary_html = build_valid_max_concurrency_summary_html(
|
summary_df = build_valid_max_concurrency_summary_df(
|
||||||
tput_group_df=tput_group_df,
|
tput_group_df=tput_group_df,
|
||||||
ttft_group_df=ttft_group_df,
|
ttft_group_df=ttft_group_df,
|
||||||
tpot_group_df=tpot_group_df,
|
tpot_group_df=tpot_group_df,
|
||||||
conc_col=conc_col,
|
conc_col=conc_col,
|
||||||
args=args,
|
args=args,
|
||||||
)
|
)
|
||||||
if summary_html:
|
if summary_df is not None:
|
||||||
main_fh.write(summary_html)
|
excel_blocks.append(
|
||||||
sub_fh.write(summary_html)
|
("Valid Max Concurrency Summary", summary_df)
|
||||||
|
)
|
||||||
|
if csv_dir:
|
||||||
|
fn = _safe_filename(
|
||||||
|
f"{sheet}__Valid_Max_Concurrency_Summary"
|
||||||
|
)
|
||||||
|
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||||
|
|
||||||
|
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
|
||||||
|
|
||||||
|
print(f"Wrote Excel: {excel_path}")
|
||||||
|
if csv_dir:
|
||||||
|
print(f"Wrote CSVs under: {csv_dir}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
@@ -393,7 +393,7 @@ if __name__ == "__main__":
|
|||||||
with open(results_folder / md_file, "w") as f:
|
with open(results_folder / md_file, "w") as f:
|
||||||
results = read_markdown(
|
results = read_markdown(
|
||||||
"../.buildkite/performance-benchmarks/"
|
"../.buildkite/performance-benchmarks/"
|
||||||
+ "performance-benchmarks-descriptions.md"
|
"performance-benchmarks-descriptions.md"
|
||||||
)
|
)
|
||||||
results = results.format(
|
results = results.format(
|
||||||
latency_tests_markdown_table=latency_md_table,
|
latency_tests_markdown_table=latency_md_table,
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script should be run inside the CI process
|
|
||||||
# This script assumes that we are already inside the vllm/ directory
|
# This script assumes that we are already inside the vllm/ directory
|
||||||
# Benchmarking results will be available inside vllm/benchmarks/results/
|
# Benchmarking results will be available inside vllm/benchmarks/results/
|
||||||
|
|
||||||
@@ -9,14 +7,19 @@
|
|||||||
set -x
|
set -x
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
|
# Environment-driven debug controls (like ON_CPU=1)
|
||||||
|
DRY_RUN="${DRY_RUN:-0}"
|
||||||
|
MODEL_FILTER="${MODEL_FILTER:-}"
|
||||||
|
DTYPE_FILTER="${DTYPE_FILTER:-}"
|
||||||
|
|
||||||
check_gpus() {
|
check_gpus() {
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
# check the number of GPUs and GPU type.
|
# check the number of GPUs and GPU type.
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
|
||||||
elif command -v amd-smi; then
|
elif command -v amd-smi; then
|
||||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
|
||||||
elif command -v hl-smi; then
|
elif command -v hl-smi; then
|
||||||
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
|
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
if [[ $gpu_count -gt 0 ]]; then
|
||||||
@@ -25,9 +28,9 @@ check_gpus() {
|
|||||||
echo "Need at least 1 GPU to run benchmarking."
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
declare -g arch_suffix=''
|
declare -g arch_suffix=''
|
||||||
|
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||||
elif command -v amd-smi; then
|
elif command -v amd-smi; then
|
||||||
@@ -44,7 +47,7 @@ check_cpus() {
|
|||||||
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
||||||
if [[ $numa_count -gt 0 ]]; then
|
if [[ $numa_count -gt 0 ]]; then
|
||||||
echo "NUMA found."
|
echo "NUMA found."
|
||||||
echo $numa_count
|
echo "$numa_count"
|
||||||
else
|
else
|
||||||
echo "Need at least 1 NUMA to run benchmarking."
|
echo "Need at least 1 NUMA to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
@@ -112,13 +115,12 @@ json2envs() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
wait_for_server() {
|
wait_for_server() {
|
||||||
# wait for vllm server to start
|
|
||||||
# return 1 if vllm server crashes
|
|
||||||
local timeout_val="1200"
|
local timeout_val="1200"
|
||||||
timeout "$timeout_val" bash -c '
|
timeout "$timeout_val" bash -c '
|
||||||
until curl -X POST localhost:8000/v1/completions; do
|
until curl -sf http://localhost:8000/v1/models >/dev/null; do
|
||||||
sleep 1
|
sleep 1
|
||||||
done' && return 0 || return 1
|
done
|
||||||
|
'
|
||||||
}
|
}
|
||||||
|
|
||||||
kill_processes_launched_by_current_bash() {
|
kill_processes_launched_by_current_bash() {
|
||||||
@@ -181,19 +183,20 @@ upload_to_buildkite() {
|
|||||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||||
}
|
}
|
||||||
|
|
||||||
run_latency_tests() {
|
run_benchmark_tests() {
|
||||||
# run latency tests using `vllm bench latency` command
|
# run benchmark tests using `vllm bench <test_type>` command
|
||||||
# $1: a json file specifying latency test cases
|
# $1: test type (latency or throughput)
|
||||||
|
# $2: a json file specifying test cases
|
||||||
|
|
||||||
local latency_test_file
|
local test_type=$1
|
||||||
latency_test_file=$1
|
local test_file=$2
|
||||||
|
|
||||||
# Iterate over latency tests
|
# Iterate over tests
|
||||||
jq -c '.[]' "$latency_test_file" | while read -r params; do
|
jq -c '.[]' "$test_file" | while read -r params; do
|
||||||
# get the test name, and append the GPU type back to it.
|
# get the test name, and append the GPU type back to it.
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
if [[ ! "$test_name" =~ ^latency_ ]]; then
|
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
|
||||||
echo "In latency-test.json, test_name must start with \"latency_\"."
|
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -204,15 +207,15 @@ run_latency_tests() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# get arguments
|
# get arguments
|
||||||
latency_params=$(echo "$params" | jq -r '.parameters')
|
bench_params=$(echo "$params" | jq -r '.parameters')
|
||||||
latency_args=$(json2args "$latency_params")
|
bench_args=$(json2args "$bench_params")
|
||||||
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||||
latency_envs=$(json2envs "$latency_environment_variables")
|
bench_envs=$(json2envs "$bench_environment_variables")
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [[ "$ON_CPU" == "1" ]]; then
|
||||||
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
|
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
|
||||||
world_size=$(($tp*$pp))
|
world_size=$(($tp*$pp))
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
@@ -225,118 +228,42 @@ run_latency_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
latency_command=" $latency_envs vllm bench latency \
|
bench_command=" $bench_envs vllm bench $test_type \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$latency_args"
|
$bench_args"
|
||||||
|
|
||||||
echo "Running test case $test_name"
|
echo "Running test case $test_name"
|
||||||
echo "Latency command: $latency_command"
|
echo "${test_type^} command: $bench_command"
|
||||||
|
|
||||||
# recoding benchmarking command ang GPU command
|
# recording benchmarking command and GPU command
|
||||||
jq_output=$(jq -n \
|
jq_output=$(jq -n \
|
||||||
--arg latency "$latency_command" \
|
--arg command "$bench_command" \
|
||||||
--arg gpu "$gpu_type" \
|
--arg gpu "$gpu_type" \
|
||||||
|
--arg test_type "$test_type" \
|
||||||
'{
|
'{
|
||||||
latency_command: $latency,
|
($test_type + "_command"): $command,
|
||||||
gpu_type: $gpu
|
gpu_type: $gpu
|
||||||
}')
|
}')
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||||
|
|
||||||
# run the benchmark
|
# run the benchmark
|
||||||
eval "$latency_command"
|
eval "$bench_command"
|
||||||
|
|
||||||
kill_gpu_processes
|
kill_gpu_processes
|
||||||
|
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
run_throughput_tests() {
|
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
|
||||||
# run throughput tests using `vllm bench throughput`
|
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
|
||||||
# $1: a json file specifying throughput test cases
|
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
|
||||||
|
|
||||||
local throughput_test_file
|
merge_serving_tests_stream() {
|
||||||
throughput_test_file=$1
|
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
|
||||||
|
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
|
||||||
# Iterate over throughput tests
|
local serving_test_file="$1"
|
||||||
jq -c '.[]' "$throughput_test_file" | while read -r params; do
|
# shellcheck disable=SC2016
|
||||||
# get the test name, and append the GPU type back to it.
|
local merged='
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
if [[ ! "$test_name" =~ ^throughput_ ]]; then
|
|
||||||
echo "In throughput-test.json, test_name must start with \"throughput_\"."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# get arguments
|
|
||||||
throughput_params=$(echo "$params" | jq -r '.parameters')
|
|
||||||
throughput_args=$(json2args "$throughput_params")
|
|
||||||
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
|
||||||
throughput_envs=$(json2envs "$throughput_environment_variables")
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
|
||||||
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
|
|
||||||
world_size=$(($tp*$pp))
|
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
throughput_command=" $throughput_envs vllm bench throughput \
|
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
|
||||||
$throughput_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
echo "Throughput command: $throughput_command"
|
|
||||||
# recoding benchmarking command ang GPU command
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg command "$throughput_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
'{
|
|
||||||
throughput_command: $command,
|
|
||||||
gpu_type: $gpu
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
|
||||||
|
|
||||||
# run the benchmark
|
|
||||||
eval "$throughput_command"
|
|
||||||
|
|
||||||
kill_gpu_processes
|
|
||||||
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `vllm bench serve` command
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
#
|
|
||||||
# Supported JSON formats:
|
|
||||||
# 1) Plain format: top-level array
|
|
||||||
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
|
||||||
#
|
|
||||||
# 2) Default parameters field + plain format tests
|
|
||||||
# {
|
|
||||||
# "defaults": { ... },
|
|
||||||
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
|
||||||
# }
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '
|
|
||||||
if type == "array" then
|
if type == "array" then
|
||||||
# Plain format: test cases array
|
# Plain format: test cases array
|
||||||
.[]
|
.[]
|
||||||
@@ -358,7 +285,50 @@ run_serving_tests() {
|
|||||||
else
|
else
|
||||||
error("Unsupported serving test file format: must be array or object with .tests")
|
error("Unsupported serving test file format: must be array or object with .tests")
|
||||||
end
|
end
|
||||||
' "$serving_test_file" | while read -r params; do
|
'
|
||||||
|
|
||||||
|
jq -c "$merged" "$serving_test_file" | \
|
||||||
|
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
|
||||||
|
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
|
||||||
|
select((($model|length)==0)
|
||||||
|
or ((.server_parameters.model // "") == $model)
|
||||||
|
or ((.client_parameters.model // "") == $model))
|
||||||
|
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
|
||||||
|
'
|
||||||
|
else
|
||||||
|
cat
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
run_serving_tests() {
|
||||||
|
# run serving tests using `vllm bench serve` command
|
||||||
|
# $1: a json file specifying serving test cases
|
||||||
|
#
|
||||||
|
# Supported JSON formats:
|
||||||
|
# 1) Plain format: top-level array
|
||||||
|
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||||
|
#
|
||||||
|
# 2) Default parameters field + plain format tests
|
||||||
|
# {
|
||||||
|
# "defaults": { ... },
|
||||||
|
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||||
|
# }
|
||||||
|
|
||||||
|
local serving_test_file
|
||||||
|
serving_test_file=$1
|
||||||
|
|
||||||
|
# In dry-run mode, if filters are provided but no tests match, fail fast.
|
||||||
|
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
|
||||||
|
local count
|
||||||
|
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
|
||||||
|
if [[ "$count" -eq 0 ]]; then
|
||||||
|
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Iterate over serving tests (merged + optional filtered stream)
|
||||||
|
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
|
||||||
# get the test name, and append the GPU type back to it.
|
# get the test name, and append the GPU type back to it.
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||||
@@ -427,7 +397,7 @@ run_serving_tests() {
|
|||||||
echo "Server command: $server_command"
|
echo "Server command: $server_command"
|
||||||
# support remote vllm server
|
# support remote vllm server
|
||||||
client_remote_args=""
|
client_remote_args=""
|
||||||
if [[ -z "${REMOTE_HOST}" ]]; then
|
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
bash -c "$server_command" &
|
bash -c "$server_command" &
|
||||||
server_pid=$!
|
server_pid=$!
|
||||||
# wait until the server is alive
|
# wait until the server is alive
|
||||||
@@ -438,6 +408,9 @@ run_serving_tests() {
|
|||||||
echo ""
|
echo ""
|
||||||
echo "vLLM failed to start within the timeout period."
|
echo "vLLM failed to start within the timeout period."
|
||||||
fi
|
fi
|
||||||
|
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
|
||||||
|
# dry-run: don't start server
|
||||||
|
echo "Dry Run."
|
||||||
else
|
else
|
||||||
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
|
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
|
||||||
if [[ ${REMOTE_PORT} ]]; then
|
if [[ ${REMOTE_PORT} ]]; then
|
||||||
@@ -447,34 +420,39 @@ run_serving_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# save the compilation mode and optimization level on the serving results
|
||||||
|
# whenever they are set
|
||||||
|
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
|
||||||
|
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
|
||||||
|
|
||||||
# iterate over different QPS
|
# iterate over different QPS
|
||||||
for qps in $qps_list; do
|
for qps in $qps_list; do
|
||||||
# remove the surrounding single quote from qps
|
# remove the surrounding single quote from qps
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
if [[ "$qps" == *"inf"* ]]; then
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
qps="inf"
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# iterate over different max_concurrency
|
# iterate over different max_concurrency
|
||||||
for max_concurrency in $max_concurrency_list; do
|
for max_concurrency in $max_concurrency_list; do
|
||||||
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
|
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||||
echo " new test name $new_test_name"
|
echo " new test name $new_test_name"
|
||||||
# pass the tensor parallel size to the client so that it can be displayed
|
# pass the tensor parallel size, the compilation mode, and the optimization
|
||||||
# on the benchmark dashboard
|
# level to the client so that they can be used on the benchmark dashboard
|
||||||
client_command="vllm bench serve \
|
client_command="vllm bench serve \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $RESULTS_FOLDER \
|
--result-dir $RESULTS_FOLDER \
|
||||||
--result-filename ${new_test_name}.json \
|
--result-filename ${new_test_name}.json \
|
||||||
--request-rate $qps \
|
--request-rate $qps \
|
||||||
--max-concurrency $max_concurrency \
|
--max-concurrency $max_concurrency \
|
||||||
--metadata "tensor_parallel_size=$tp" \
|
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
|
||||||
$client_args $client_remote_args "
|
$client_args $client_remote_args "
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
echo "Running test case $test_name with qps $qps"
|
||||||
echo "Client command: $client_command"
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
bash -c "$client_command"
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
|
bash -c "$client_command"
|
||||||
|
fi
|
||||||
|
|
||||||
# record the benchmarking commands
|
# record the benchmarking commands
|
||||||
jq_output=$(jq -n \
|
jq_output=$(jq -n \
|
||||||
@@ -492,12 +470,15 @@ run_serving_tests() {
|
|||||||
done
|
done
|
||||||
|
|
||||||
# clean up
|
# clean up
|
||||||
kill -9 $server_pid
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
kill_gpu_processes
|
kill -9 "$server_pid"
|
||||||
|
kill_gpu_processes
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
|
|
||||||
local ARCH
|
local ARCH
|
||||||
ARCH=''
|
ARCH=''
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [[ "$ON_CPU" == "1" ]]; then
|
||||||
@@ -507,7 +488,13 @@ main() {
|
|||||||
check_gpus
|
check_gpus
|
||||||
ARCH="$arch_suffix"
|
ARCH="$arch_suffix"
|
||||||
fi
|
fi
|
||||||
check_hf_token
|
|
||||||
|
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
|
||||||
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
|
check_hf_token
|
||||||
|
else
|
||||||
|
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
|
||||||
|
fi
|
||||||
|
|
||||||
# dependencies
|
# dependencies
|
||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
@@ -528,12 +515,18 @@ main() {
|
|||||||
|
|
||||||
# dump vllm info via vllm collect-env
|
# dump vllm info via vllm collect-env
|
||||||
env_output=$(vllm collect-env)
|
env_output=$(vllm collect-env)
|
||||||
|
|
||||||
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
||||||
|
|
||||||
# benchmarking
|
# benchmarking
|
||||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
|
||||||
|
|
||||||
|
if [[ "${DRY_RUN:-0}" == "1" ]]; then
|
||||||
|
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
||||||
|
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
|
||||||
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
|
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
|
||||||
|
|
||||||
# postprocess benchmarking results
|
# postprocess benchmarking results
|
||||||
|
|||||||
@@ -51,5 +51,56 @@
|
|||||||
"max-model-len": 256,
|
"max-model-len": 256,
|
||||||
"async-scheduling": ""
|
"async-scheduling": ""
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_deepseek_r1",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"max-model-len": 512,
|
||||||
|
"max-num-seqs": 128,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"gpu-memory-utilization": 0.95,
|
||||||
|
"enable_expert_parallel": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_qwen3_8b",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 128,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"async-scheduling": ""
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,41 @@
|
|||||||
|
{
|
||||||
|
"defaults": {
|
||||||
|
"qps_list": [
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"max_concurrency_list": [
|
||||||
|
32,
|
||||||
|
64,
|
||||||
|
128
|
||||||
|
],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"model": "jinaai/jina-embeddings-v3",
|
||||||
|
"trust_remote_code": ""
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "jinaai/jina-embeddings-v3",
|
||||||
|
"backend": "openai-embeddings",
|
||||||
|
"endpoint": "/v1/embeddings",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tests": [
|
||||||
|
{
|
||||||
|
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,283 @@
|
|||||||
|
{
|
||||||
|
"defaults": {
|
||||||
|
"qps_list": [
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tests": [
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-1.7B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-1.7B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-4B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-4B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "zai-org/glm-4-9b-hf",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "zai-org/glm-4-9b-hf",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "google/gemma-7b",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "google/gemma-7b",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -148,136 +148,6 @@
|
|||||||
"random-input-len": 2048,
|
"random-input-len": 2048,
|
||||||
"random-output-len": 128
|
"random-output-len": 128
|
||||||
}
|
}
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,5 +78,84 @@
|
|||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_deepseek_r1",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"load_format": "dummy",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 200,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 128,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"enable_expert_parallel": "",
|
||||||
|
"max-num-batched-tokens": 4096
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen3_8b",
|
||||||
|
"qps_list": [1, 4, 10, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"async-scheduling": ""
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen-3-8B",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -57,5 +57,67 @@
|
|||||||
"max-num-seqs": 512,
|
"max-num-seqs": 512,
|
||||||
"async-scheduling": ""
|
"async-scheduling": ""
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_deepseek_r1",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"num_prompts": 1000,
|
||||||
|
"backend": "vllm",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 384,
|
||||||
|
"async-scheduling": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"num_prompts": 1000,
|
||||||
|
"backend": "vllm",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 512,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"enable_expert_parallel": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_qwen3_8b",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"num_prompts": 1000,
|
||||||
|
"max-num-seqs": 512,
|
||||||
|
"backend": "vllm",
|
||||||
|
"async-scheduling": ""
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -176,23 +176,6 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build release image for x86_64 ROCm"
|
|
||||||
key: block-rocm-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build release image - x86_64 - ROCm"
|
|
||||||
depends_on: block-rocm-release-image-build
|
|
||||||
id: build-release-image-rocm
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
# Build base image first
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
|
||||||
# Build vLLM ROCm image using the base
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
|
||||||
|
|
||||||
- group: "Publish release images"
|
- group: "Publish release images"
|
||||||
key: "publish-release-images"
|
key: "publish-release-images"
|
||||||
steps:
|
steps:
|
||||||
@@ -274,14 +257,14 @@ steps:
|
|||||||
- input-release-version
|
- input-release-version
|
||||||
- build-wheels
|
- build-wheels
|
||||||
|
|
||||||
- label: "Upload release wheels to PyPI and GitHub"
|
- label: "Upload release wheels to PyPI"
|
||||||
depends_on:
|
depends_on:
|
||||||
- block-upload-release-wheels
|
- block-upload-release-wheels
|
||||||
id: upload-release-wheels
|
id: upload-release-wheels
|
||||||
agents:
|
agents:
|
||||||
queue: small_cpu_queue_postmerge
|
queue: small_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "bash .buildkite/scripts/upload-release-wheels.sh"
|
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ROCm Release Pipeline (x86_64 only)
|
# ROCm Release Pipeline (x86_64 only)
|
||||||
@@ -476,7 +459,7 @@ steps:
|
|||||||
S3_BUCKET: "vllm-wheels"
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
|
||||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||||
- label: ":python: Build vLLM ROCm Wheel"
|
- label: ":python: Build vLLM ROCm Wheel - x86_64"
|
||||||
id: build-rocm-vllm-wheel
|
id: build-rocm-vllm-wheel
|
||||||
depends_on:
|
depends_on:
|
||||||
- step: build-rocm-base-wheels
|
- step: build-rocm-base-wheels
|
||||||
@@ -638,9 +621,93 @@ steps:
|
|||||||
depends_on:
|
depends_on:
|
||||||
- step: upload-rocm-wheels
|
- step: upload-rocm-wheels
|
||||||
allow_failure: true
|
allow_failure: true
|
||||||
|
- step: input-release-version
|
||||||
|
allow_failure: true
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||||
env:
|
env:
|
||||||
S3_BUCKET: "vllm-wheels"
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
|
||||||
|
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
|
||||||
|
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
|
||||||
|
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
|
||||||
|
- block: "Generate Root Index for ROCm Wheels for Release"
|
||||||
|
key: block-generate-root-index-rocm-wheels
|
||||||
|
depends_on: upload-rocm-wheels
|
||||||
|
|
||||||
|
- label: ":package: Generate Root Index for ROCm Wheels for Release"
|
||||||
|
depends_on: block-generate-root-index-rocm-wheels
|
||||||
|
id: generate-root-index-rocm-wheels
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
|
||||||
|
env:
|
||||||
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
VARIANT: "rocm700"
|
||||||
|
|
||||||
|
# ROCm Job 5: Build ROCm Release Docker Image
|
||||||
|
- label: ":docker: Build release image - x86_64 - ROCm"
|
||||||
|
id: build-rocm-release-image
|
||||||
|
depends_on:
|
||||||
|
- step: build-rocm-base-wheels
|
||||||
|
allow_failure: false
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Login to ECR
|
||||||
|
aws ecr-public get-login-password --region us-east-1 | \
|
||||||
|
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||||
|
|
||||||
|
# Download Docker image from S3 (set by build-rocm-base-wheels)
|
||||||
|
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||||
|
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||||
|
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
|
||||||
|
mkdir -p artifacts/rocm-docker-image
|
||||||
|
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||||
|
|
||||||
|
# Load base Docker image
|
||||||
|
echo "Loading base Docker image..."
|
||||||
|
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||||
|
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||||
|
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||||
|
|
||||||
|
# Tag and push the base image to ECR
|
||||||
|
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||||
|
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||||
|
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
|
||||||
|
|
||||||
|
# Get GPU architectures from meta-data
|
||||||
|
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||||
|
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||||
|
|
||||||
|
# Build vLLM ROCm release image using cached base
|
||||||
|
DOCKER_BUILDKIT=1 docker build \
|
||||||
|
--build-arg max_jobs=16 \
|
||||||
|
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||||
|
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||||
|
--build-arg USE_SCCACHE=1 \
|
||||||
|
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||||
|
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||||
|
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||||
|
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
|
||||||
|
--target vllm-openai \
|
||||||
|
--progress plain \
|
||||||
|
-f docker/Dockerfile.rocm .
|
||||||
|
|
||||||
|
# Push to ECR
|
||||||
|
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
||||||
|
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
|||||||
@@ -11,28 +11,36 @@ fi
|
|||||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||||
To download the wheel (by commit):
|
To download the wheel (by commit):
|
||||||
\`\`\`
|
\`\`\`
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
|
||||||
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
(Optional) For CUDA 13.0:
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||||
|
|
||||||
|
(Optional) For CPU:
|
||||||
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
To download the wheel (by version):
|
|
||||||
\`\`\`
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
|
||||||
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
To download and upload the image:
|
To download and upload the image:
|
||||||
|
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
# Download images:
|
||||||
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
|
||||||
|
|
||||||
|
# Tag and push images:
|
||||||
|
|
||||||
|
## CUDA
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||||
@@ -40,22 +48,70 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
|||||||
docker push vllm/vllm-openai:latest-x86_64
|
docker push vllm/vllm-openai:latest-x86_64
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
||||||
|
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
||||||
|
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||||
|
docker push vllm/vllm-openai:latest-x86_64-cu130
|
||||||
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
docker push vllm/vllm-openai:latest-aarch64
|
docker push vllm/vllm-openai:latest-aarch64
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
||||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
|
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||||
docker push vllm/vllm-openai:latest-rocm
|
docker push vllm/vllm-openai:latest-aarch64-cu130
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||||
|
|
||||||
|
## ROCm
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
docker push vllm/vllm-openai-rocm:latest
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
docker push vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
|
||||||
|
## CPU
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
|
||||||
|
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
|
||||||
|
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||||
|
docker push vllm/vllm-openai-cpu:latest-x86_64
|
||||||
|
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
|
||||||
|
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
|
||||||
|
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||||
|
docker push vllm/vllm-openai-cpu:latest-arm64
|
||||||
|
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||||
|
|
||||||
|
# Create multi-arch manifest:
|
||||||
|
|
||||||
docker manifest rm vllm/vllm-openai:latest
|
docker manifest rm vllm/vllm-openai:latest
|
||||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
docker manifest push vllm/vllm-openai:latest
|
docker manifest push vllm/vllm-openai:latest
|
||||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
|
|
||||||
|
docker manifest rm vllm/vllm-openai:latest-cu130
|
||||||
|
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||||
|
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||||
|
docker manifest push vllm/vllm-openai:latest-cu130
|
||||||
|
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
||||||
|
|
||||||
|
docker manifest rm vllm/vllm-openai-cpu:latest || true
|
||||||
|
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
|
||||||
|
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||||
|
docker manifest push vllm/vllm-openai-cpu:latest
|
||||||
|
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
|
||||||
\`\`\`
|
\`\`\`
|
||||||
EOF
|
EOF
|
||||||
|
|||||||
@@ -3,25 +3,32 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
#
|
#
|
||||||
# Generate Buildkite annotation for ROCm wheel release
|
# Generate Buildkite annotation for ROCm wheel release
|
||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# Get build configuration from meta-data
|
# Get build configuration from meta-data
|
||||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
|
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
|
||||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||||
|
|
||||||
|
# TODO: Enable the nightly build for ROCm
|
||||||
|
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
||||||
|
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
|
||||||
|
if [ -z "${RELEASE_VERSION}" ]; then
|
||||||
|
RELEASE_VERSION="1.0.0.dev"
|
||||||
|
fi
|
||||||
|
|
||||||
# S3 URLs
|
# S3 URLs
|
||||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||||
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
|
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
|
||||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
|
|
||||||
|
|
||||||
|
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
|
||||||
|
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
|
||||||
|
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
|
||||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||||
## :rocm: ROCm Wheel Release
|
## ROCm Wheel and Docker Image Releases
|
||||||
|
|
||||||
### Build Configuration
|
### Build Configuration
|
||||||
| Setting | Value |
|
| Setting | Value |
|
||||||
|---------|-------|
|
|---------|-------|
|
||||||
@@ -34,41 +41,72 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
|
|||||||
### :package: Installation
|
### :package: Installation
|
||||||
|
|
||||||
**Install from this build (by commit):**
|
**Install from this build (by commit):**
|
||||||
\`\`\`bash
|
|
||||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
|
|
||||||
|
|
||||||
# Example:
|
\`\`\`bash
|
||||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
|
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||||
|
|
||||||
|
# Example for ROCm ${ROCM_VERSION}:
|
||||||
|
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
**Install from nightly (if published):**
|
**Install from nightly (if published):**
|
||||||
|
|
||||||
\`\`\`bash
|
\`\`\`bash
|
||||||
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
|
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
### :floppy_disk: Download Wheels Directly
|
### :floppy_disk: Download Wheels Directly
|
||||||
|
|
||||||
\`\`\`bash
|
\`\`\`bash
|
||||||
# List all ROCm wheels
|
# List all ROCm wheels
|
||||||
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
|
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
|
||||||
|
|
||||||
# Download specific wheels
|
# Download specific wheels
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
### :gear: Included Packages
|
### :gear: Included Packages
|
||||||
- **vllm**: vLLM with ROCm support
|
- **vllm**: vLLM with ROCm support
|
||||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||||
- **triton_rocm**: Triton built for ROCm
|
- **triton**: Triton
|
||||||
|
- **triton-kernels**: Triton kernels
|
||||||
- **torchvision**: TorchVision for ROCm PyTorch
|
- **torchvision**: TorchVision for ROCm PyTorch
|
||||||
|
- **torchaudio**: Torchaudio for ROCm PyTorch
|
||||||
- **amdsmi**: AMD SMI Python bindings
|
- **amdsmi**: AMD SMI Python bindings
|
||||||
|
- **amd_aiter**: Aiter for ROCm
|
||||||
|
- **flash-attn**: Flash Attention for ROCm
|
||||||
|
|
||||||
### :warning: Notes
|
### :warning: Notes
|
||||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||||
- Platform: Linux x86_64 only
|
- Platform: Linux x86_64 only
|
||||||
|
|
||||||
|
### :package: Docker Image Release
|
||||||
|
|
||||||
|
To download and upload the image:
|
||||||
|
|
||||||
|
\`\`\`
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
docker push vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
docker push vllm/vllm-openai-rocm:latest
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
EOF
|
EOF
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ case "${1:-}" in
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||||
exit 1
|
exit 1
|
||||||
@@ -110,9 +110,9 @@ case "${1:-}" in
|
|||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Downloaded wheels:"
|
echo "Downloaded wheels:"
|
||||||
ls -lh artifacts/rocm-base-wheels/
|
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
|
||||||
|
|
||||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||||
echo ""
|
echo ""
|
||||||
echo "Total: $WHEEL_COUNT wheels"
|
echo "Total: $WHEEL_COUNT wheels"
|
||||||
echo "========================================"
|
echo "========================================"
|
||||||
|
|||||||
205
.buildkite/scripts/check-ray-compatibility.sh
Normal file
205
.buildkite/scripts/check-ray-compatibility.sh
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
#
|
||||||
|
# Check if Ray LLM can generate lock files that are compatible with this
|
||||||
|
# version of vllm. Downloads Ray's requirement files and runs a full
|
||||||
|
# dependency resolution with the installed vllm's constraints to see if
|
||||||
|
# a valid lock file can be produced.
|
||||||
|
#
|
||||||
|
# See: https://github.com/vllm-project/vllm/issues/33599
|
||||||
|
|
||||||
|
set -eo pipefail
|
||||||
|
|
||||||
|
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
|
||||||
|
|
||||||
|
WORK_DIR=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$WORK_DIR"' EXIT
|
||||||
|
|
||||||
|
# Fetch all Ray requirement files used in the LLM depset pipeline
|
||||||
|
echo ">>> Fetching Ray requirement files"
|
||||||
|
RAY_FILES=(
|
||||||
|
"requirements.txt"
|
||||||
|
"requirements/cloud-requirements.txt"
|
||||||
|
"requirements/base-test-requirements.txt"
|
||||||
|
"requirements/llm/llm-requirements.txt"
|
||||||
|
"requirements/llm/llm-test-requirements.txt"
|
||||||
|
)
|
||||||
|
for FILE in "${RAY_FILES[@]}"; do
|
||||||
|
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
|
||||||
|
echo " ${FILE}"
|
||||||
|
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
|
||||||
|
done
|
||||||
|
|
||||||
|
# Extract installed vllm deps
|
||||||
|
echo ">>> Extracting installed vllm dependency constraints"
|
||||||
|
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
|
||||||
|
"""Write out the installed vllm's dependencies as pip constraint lines.
|
||||||
|
|
||||||
|
Ray uses vllm[audio], so audio-extra deps are included with their extra
|
||||||
|
markers stripped. The resolver cannot evaluate extra markers for a
|
||||||
|
package that is not itself being resolved from an index, so we activate
|
||||||
|
them manually here.
|
||||||
|
"""
|
||||||
|
import importlib.metadata
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
out_path = sys.argv[1]
|
||||||
|
raw_reqs = importlib.metadata.requires("vllm") or []
|
||||||
|
|
||||||
|
# Ray uses vllm[audio] – activate that extra.
|
||||||
|
ACTIVE_EXTRAS = {"audio"}
|
||||||
|
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for r in raw_reqs:
|
||||||
|
if ";" not in r:
|
||||||
|
# Unconditional dep — always include.
|
||||||
|
lines.append(r.strip())
|
||||||
|
continue
|
||||||
|
|
||||||
|
req_part, _, marker_part = r.partition(";")
|
||||||
|
marker_part = marker_part.strip()
|
||||||
|
|
||||||
|
extra_matches = EXTRA_RE.findall(marker_part)
|
||||||
|
if not extra_matches:
|
||||||
|
# Non-extra marker (python_version, etc.) — keep as-is.
|
||||||
|
lines.append(r.strip())
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not ACTIVE_EXTRAS.intersection(extra_matches):
|
||||||
|
continue # Skip inactive extras (tensorizer, bench, …).
|
||||||
|
|
||||||
|
# Strip the extra== conditions but keep any remaining markers
|
||||||
|
# (e.g. python_version).
|
||||||
|
cleaned = EXTRA_RE.sub("", marker_part)
|
||||||
|
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
|
||||||
|
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
|
||||||
|
|
||||||
|
if cleaned:
|
||||||
|
lines.append(f"{req_part.strip()} ; {cleaned}")
|
||||||
|
else:
|
||||||
|
lines.append(req_part.strip())
|
||||||
|
|
||||||
|
with open(out_path, "w") as f:
|
||||||
|
for line in lines:
|
||||||
|
f.write(line + "\n")
|
||||||
|
|
||||||
|
print(f"Wrote {len(lines)} constraints to {out_path}")
|
||||||
|
PYEOF
|
||||||
|
|
||||||
|
echo ">>> Installed vllm deps (first 20 lines):"
|
||||||
|
head -20 "${WORK_DIR}/vllm-constraints.txt"
|
||||||
|
|
||||||
|
# Remove Ray's vllm pin — the installed vllm's transitive deps
|
||||||
|
# (written above) replace it in the resolution. vllm itself cannot
|
||||||
|
# be resolved from PyPI for in-development versions, so we test
|
||||||
|
# whether Ray's requirements can coexist with vllm's dependency
|
||||||
|
# constraints instead.
|
||||||
|
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
|
||||||
|
|
||||||
|
# Install uv if needed
|
||||||
|
if ! command -v uv &>/dev/null; then
|
||||||
|
echo ">>> Installing uv"
|
||||||
|
pip install uv -q
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Resolve: given vllm's constraints, can Ray compile a lock file?
|
||||||
|
#
|
||||||
|
# vllm's dependency constraints are the fixed side — Ray is flexible and
|
||||||
|
# can regenerate its lock files. We pass vllm's constraints via -c so
|
||||||
|
# the resolver treats them as non-negotiable bounds, then check whether
|
||||||
|
# Ray's own requirements can still be satisfied within those bounds.
|
||||||
|
echo ""
|
||||||
|
echo "============================================================"
|
||||||
|
echo ">>> Resolving: Can Ray generate compatible lock files?"
|
||||||
|
echo "============================================================"
|
||||||
|
|
||||||
|
set +e
|
||||||
|
uv pip compile \
|
||||||
|
"${WORK_DIR}/requirements.txt" \
|
||||||
|
"${WORK_DIR}/cloud-requirements.txt" \
|
||||||
|
"${WORK_DIR}/base-test-requirements.txt" \
|
||||||
|
"${WORK_DIR}/llm-requirements.txt" \
|
||||||
|
"${WORK_DIR}/llm-test-requirements.txt" \
|
||||||
|
-c "${WORK_DIR}/vllm-constraints.txt" \
|
||||||
|
--python-version 3.12 \
|
||||||
|
--python-platform x86_64-manylinux_2_31 \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu129 \
|
||||||
|
--index-strategy unsafe-best-match \
|
||||||
|
--unsafe-package setuptools \
|
||||||
|
--unsafe-package ray \
|
||||||
|
--no-header \
|
||||||
|
-o "${WORK_DIR}/resolved.txt" \
|
||||||
|
2>&1
|
||||||
|
EXIT_CODE=$?
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=========================================="
|
||||||
|
if [ $EXIT_CODE -eq 0 ]; then
|
||||||
|
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
|
||||||
|
echo ""
|
||||||
|
echo "Key resolved versions:"
|
||||||
|
grep -E '^(protobuf|torch|numpy|transformers)==' \
|
||||||
|
"${WORK_DIR}/resolved.txt" | sort || true
|
||||||
|
echo "=========================================="
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
|
||||||
|
echo "This means a fundamental dependency conflict exists that Ray"
|
||||||
|
echo "cannot resolve by regenerating its lock files."
|
||||||
|
echo "See: https://github.com/vllm-project/vllm/issues/33599"
|
||||||
|
echo "=========================================="
|
||||||
|
|
||||||
|
# Buildkite annotation
|
||||||
|
if [ -f /usr/bin/buildkite-agent ]; then
|
||||||
|
buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
|
||||||
|
### :warning: Ray Dependency Compatibility Warning
|
||||||
|
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
|
||||||
|
Ray would not be able to regenerate its lock files to accommodate this vllm version.
|
||||||
|
|
||||||
|
Please check the **Ray Dependency Compatibility Check** step logs for details.
|
||||||
|
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
|
||||||
|
EOF
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Notify Slack if webhook is configured.
|
||||||
|
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
||||||
|
echo ">>> Sending Slack notification"
|
||||||
|
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
||||||
|
# shellcheck disable=SC2016
|
||||||
|
PAYLOAD=$(python3 -c '
|
||||||
|
import json, os, sys
|
||||||
|
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
|
||||||
|
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
|
||||||
|
url = os.getenv("BUILDKITE_BUILD_URL", "#")
|
||||||
|
data = {
|
||||||
|
"text": ":warning: Ray Dependency Compatibility Check Failed",
|
||||||
|
"blocks": [{
|
||||||
|
"type": "section",
|
||||||
|
"text": {
|
||||||
|
"type": "mrkdwn",
|
||||||
|
"text": (
|
||||||
|
"*:warning: Ray Dependency Compatibility Check Failed*\n"
|
||||||
|
f"PR #{pr} on branch `{branch}` introduces dependencies "
|
||||||
|
f"that cannot be resolved with Ray'\''s requirements.\n"
|
||||||
|
f"<{url}|View Build>"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
}],
|
||||||
|
}
|
||||||
|
print(json.dumps(data))
|
||||||
|
')
|
||||||
|
|
||||||
|
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
|
||||||
|
-H 'Content-type: application/json' \
|
||||||
|
-d "$PAYLOAD")
|
||||||
|
echo " Slack webhook response: $HTTP_CODE"
|
||||||
|
else
|
||||||
|
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
exit 1
|
||||||
@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
|||||||
|
|
||||||
# Store PR data in a temp file
|
# Store PR data in a temp file
|
||||||
PR_DATA=$(mktemp)
|
PR_DATA=$(mktemp)
|
||||||
trap "rm -f $PR_DATA" EXIT
|
trap 'rm -f "$PR_DATA"' EXIT
|
||||||
|
|
||||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||||
--limit 1000 \
|
--limit 1000 \
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
|||||||
|
|
||||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||||
"""
|
"""
|
||||||
Generate project list HTML content linking to each project & variant sub-directory.
|
Generate project list HTML content linking to each project & variant subdirectory.
|
||||||
"""
|
"""
|
||||||
href_tags = []
|
href_tags = []
|
||||||
for name in sorted(subdir_names):
|
for name in sorted(subdir_names):
|
||||||
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
|
|||||||
comment (str | None): Optional comment to include in the generated HTML files.
|
comment (str | None): Optional comment to include in the generated HTML files.
|
||||||
|
|
||||||
First, parse all wheel files to extract metadata.
|
First, parse all wheel files to extract metadata.
|
||||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
|
||||||
The index for the default variant (if any) is generated in the root index directory.
|
The index for the default variant (if any) is generated in the root index directory.
|
||||||
|
|
||||||
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
||||||
is purely a copy of the corresponding variant index, with only the links adjusted.
|
is purely a copy of the corresponding variant index, with only the links adjusted.
|
||||||
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
||||||
|
|
||||||
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
|
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
|
||||||
as the default variant index, but the links are adjusted accordingly.
|
as the default variant index, but the links are adjusted accordingly.
|
||||||
|
|
||||||
Index directory structure:
|
Index directory structure:
|
||||||
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
||||||
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
|
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
|
||||||
vllm/
|
vllm/
|
||||||
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
||||||
metadata.json # machine-readable metadata for all wheels in this package
|
metadata.json # machine-readable metadata for all wheels in this package
|
||||||
cpu/ # cpu variant sub-directory
|
cpu/ # cpu variant subdirectory
|
||||||
index.html
|
index.html
|
||||||
vllm/
|
vllm/
|
||||||
index.html
|
index.html
|
||||||
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
|
|||||||
vllm/
|
vllm/
|
||||||
index.html
|
index.html
|
||||||
metadata.json
|
metadata.json
|
||||||
cu130/ # cu130 variant sub-directory
|
cu130/ # cu130 variant subdirectory
|
||||||
index.html
|
index.html
|
||||||
vllm/
|
vllm/
|
||||||
index.html
|
index.html
|
||||||
|
|||||||
@@ -1,25 +1,57 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script runs test inside the corresponding ROCm docker container.
|
# This script runs tests inside the corresponding ROCm docker container.
|
||||||
|
# It handles both single-node and multi-node test configurations.
|
||||||
|
#
|
||||||
|
# Multi-node detection: Instead of matching on fragile group names, we detect
|
||||||
|
# multi-node jobs structurally by looking for the bracket command syntax
|
||||||
|
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
|
||||||
|
#
|
||||||
|
###############################################################################
|
||||||
|
# QUOTING / COMMAND PASSING
|
||||||
|
#
|
||||||
|
# Passing commands as positional arguments ($*) is fragile when the command
|
||||||
|
# string itself contains double quotes, e.g.:
|
||||||
|
#
|
||||||
|
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
|
||||||
|
#
|
||||||
|
# The outer shell resolves the nested quotes *before* this script runs, so
|
||||||
|
# the script receives mangled input it cannot fully recover.
|
||||||
|
#
|
||||||
|
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
|
||||||
|
#
|
||||||
|
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
|
||||||
|
# bash run-amd-test.sh
|
||||||
|
#
|
||||||
|
# Single-quoted assignment preserves all inner double quotes verbatim.
|
||||||
|
# The $* path is kept for backward compatibility but callers should migrate.
|
||||||
|
###############################################################################
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
# Export Python path
|
# Export Python path
|
||||||
export PYTHONPATH=".."
|
export PYTHONPATH=".."
|
||||||
|
|
||||||
# Print ROCm version
|
###############################################################################
|
||||||
echo "--- Confirming Clean Initial State"
|
# Helper Functions
|
||||||
while true; do
|
###############################################################################
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- ROCm info"
|
wait_for_clean_gpus() {
|
||||||
rocminfo
|
local timeout=${1:-300}
|
||||||
|
local start=$SECONDS
|
||||||
|
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
|
||||||
|
while true; do
|
||||||
|
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||||
|
echo "GPUs state is \"clean\""
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
if (( SECONDS - start >= timeout )); then
|
||||||
|
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
# cleanup older docker images
|
|
||||||
cleanup_docker() {
|
cleanup_docker() {
|
||||||
# Get Docker's root directory
|
# Get Docker's root directory
|
||||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
@@ -28,15 +60,12 @@ cleanup_docker() {
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "Docker root directory: $docker_root"
|
echo "Docker root directory: $docker_root"
|
||||||
# Check disk usage of the filesystem where Docker's root directory is located
|
|
||||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
# Define the threshold
|
|
||||||
threshold=70
|
threshold=70
|
||||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
|
||||||
docker image prune -f
|
docker image prune -f
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
|
||||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
echo "Docker images and volumes cleanup completed."
|
echo "Docker images and volumes cleanup completed."
|
||||||
else
|
else
|
||||||
@@ -44,201 +73,432 @@ cleanup_docker() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Call the cleanup docker function
|
cleanup_network() {
|
||||||
|
local max_nodes=${NUM_NODES:-2}
|
||||||
|
for node in $(seq 0 $((max_nodes - 1))); do
|
||||||
|
if docker ps -a -q -f name="node${node}" | grep -q .; then
|
||||||
|
docker stop "node${node}" || true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if docker network ls | grep -q docker-net; then
|
||||||
|
docker network rm docker-net || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
is_multi_node() {
|
||||||
|
local cmds="$1"
|
||||||
|
# Primary signal: NUM_NODES environment variable set by the pipeline
|
||||||
|
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
# Fallback: detect the bracket syntax structurally
|
||||||
|
# Pattern: [...] && [...] (per-node command arrays)
|
||||||
|
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
handle_pytest_exit() {
|
||||||
|
local exit_code=$1
|
||||||
|
if [ "$exit_code" -eq 5 ]; then
|
||||||
|
echo "Pytest exit code 5 (no tests collected) - treating as success."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
exit "$exit_code"
|
||||||
|
}
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Pytest marker/keyword re-quoting
|
||||||
|
#
|
||||||
|
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
|
||||||
|
# quotes around multi-word pytest -m/-k expressions get stripped:
|
||||||
|
# pytest -v -s -m 'not cpu_test' v1/core
|
||||||
|
# becomes:
|
||||||
|
# pytest -v -s -m not cpu_test v1/core
|
||||||
|
#
|
||||||
|
# pytest then interprets "cpu_test" as a file path, not part of the marker.
|
||||||
|
#
|
||||||
|
# This function detects unquoted expressions after -m/-k and re-quotes them
|
||||||
|
# by collecting tokens until a recognizable boundary is reached:
|
||||||
|
# - test path (contains '/')
|
||||||
|
# - test file (ends with '.py')
|
||||||
|
# - another pytest flag (--xxx or -x single-char flags)
|
||||||
|
# - command separator (&& || ; |)
|
||||||
|
# - environment variable assignment (FOO=bar)
|
||||||
|
#
|
||||||
|
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
|
||||||
|
# unquoted since they have no spaces and work fine.
|
||||||
|
#
|
||||||
|
# Already-quoted expressions (containing literal single quotes) are passed
|
||||||
|
# through untouched to avoid double-quoting values injected by
|
||||||
|
# apply_rocm_test_overrides.
|
||||||
|
#
|
||||||
|
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
|
||||||
|
# double-quotes stripped by the calling shell (see header comment).
|
||||||
|
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
|
||||||
|
###############################################################################
|
||||||
|
re_quote_pytest_markers() {
|
||||||
|
local input="$1"
|
||||||
|
local output=""
|
||||||
|
local collecting=false
|
||||||
|
local marker_buf=""
|
||||||
|
|
||||||
|
# Strip backslash-newline continuations, then flatten remaining newlines
|
||||||
|
local flat="${input//$'\\\n'/ }"
|
||||||
|
flat="${flat//$'\n'/ }"
|
||||||
|
|
||||||
|
# Disable globbing to prevent *.py etc. from expanding during read -ra
|
||||||
|
local restore_glob
|
||||||
|
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
|
||||||
|
set -o noglob
|
||||||
|
local -a words
|
||||||
|
read -ra words <<< "$flat"
|
||||||
|
eval "$restore_glob"
|
||||||
|
|
||||||
|
for word in "${words[@]}"; do
|
||||||
|
if $collecting; then
|
||||||
|
# If the token we're about to collect already contains a literal
|
||||||
|
# single quote, the expression was already quoted upstream.
|
||||||
|
# Flush and stop collecting.
|
||||||
|
if [[ "$word" == *"'"* ]]; then
|
||||||
|
if [[ -n "$marker_buf" ]]; then
|
||||||
|
# Should not normally happen (partial buf + quote), flush raw
|
||||||
|
output+="${marker_buf} "
|
||||||
|
marker_buf=""
|
||||||
|
fi
|
||||||
|
output+="${word} "
|
||||||
|
collecting=false
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
local is_boundary=false
|
||||||
|
case "$word" in
|
||||||
|
# Line-continuation artifact
|
||||||
|
"\\")
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Command separators
|
||||||
|
"&&"|"||"|";"|"|")
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Long flags (--ignore, --shard-id, etc.)
|
||||||
|
--*)
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
|
||||||
|
# like "not" which don't start with "-". Also skip -k/-m which
|
||||||
|
# would start a new marker (handled below).
|
||||||
|
-[a-zA-Z])
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Test path (contains /)
|
||||||
|
*/*)
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Test file (ends with .py, possibly with ::method)
|
||||||
|
*.py|*.py::*)
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Environment variable assignment preceding a command (FOO=bar)
|
||||||
|
*=*)
|
||||||
|
# Only treat as boundary if it looks like VAR=value, not
|
||||||
|
# pytest filter expressions like num_gpus=2 inside markers
|
||||||
|
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
|
||||||
|
is_boundary=true
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if $is_boundary; then
|
||||||
|
# Flush the collected marker expression
|
||||||
|
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||||
|
output+="'${marker_buf}' "
|
||||||
|
else
|
||||||
|
output+="${marker_buf} "
|
||||||
|
fi
|
||||||
|
collecting=false
|
||||||
|
marker_buf=""
|
||||||
|
# Check if this boundary word itself starts a new -m/-k
|
||||||
|
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||||
|
output+="${word} "
|
||||||
|
collecting=true
|
||||||
|
# Drop stray backslash tokens silently
|
||||||
|
elif [[ "$word" == "\\" ]]; then
|
||||||
|
:
|
||||||
|
else
|
||||||
|
output+="${word} "
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# Accumulate into marker buffer
|
||||||
|
if [[ -n "$marker_buf" ]]; then
|
||||||
|
marker_buf+=" ${word}"
|
||||||
|
else
|
||||||
|
marker_buf="${word}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||||
|
output+="${word} "
|
||||||
|
collecting=true
|
||||||
|
marker_buf=""
|
||||||
|
else
|
||||||
|
output+="${word} "
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Flush any trailing marker expression (marker at end of command)
|
||||||
|
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||||
|
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||||
|
output+="'${marker_buf}'"
|
||||||
|
else
|
||||||
|
output+="${marker_buf}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "${output% }"
|
||||||
|
}
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# ROCm-specific pytest command rewrites
|
||||||
|
#
|
||||||
|
# These apply ignore flags and environment overrides for tests that are not
|
||||||
|
# yet supported or behave differently on ROCm hardware. Kept as a single
|
||||||
|
# function so new exclusions are easy to add in one place.
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
apply_rocm_test_overrides() {
|
||||||
|
local cmds="$1"
|
||||||
|
|
||||||
|
# --- Model registry filter ---
|
||||||
|
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||||
|
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- LoRA: disable custom paged attention ---
|
||||||
|
if [[ $cmds == *"pytest -v -s lora"* ]]; then
|
||||||
|
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- Kernel ignores ---
|
||||||
|
if [[ $cmds == *" kernels/core"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||||
|
--ignore=kernels/core/test_permute_cols.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/attention"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/attention/test_attention_selector.py \
|
||||||
|
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||||
|
--ignore=kernels/attention/test_flash_attn.py \
|
||||||
|
--ignore=kernels/attention/test_flashinfer.py \
|
||||||
|
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||||
|
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||||
|
--ignore=kernels/attention/test_mha_attn.py \
|
||||||
|
--ignore=kernels/attention/test_lightning_attn.py \
|
||||||
|
--ignore=kernels/attention/test_attention.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/quantization"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/quantization/test_int8_quant.py \
|
||||||
|
--ignore=kernels/quantization/test_machete_mm.py \
|
||||||
|
--ignore=kernels/quantization/test_block_fp8.py \
|
||||||
|
--ignore=kernels/quantization/test_block_int8.py \
|
||||||
|
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||||
|
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||||
|
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/mamba"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||||
|
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||||
|
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/moe"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/moe/test_moe.py \
|
||||||
|
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||||
|
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- Entrypoint ignores ---
|
||||||
|
if [[ $cmds == *" entrypoints/openai "* ]]; then
|
||||||
|
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
|
||||||
|
--ignore=entrypoints/openai/test_audio.py \
|
||||||
|
--ignore=entrypoints/openai/test_shutdown.py \
|
||||||
|
--ignore=entrypoints/openai/test_completion.py \
|
||||||
|
--ignore=entrypoints/openai/test_models.py \
|
||||||
|
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||||
|
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||||
|
--ignore=entrypoints/openai/test_root_path.py \
|
||||||
|
--ignore=entrypoints/openai/test_tokenization.py \
|
||||||
|
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" entrypoints/llm "* ]]; then
|
||||||
|
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
|
||||||
|
--ignore=entrypoints/llm/test_chat.py \
|
||||||
|
--ignore=entrypoints/llm/test_accuracy.py \
|
||||||
|
--ignore=entrypoints/llm/test_init.py \
|
||||||
|
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Clean up escaped newlines from --ignore appends
|
||||||
|
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
|
||||||
|
|
||||||
|
echo "$cmds"
|
||||||
|
}
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Main
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
# --- GPU initialization ---
|
||||||
|
echo "--- Confirming Clean Initial State"
|
||||||
|
wait_for_clean_gpus
|
||||||
|
|
||||||
|
echo "--- ROCm info"
|
||||||
|
rocminfo
|
||||||
|
|
||||||
|
# --- Docker housekeeping ---
|
||||||
cleanup_docker
|
cleanup_docker
|
||||||
|
|
||||||
echo "--- Resetting GPUs"
|
echo "--- Resetting GPUs"
|
||||||
|
|
||||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||||
|
wait_for_clean_gpus
|
||||||
|
|
||||||
while true; do
|
# --- Pull test image ---
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- Pulling container"
|
echo "--- Pulling container"
|
||||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
docker pull "${image_name}"
|
docker pull "${image_name}"
|
||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# --- Prepare commands ---
|
||||||
echo "--- Running container"
|
echo "--- Running container"
|
||||||
|
|
||||||
HF_CACHE="$(realpath ~)/huggingface"
|
HF_CACHE="$(realpath ~)/huggingface"
|
||||||
mkdir -p "${HF_CACHE}"
|
mkdir -p "${HF_CACHE}"
|
||||||
HF_MOUNT="/root/.cache/huggingface"
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
|
||||||
commands=$@
|
# ---- Command source selection ----
|
||||||
echo "Commands:$commands"
|
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
|
||||||
|
# Fall back to $* for backward compatibility, but warn that inner
|
||||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
|
# double-quotes will have been stripped by the calling shell.
|
||||||
|
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
|
||||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
commands="${VLLM_TEST_COMMANDS}"
|
||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
|
||||||
|
else
|
||||||
|
commands="$*"
|
||||||
|
if [[ -z "$commands" ]]; then
|
||||||
|
echo "Error: No test commands provided." >&2
|
||||||
|
echo "Usage:" >&2
|
||||||
|
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
|
||||||
|
echo " Legacy: bash $0 \"commands here\"" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Commands sourced from positional args (legacy mode)"
|
||||||
|
echo "WARNING: Inner double-quotes in the command string may have been"
|
||||||
|
echo " stripped by the calling shell. If you see syntax errors, switch to:"
|
||||||
|
echo " export VLLM_TEST_COMMANDS='your commands here'"
|
||||||
|
echo " bash $0"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
|
echo "Raw commands: $commands"
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
# Fix quoting before ROCm overrides (so overrides see correct structure)
|
||||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
commands=$(re_quote_pytest_markers "$commands")
|
||||||
fi
|
echo "After re-quoting: $commands"
|
||||||
|
|
||||||
#ignore certain kernels tests
|
commands=$(apply_rocm_test_overrides "$commands")
|
||||||
if [[ $commands == *" kernels/core"* ]]; then
|
echo "Final commands: $commands"
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
|
||||||
--ignore=kernels/core/test_permute_cols.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/attention"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/attention/test_attention_selector.py \
|
|
||||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
|
||||||
--ignore=kernels/attention/test_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_flashinfer.py \
|
|
||||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
|
||||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_mha_attn.py \
|
|
||||||
--ignore=kernels/attention/test_lightning_attn.py \
|
|
||||||
--ignore=kernels/attention/test_attention.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/quantization/test_int8_quant.py \
|
|
||||||
--ignore=kernels/quantization/test_machete_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_block_fp8.py \
|
|
||||||
--ignore=kernels/quantization/test_block_int8.py \
|
|
||||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
|
||||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
|
||||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
|
||||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/moe"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/moe/test_moe.py \
|
|
||||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
|
||||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/openai tests
|
|
||||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
|
||||||
--ignore=entrypoints/openai/test_audio.py \
|
|
||||||
--ignore=entrypoints/openai/test_shutdown.py \
|
|
||||||
--ignore=entrypoints/openai/test_completion.py \
|
|
||||||
--ignore=entrypoints/openai/test_models.py \
|
|
||||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
|
||||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
|
||||||
--ignore=entrypoints/openai/test_root_path.py \
|
|
||||||
--ignore=entrypoints/openai/test_tokenization.py \
|
|
||||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/llm tests
|
|
||||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
|
||||||
--ignore=entrypoints/llm/test_chat.py \
|
|
||||||
--ignore=entrypoints/llm/test_accuracy.py \
|
|
||||||
--ignore=entrypoints/llm/test_init.py \
|
|
||||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
|
||||||
# --ignore=entrypoints/openai/test_embedding.py \
|
|
||||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
|
||||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
|
||||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
|
||||||
|
|
||||||
|
|
||||||
PARALLEL_JOB_COUNT=8
|
|
||||||
MYPYTHONPATH=".."
|
MYPYTHONPATH=".."
|
||||||
|
|
||||||
# Test that we're launching on the machine that has
|
# Verify GPU access
|
||||||
# proper access to GPUs
|
|
||||||
render_gid=$(getent group render | cut -d: -f3)
|
render_gid=$(getent group render | cut -d: -f3)
|
||||||
if [[ -z "$render_gid" ]]; then
|
if [[ -z "$render_gid" ]]; then
|
||||||
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
# --- RDMA device passthrough (conditional) ---
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
# If the host has RDMA devices, pass them through so tests like
|
||||||
# assign job count as the number of shards used
|
# test_moriio_connector can access ibverbs. On hosts without RDMA
|
||||||
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
|
# hardware the tests will gracefully skip via _rdma_available().
|
||||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
RDMA_FLAGS=""
|
||||||
# assign shard-id for each shard
|
if [ -d /dev/infiniband ]; then
|
||||||
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
|
echo "RDMA devices detected on host, enabling passthrough"
|
||||||
echo "Shard ${GPU} commands:$commands_gpu"
|
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
else
|
||||||
docker run \
|
echo "No RDMA devices found on host, RDMA tests will be skipped"
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
fi
|
||||||
--network=host \
|
|
||||||
--shm-size=16gb \
|
# --- Route: multi-node vs single-node ---
|
||||||
--group-add "$render_gid" \
|
if is_multi_node "$commands"; then
|
||||||
--rm \
|
echo "--- Multi-node job detected"
|
||||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
|
||||||
-e HF_TOKEN \
|
|
||||||
-e AWS_ACCESS_KEY_ID \
|
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
# BASH_REMATCH[1] = prefix (everything before first bracket)
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
# BASH_REMATCH[2] = comma-separated node0 commands
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
# BASH_REMATCH[3] = comma-separated node1 commands
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
|
||||||
--name "${container_name}_${GPU}" \
|
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
|
||||||
"${image_name}" \
|
echo "PREFIX: ${prefix}"
|
||||||
/bin/bash -c "${commands_gpu}" \
|
|
||||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
export composite_command="(command rocm-smi || true)"
|
||||||
PIDS+=($!)
|
saved_IFS=$IFS
|
||||||
done
|
IFS=','
|
||||||
#wait for all processes to finish and collect exit codes
|
read -ra node0 <<< "${BASH_REMATCH[2]}"
|
||||||
for pid in "${PIDS[@]}"; do
|
read -ra node1 <<< "${BASH_REMATCH[3]}"
|
||||||
wait "${pid}"
|
IFS=$saved_IFS
|
||||||
STATUS+=($?)
|
|
||||||
done
|
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
|
||||||
at_least_one_shard_with_tests=0
|
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
|
||||||
for st in "${STATUS[@]}"; do
|
|
||||||
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
|
|
||||||
echo "One of the processes failed with $st"
|
|
||||||
exit "${st}"
|
|
||||||
elif [[ ${st} -eq 5 ]]; then
|
|
||||||
echo "Shard exited with status 5 (no tests collected) - treating as success"
|
|
||||||
else # This means st is 0
|
|
||||||
at_least_one_shard_with_tests=1
|
|
||||||
fi
|
fi
|
||||||
done
|
|
||||||
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
|
for i in "${!node0[@]}"; do
|
||||||
echo "All shards reported no tests collected. Failing the build."
|
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
|
||||||
exit 1
|
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
|
||||||
|
|
||||||
|
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
|
||||||
|
echo "COMMANDS: ${step_cmd}"
|
||||||
|
composite_command="${composite_command} && ${step_cmd}"
|
||||||
|
done
|
||||||
|
|
||||||
|
/bin/bash -c "${composite_command}"
|
||||||
|
exit_code=$?
|
||||||
|
cleanup_network
|
||||||
|
handle_pytest_exit "$exit_code"
|
||||||
|
else
|
||||||
|
echo "Multi-node job detected but failed to parse bracket command syntax."
|
||||||
|
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
|
||||||
|
echo "Got: $commands"
|
||||||
|
cleanup_network
|
||||||
|
exit 111
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
echo "--- Single-node job"
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||||
docker run \
|
docker run \
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||||
--network=host \
|
$RDMA_FLAGS \
|
||||||
--shm-size=16gb \
|
--network=host \
|
||||||
--group-add "$render_gid" \
|
--shm-size=16gb \
|
||||||
--rm \
|
--group-add "$render_gid" \
|
||||||
-e HF_TOKEN \
|
--rm \
|
||||||
-e AWS_ACCESS_KEY_ID \
|
-e HF_TOKEN \
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
-e AWS_ACCESS_KEY_ID \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-e AWS_SECRET_ACCESS_KEY \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
--name "${container_name}" \
|
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||||
"${image_name}" \
|
--name "${container_name}" \
|
||||||
/bin/bash -c "${commands}"
|
"${image_name}" \
|
||||||
|
/bin/bash -c "${commands}"
|
||||||
|
|
||||||
|
exit_code=$?
|
||||||
|
handle_pytest_exit "$exit_code"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -euox pipefail
|
||||||
|
export VLLM_CPU_CI_ENV=0
|
||||||
|
|
||||||
|
echo "--- PP+TP"
|
||||||
|
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||||
|
server_pid=$!
|
||||||
|
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||||
|
vllm bench serve \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--result-dir ./test_results \
|
||||||
|
--result-filename tp_pp.json \
|
||||||
|
--save-result \
|
||||||
|
--endpoint /v1/completions
|
||||||
|
kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||||
|
failed_req=$(jq '.failed' ./test_results/tp_pp.json)
|
||||||
|
if [ "$failed_req" -ne 0 ]; then
|
||||||
|
echo "Some requests were failed!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "--- DP+TP"
|
||||||
|
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||||
|
server_pid=$!
|
||||||
|
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||||
|
vllm bench serve \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--result-dir ./test_results \
|
||||||
|
--result-filename dp_pp.json \
|
||||||
|
--save-result \
|
||||||
|
--endpoint /v1/completions
|
||||||
|
kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||||
|
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
|
||||||
|
if [ "$failed_req" -ne 0 ]; then
|
||||||
|
echo "Some requests were failed!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
|||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
export TORCH_COMPILE_DISABLE=1
|
export TORCH_COMPILE_DISABLE=1
|
||||||
set -xve
|
set -xve
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
@@ -43,7 +43,7 @@ function cpu_tests() {
|
|||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
||||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
||||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
|
||||||
}
|
}
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
# All of CPU tests are expected to be finished less than 40 mins.
|
||||||
|
|||||||
@@ -2,119 +2,19 @@
|
|||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -ex
|
set -euox pipefail
|
||||||
|
|
||||||
# allow to bind to different cores
|
# allow to bind to different cores
|
||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||||
# used for TP/PP E2E test
|
|
||||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
NUMA_NODE=${NUMA_NODE:-1}
|
||||||
|
IMAGE_NAME="cpu-test-$NUMA_NODE"
|
||||||
|
TIMEOUT_VAL=$1
|
||||||
|
TEST_COMMAND=$2
|
||||||
|
|
||||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
# building the docker image
|
||||||
|
echo "--- :docker: Building Docker image"
|
||||||
# Setup cleanup
|
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
remove_docker_container() {
|
|
||||||
set -e;
|
|
||||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
|
||||||
|
|
||||||
function cpu_tests() {
|
|
||||||
set -e
|
|
||||||
export NUMA_NODE=$2
|
|
||||||
|
|
||||||
# list packages
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
# offline inference
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run kernel tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
|
||||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
|
||||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
|
||||||
|
|
||||||
# Run basic model test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
# Note: disable until supports V1
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
|
||||||
pytest -x -v -s tests/models/multimodal/generation \
|
|
||||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
|
||||||
-m cpu_model"
|
|
||||||
|
|
||||||
# Run compressed-tensor test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
|
||||||
|
|
||||||
# Run AWQ/GPTQ test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/quantization/test_cpu_wna16.py"
|
|
||||||
|
|
||||||
# Run multi-lora tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/lora/test_qwenvl.py"
|
|
||||||
|
|
||||||
# online serving: tp+pp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
|
|
||||||
# online serving: tp+dp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
}
|
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
|
||||||
export -f cpu_tests
|
|
||||||
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
|
||||||
|
|||||||
@@ -1,21 +1,49 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script builds the HPU docker image and runs the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
#
|
||||||
|
# vllm-gaudi compatibility pinning:
|
||||||
|
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
|
||||||
|
# When upstream vllm changes its API, the plugin may break before it has been updated.
|
||||||
|
# To handle this, the vllm-gaudi repository maintains a file:
|
||||||
|
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
|
||||||
|
# The first line of that file controls what version of vllm is used inside the Docker image:
|
||||||
|
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
|
||||||
|
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
|
||||||
|
# the test to a known-compatible baseline.
|
||||||
|
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
|
||||||
set -exuo pipefail
|
set -exuo pipefail
|
||||||
|
|
||||||
|
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
|
||||||
|
VLLM_COMMUNITY_COMMIT=$(curl -s \
|
||||||
|
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
|
||||||
|
| head -1 | tr -d '\n')
|
||||||
|
|
||||||
|
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
|
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
|
||||||
|
cat <<EOF | docker build -t "${image_name}" -f - .
|
||||||
FROM gaudi-base-image:latest
|
FROM gaudi-base-image:latest
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
|
||||||
|
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
|
||||||
|
# the current checkout (the Buildkite CI commit) is used unchanged.
|
||||||
|
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
|
||||||
|
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
|
||||||
|
fi
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
ENV no_proxy=localhost,127.0.0.1
|
ENV no_proxy=localhost,127.0.0.1
|
||||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=empty pip install .
|
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
|
||||||
|
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
|
||||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
@@ -36,15 +64,20 @@ EOF
|
|||||||
# functions, while other platforms only need one remove_docker_container
|
# functions, while other platforms only need one remove_docker_container
|
||||||
# function.
|
# function.
|
||||||
EXITCODE=1
|
EXITCODE=1
|
||||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
remove_docker_containers() { docker rm -f "${container_name}" || true; }
|
||||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||||
remove_docker_containers
|
remove_docker_containers
|
||||||
|
|
||||||
echo "Running HPU plugin v1 test"
|
echo "Running HPU plugin v1 test"
|
||||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
||||||
-e HABANA_VISIBLE_DEVICES=all \
|
-e HABANA_VISIBLE_DEVICES=all \
|
||||||
hpu-plugin-v1-test-env \
|
-e VLLM_SKIP_WARMUP=true \
|
||||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
|
||||||
|
-e PT_HPU_LAZY_MODE=1 \
|
||||||
|
"${image_name}" \
|
||||||
|
/bin/bash -c '
|
||||||
|
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||||
|
'
|
||||||
|
|
||||||
EXITCODE=$?
|
EXITCODE=$?
|
||||||
if [ $EXITCODE -eq 0 ]; then
|
if [ $EXITCODE -eq 0 ]; then
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ get_config() {
|
|||||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
# shellcheck source=/dev/null
|
||||||
source "${TEST_RUN_CONFIG_FILE}"
|
source "${TEST_RUN_CONFIG_FILE}"
|
||||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
||||||
return 0
|
return 0
|
||||||
@@ -48,9 +49,8 @@ get_config() {
|
|||||||
|
|
||||||
# get test running configuration.
|
# get test running configuration.
|
||||||
fetch_vllm_test_cfg
|
fetch_vllm_test_cfg
|
||||||
get_config
|
|
||||||
# Check if the function call was successful. If not, exit the script.
|
# Check if the function call was successful. If not, exit the script.
|
||||||
if [ $? -ne 0 ]; then
|
if ! get_config; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
|||||||
echo "agent_idx: ${agent_idx}"
|
echo "agent_idx: ${agent_idx}"
|
||||||
builder_name="cachebuilder${agent_idx}"
|
builder_name="cachebuilder${agent_idx}"
|
||||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
||||||
mkdir -p ${builder_cache_dir}
|
mkdir -p "${builder_cache_dir}"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
||||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
|
||||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
|
||||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
|
||||||
--progress=plain --load -t ${image_name} -f - .
|
--progress=plain --load -t "${image_name}" -f - .
|
||||||
FROM ${BASE_IMAGE_NAME}
|
FROM ${BASE_IMAGE_NAME}
|
||||||
|
|
||||||
# Define environments
|
# Define environments
|
||||||
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
|
||||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
||||||
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
|
|||||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
||||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
||||||
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
||||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
# returns one argument per line: --device, /dev/davinciX, ...
|
||||||
parse_and_gen_devices() {
|
parse_and_gen_devices() {
|
||||||
local input="$1"
|
local input="$1"
|
||||||
local index cards_num
|
local index cards_num
|
||||||
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
|
|||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
local devices=""
|
|
||||||
local i=0
|
local i=0
|
||||||
while (( i < cards_num )); do
|
while (( i < cards_num )); do
|
||||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
local dev_idx=$(((index - 1)*cards_num + i ))
|
||||||
devices="$devices --device /dev/davinci${dev_idx}"
|
printf '%s\n' "--device"
|
||||||
|
printf '%s\n' "/dev/davinci${dev_idx}"
|
||||||
((i++))
|
((i++))
|
||||||
done
|
done
|
||||||
|
|
||||||
# trim leading space
|
|
||||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
|
||||||
# Output devices: assigned to the caller variable
|
|
||||||
printf '%s' "$devices"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
||||||
|
|
||||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
||||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
||||||
# the hardware plugin vllm-ascend.
|
# the hardware plugin vllm-ascend.
|
||||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
model_cache_dir=/mnt/modelscope${agent_idx}
|
||||||
mkdir -p ${model_cache_dir}
|
mkdir -p "${model_cache_dir}"
|
||||||
docker run \
|
docker run \
|
||||||
${devices} \
|
"${device_args[@]}" \
|
||||||
--device /dev/davinci_manager \
|
--device /dev/davinci_manager \
|
||||||
--device /dev/devmm_svm \
|
--device /dev/devmm_svm \
|
||||||
--device /dev/hisi_hdc \
|
--device /dev/hisi_hdc \
|
||||||
@@ -182,7 +177,7 @@ docker run \
|
|||||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
-v "${model_cache_dir}":/root/.cache/modelscope \
|
||||||
--entrypoint="" \
|
--entrypoint="" \
|
||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
|||||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
@@ -38,15 +38,18 @@ docker run \
|
|||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core
|
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
pytest -v -s v1/structured_output
|
pytest -v -s v1/structured_output
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||||
pytest -v -s v1/test_serial_utils.py
|
pytest -v -s v1/test_serial_utils.py
|
||||||
'
|
'
|
||||||
|
|||||||
@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
|
|||||||
|
|
||||||
# pull original arch-dependent images from AWS ECR Public
|
# pull original arch-dependent images from AWS ECR Public
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
|
||||||
# tag arch-dependent images
|
# tag arch-dependent images
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||||
# push arch-dependent images to DockerHub
|
# push arch-dependent images to DockerHub
|
||||||
docker push vllm/vllm-openai:$TAG_NAME-x86_64
|
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||||
docker push vllm/vllm-openai:$TAG_NAME-aarch64
|
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||||
# push arch-independent manifest to DockerHub
|
# push arch-independent manifest to DockerHub
|
||||||
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||||
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||||
docker manifest push vllm/vllm-openai:$TAG_NAME
|
docker manifest push vllm/vllm-openai:"$TAG_NAME"
|
||||||
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
|
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
|
||||||
|
|||||||
@@ -1,64 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
# Setup script for Prime-RL integration tests
|
|
||||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
|
||||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
|
||||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
|
||||||
|
|
||||||
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
|
|
||||||
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Setting up Prime-RL integration test environment..."
|
|
||||||
|
|
||||||
# Clean up any existing Prime-RL directory
|
|
||||||
if [ -d "${PRIME_RL_DIR}" ]; then
|
|
||||||
echo "Removing existing Prime-RL directory..."
|
|
||||||
rm -rf "${PRIME_RL_DIR}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install UV if not available
|
|
||||||
if ! command -v uv &> /dev/null; then
|
|
||||||
echo "Installing UV package manager..."
|
|
||||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
||||||
source $HOME/.local/bin/env
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clone Prime-RL repository at specific branch for reproducible tests
|
|
||||||
PRIME_RL_BRANCH="integ-vllm-main"
|
|
||||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
|
|
||||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
|
|
||||||
cd "${PRIME_RL_DIR}"
|
|
||||||
|
|
||||||
echo "Setting up UV project environment..."
|
|
||||||
export UV_PROJECT_ENVIRONMENT=/usr/local
|
|
||||||
ln -s /usr/bin/python3 /usr/local/bin/python
|
|
||||||
|
|
||||||
# Remove vllm pin from pyproject.toml
|
|
||||||
echo "Removing vllm pin from pyproject.toml..."
|
|
||||||
sed -i '/vllm==/d' pyproject.toml
|
|
||||||
|
|
||||||
# Sync Prime-RL dependencies
|
|
||||||
echo "Installing Prime-RL dependencies..."
|
|
||||||
uv sync --inexact && uv sync --inexact --all-extras
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
echo "Verifying installations..."
|
|
||||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
|
||||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
|
|
||||||
|
|
||||||
echo "Prime-RL integration test environment setup complete!"
|
|
||||||
|
|
||||||
echo "Running Prime-RL integration tests..."
|
|
||||||
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
|
|
||||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
|
|
||||||
|
|
||||||
echo "Prime-RL integration tests completed!"
|
|
||||||
@@ -43,7 +43,6 @@ trap cleanup EXIT
|
|||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
for BACK in "${BACKENDS[@]}"; do
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
|
||||||
vllm serve "$MODEL" \
|
vllm serve "$MODEL" \
|
||||||
--enforce-eager \
|
--enforce-eager \
|
||||||
--tensor-parallel-size 2 \
|
--tensor-parallel-size 2 \
|
||||||
@@ -52,13 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--port $PORT &
|
--all2all-backend "$BACK" \
|
||||||
|
--port "$PORT" &
|
||||||
SERVER_PID=$!
|
SERVER_PID=$!
|
||||||
wait_for_server $PORT
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
python3 - <<PY
|
python3 - <<PY
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||||
|
|||||||
@@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euxo pipefail
|
||||||
|
|
||||||
|
# Nightly e2e test for prefetch offloading with a MoE model.
|
||||||
|
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
|
||||||
|
# and validates GSM8K accuracy matches baseline (no offloading).
|
||||||
|
#
|
||||||
|
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||||
|
THRESHOLD=${1:-0.25}
|
||||||
|
NUM_Q=${2:-1319}
|
||||||
|
PORT=${3:-8030}
|
||||||
|
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||||
|
mkdir -p "${OUT_DIR}"
|
||||||
|
|
||||||
|
wait_for_server() {
|
||||||
|
local port=$1
|
||||||
|
timeout 600 bash -c '
|
||||||
|
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||||
|
sleep 1
|
||||||
|
done'
|
||||||
|
}
|
||||||
|
|
||||||
|
MODEL="deepseek-ai/DeepSeek-V2-Lite"
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||||
|
kill "${SERVER_PID}" 2>/dev/null || true
|
||||||
|
for _ in {1..20}; do
|
||||||
|
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||||
|
sleep 0.5
|
||||||
|
done
|
||||||
|
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
vllm serve "$MODEL" \
|
||||||
|
--max-model-len 2048 \
|
||||||
|
--offload-group-size 8 \
|
||||||
|
--offload-num-in-group 2 \
|
||||||
|
--offload-prefetch-step 1 \
|
||||||
|
--offload-params w13_weight w2_weight \
|
||||||
|
--port "$PORT" &
|
||||||
|
SERVER_PID=$!
|
||||||
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
|
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
|
||||||
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
|
python3 - <<PY
|
||||||
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
|
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
|
||||||
|
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
|
||||||
|
PY
|
||||||
|
|
||||||
|
cleanup
|
||||||
|
SERVER_PID=
|
||||||
@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
vllm serve "$MODEL" \
|
vllm serve "$MODEL" \
|
||||||
--enforce-eager \
|
--enforce-eager \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--all2all-backend $BACK \
|
--all2all-backend "$BACK" \
|
||||||
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
||||||
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
|
||||||
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--port $PORT &
|
--port "$PORT" &
|
||||||
SERVER_PID=$!
|
SERVER_PID=$!
|
||||||
wait_for_server $PORT
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
python3 - <<PY
|
python3 - <<PY
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||||
|
|||||||
@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
--tensor-parallel-size 4 \
|
--tensor-parallel-size 4 \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--all2all-backend $BACK \
|
--all2all-backend "$BACK" \
|
||||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
"${PLATFORM_ARGS[@]}" \
|
"${PLATFORM_ARGS[@]}" \
|
||||||
--port $PORT &
|
--port "$PORT" &
|
||||||
SERVER_PID=$!
|
SERVER_PID=$!
|
||||||
wait_for_server $PORT
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
python3 - <<PY
|
python3 - <<PY
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||||
|
|||||||
@@ -9,10 +9,11 @@ ENV_FILE=$1
|
|||||||
|
|
||||||
# For testing on local vm, use `set -a` to export all variables
|
# For testing on local vm, use `set -a` to export all variables
|
||||||
source /etc/environment
|
source /etc/environment
|
||||||
source $ENV_FILE
|
# shellcheck source=/dev/null
|
||||||
|
source "$ENV_FILE"
|
||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f $CONTAINER_NAME || true;
|
docker rm -f "$CONTAINER_NAME" || true;
|
||||||
}
|
}
|
||||||
|
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
@@ -41,13 +42,13 @@ echo
|
|||||||
echo "starting docker...$CONTAINER_NAME"
|
echo "starting docker...$CONTAINER_NAME"
|
||||||
echo
|
echo
|
||||||
docker run \
|
docker run \
|
||||||
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
|
||||||
--env-file $ENV_FILE \
|
--env-file "$ENV_FILE" \
|
||||||
-e HF_TOKEN="$HF_TOKEN" \
|
-e HF_TOKEN="$HF_TOKEN" \
|
||||||
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
|
||||||
-e MODEL=$MODEL \
|
-e MODEL="$MODEL" \
|
||||||
-e WORKSPACE=/workspace \
|
-e WORKSPACE=/workspace \
|
||||||
--name $CONTAINER_NAME \
|
--name "$CONTAINER_NAME" \
|
||||||
-d \
|
-d \
|
||||||
--privileged \
|
--privileged \
|
||||||
--network host \
|
--network host \
|
||||||
|
|||||||
@@ -42,21 +42,21 @@ echo "lanching vllm..."
|
|||||||
echo "logging to $VLLM_LOG"
|
echo "logging to $VLLM_LOG"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
vllm serve $MODEL \
|
vllm serve "$MODEL" \
|
||||||
--seed 42 \
|
--seed 42 \
|
||||||
--max-num-seqs $MAX_NUM_SEQS \
|
--max-num-seqs "$MAX_NUM_SEQS" \
|
||||||
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
|
||||||
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
|
||||||
--no-enable-prefix-caching \
|
--no-enable-prefix-caching \
|
||||||
--download_dir $DOWNLOAD_DIR \
|
--download_dir "$DOWNLOAD_DIR" \
|
||||||
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
|
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
|
||||||
|
|
||||||
|
|
||||||
echo "wait for 20 minutes.."
|
echo "wait for 20 minutes.."
|
||||||
echo
|
echo
|
||||||
# sleep 1200
|
# sleep 1200
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
for i in {1..120}; do
|
for _ in {1..120}; do
|
||||||
# TODO: detect other type of errors.
|
# TODO: detect other type of errors.
|
||||||
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
||||||
echo "Detected RuntimeError, exiting."
|
echo "Detected RuntimeError, exiting."
|
||||||
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
|
|||||||
echo
|
echo
|
||||||
vllm bench serve \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model "$MODEL" \
|
||||||
--dataset-name sonnet \
|
--dataset-name sonnet \
|
||||||
--dataset-path benchmarks/sonnet_4x.txt \
|
--dataset-path benchmarks/sonnet_4x.txt \
|
||||||
--sonnet-input-len $INPUT_LEN \
|
--sonnet-input-len "$INPUT_LEN" \
|
||||||
--sonnet-output-len $OUTPUT_LEN \
|
--sonnet-output-len "$OUTPUT_LEN" \
|
||||||
--ignore-eos > "$BM_LOG"
|
--ignore-eos > "$BM_LOG"
|
||||||
|
|
||||||
echo "completed..."
|
echo "completed..."
|
||||||
|
|||||||
@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR"
|
|||||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||||
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
|
alias_args=()
|
||||||
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
|
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||||
else
|
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
|
||||||
alias_arg=""
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
|
||||||
|
|
||||||
# copy indices to /<commit>/ unconditionally
|
# copy indices to /<commit>/ unconditionally
|
||||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||||
@@ -100,9 +99,9 @@ fi
|
|||||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||||
if [[ "$version" != *"dev"* ]]; then
|
if [[ "$version" != *"dev"* ]]; then
|
||||||
echo "Re-generating indices for /$pure_version/"
|
echo "Re-generating indices for /$pure_version/"
|
||||||
rm -rf "$INDICES_OUTPUT_DIR/*"
|
rm -rf "${INDICES_OUTPUT_DIR:?}/*"
|
||||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||||
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
|
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
|
||||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
|
||||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
|
|||||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||||
|
|
||||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||||
|
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
|
||||||
|
|
||||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||||
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
|
||||||
if [ -z "$GIT_VERSION" ]; then
|
if [[ -z "$GIT_VERSION" ]]; then
|
||||||
echo "[FATAL] Not on a git tag, cannot create release."
|
echo "[FATAL] Not on a git tag, cannot create release."
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||||
fi
|
fi
|
||||||
# sanity check for version mismatch
|
# sanity check for version mismatch
|
||||||
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
|
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
|
||||||
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
|
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
|
||||||
echo "[WARNING] Force release and ignore version mismatch"
|
echo "[WARNING] Force release and ignore version mismatch"
|
||||||
else
|
else
|
||||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||||
@@ -27,7 +29,7 @@ fi
|
|||||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||||
|
|
||||||
# check pypi token
|
# check pypi token
|
||||||
if [ -z "$PYPI_TOKEN" ]; then
|
if [[ -z "$PYPI_TOKEN" ]]; then
|
||||||
echo "[FATAL] PYPI_TOKEN is not set."
|
echo "[FATAL] PYPI_TOKEN is not set."
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
@@ -35,41 +37,8 @@ else
|
|||||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# check github token
|
|
||||||
if [ -z "$GITHUB_TOKEN" ]; then
|
|
||||||
echo "[FATAL] GITHUB_TOKEN is not set."
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
export GH_TOKEN="$GITHUB_TOKEN"
|
|
||||||
fi
|
|
||||||
|
|
||||||
set -x # avoid printing secrets above
|
set -x # avoid printing secrets above
|
||||||
|
|
||||||
# download gh CLI from github
|
|
||||||
# Get latest gh CLI version from GitHub API
|
|
||||||
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
|
|
||||||
if [ -z "$GH_VERSION" ]; then
|
|
||||||
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Downloading gh CLI version: $GH_VERSION"
|
|
||||||
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
|
|
||||||
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
|
|
||||||
GH_INSTALL_DIR="/tmp/gh-install"
|
|
||||||
mkdir -p "$GH_INSTALL_DIR"
|
|
||||||
pushd "$GH_INSTALL_DIR"
|
|
||||||
curl -L -o "$GH_TARBALL" "$GH_URL"
|
|
||||||
tar -xzf "$GH_TARBALL"
|
|
||||||
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
|
|
||||||
if [ -z "$GH_BIN" ]; then
|
|
||||||
echo "[FATAL] Failed to find gh CLI executable"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
|
|
||||||
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
|
|
||||||
command "$GH_BIN" release list --limit 5
|
|
||||||
popd
|
|
||||||
|
|
||||||
# install twine from pypi
|
# install twine from pypi
|
||||||
python3 -m venv /tmp/vllm-release-env
|
python3 -m venv /tmp/vllm-release-env
|
||||||
source /tmp/vllm-release-env/bin/activate
|
source /tmp/vllm-release-env/bin/activate
|
||||||
@@ -86,19 +55,16 @@ mkdir -p $DIST_DIR
|
|||||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||||
echo "Wheels copied to local directory"
|
echo "Wheels copied to local directory"
|
||||||
# generate source tarball
|
# generate source tarball
|
||||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
||||||
ls -la $DIST_DIR
|
ls -la $DIST_DIR
|
||||||
|
|
||||||
|
|
||||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||||
if [ -z "$PYPI_WHEEL_FILES" ]; then
|
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||||
echo "No default variant wheels found, quitting..."
|
echo "No default variant wheels found, quitting..."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
python3 -m twine check $PYPI_WHEEL_FILES
|
|
||||||
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
|
|
||||||
echo "Wheels uploaded to PyPI"
|
|
||||||
|
|
||||||
# create release on GitHub with the release version and all wheels
|
python3 -m twine check "$PYPI_WHEEL_FILES"
|
||||||
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
|
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
||||||
|
echo "Wheels uploaded to PyPI"
|
||||||
@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
|
|||||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||||
|
|
||||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||||
|
|
||||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||||
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Extract version from vLLM wheel and update version-specific index
|
# Extract version from vLLM wheel and update version-specific index
|
||||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
|
||||||
if [ -n "$VLLM_WHEEL" ]; then
|
if [ -n "$VLLM_WHEEL" ]; then
|
||||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||||
echo "Version in wheel: $VERSION"
|
echo "Version in wheel: $VERSION"
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: V1 attention (H100)
|
- label: V1 attention (H100)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
gpu: h100
|
device: h100
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/config/attention.py
|
- vllm/config/attention.py
|
||||||
- vllm/model_executor/layers/attention
|
- vllm/model_executor/layers/attention
|
||||||
@@ -15,7 +15,7 @@ steps:
|
|||||||
|
|
||||||
- label: V1 attention (B200)
|
- label: V1 attention (B200)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
gpu: b200
|
device: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/config/attention.py
|
- vllm/config/attention.py
|
||||||
- vllm/model_executor/layers/attention
|
- vllm/model_executor/layers/attention
|
||||||
|
|||||||
@@ -14,3 +14,8 @@ steps:
|
|||||||
- pytest -v -s basic_correctness/test_cumem.py
|
- pytest -v -s basic_correctness/test_cumem.py
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|||||||
@@ -17,3 +17,15 @@ steps:
|
|||||||
- tests/benchmarks/
|
- tests/benchmarks/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s benchmarks/
|
- pytest -v -s benchmarks/
|
||||||
|
|
||||||
|
- label: Attention Benchmarks Smoke Test (B200)
|
||||||
|
device: b200
|
||||||
|
num_gpus: 2
|
||||||
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
source_file_dependencies:
|
||||||
|
- benchmarks/attention_benchmarks/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
commands:
|
||||||
|
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
|
||||||
|
|||||||
@@ -2,56 +2,200 @@ group: Compile
|
|||||||
depends_on:
|
depends_on:
|
||||||
- image-build
|
- image-build
|
||||||
steps:
|
steps:
|
||||||
- label: Fusion and Compile Tests (B200)
|
- label: Sequence Parallel Correctness Tests (2 GPUs)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- vllm/model_executor/layers/
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/compilation/
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/v1/worker/
|
- vllm/v1/worker/
|
||||||
- vllm/v1/cudagraph_dispatcher.py
|
- vllm/v1/cudagraph_dispatcher.py
|
||||||
- vllm/compilation/
|
- tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/test_fusion_attn.py
|
|
||||||
- tests/compile/test_silu_mul_quant_fusion.py
|
|
||||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
- tests/compile/fullgraph/test_full_graph.py
|
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
|
||||||
# Wrap with quotes to escape yaml
|
|
||||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
|
||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
|
||||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
|
||||||
|
|
||||||
- label: Fusion E2E (2 GPUs)(B200)
|
- label: Sequence Parallel Correctness Tests (2xH100)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||||
|
|
||||||
|
- label: AsyncTP Correctness Tests (2xH100)
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
|
- label: Distributed Compile Unit Tests (2xH100)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/compilation/
|
||||||
|
- vllm/model_executor/layers
|
||||||
|
- tests/compile/passes/distributed/
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -s -v tests/compile/passes/distributed
|
||||||
|
|
||||||
|
- label: Fusion and Compile Unit Tests (B200)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
|
||||||
|
- tests/compile/passes/test_fusion_attn.py
|
||||||
|
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
|
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
|
commands:
|
||||||
|
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
|
||||||
|
- nvidia-smi
|
||||||
|
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
|
||||||
|
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
|
# this runner has 2 GPUs available even though num_devices=2 is not set
|
||||||
|
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
|
# TODO(luka) move to H100 once pass tests run on H100
|
||||||
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
|
- label: Fusion E2E Quick (H100)
|
||||||
|
timeout_in_minutes: 15
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 1
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all e2e fusion tests
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||||
|
|
||||||
|
- label: Fusion E2E Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 1
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E Config Sweep (B200)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
num_devices: 1
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||||
|
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 Quick (H100)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 (B200)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||||
|
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
# for ar-rms-quant-fp4, also sweep llama3
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ steps:
|
|||||||
- tests/cuda
|
- tests/cuda
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s cuda/test_cuda_context.py
|
- pytest -v -s cuda/test_cuda_context.py
|
||||||
|
- pytest -v -s cuda/test_platform_no_cuda_init.py
|
||||||
|
|
||||||
- label: Cudagraph
|
- label: Cudagraph
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Distributed Comm Ops
|
- label: Distributed Comm Ops
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed
|
- vllm/distributed
|
||||||
- tests/distributed
|
- tests/distributed
|
||||||
@@ -16,9 +16,9 @@ steps:
|
|||||||
- pytest -v -s distributed/test_shm_storage.py
|
- pytest -v -s distributed/test_shm_storage.py
|
||||||
|
|
||||||
- label: Distributed (2 GPUs)
|
- label: Distributed (2 GPUs)
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/compilation/
|
- vllm/compilation/
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
@@ -47,14 +47,13 @@ steps:
|
|||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs)
|
- label: Distributed Tests (4 GPUs)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
@@ -63,6 +62,7 @@ steps:
|
|||||||
- tests/compile/fullgraph/test_basic_correctness.py
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
|
- examples/offline_inference/new_weight_syncing/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/distributed
|
- tests/v1/distributed
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
@@ -97,14 +97,19 @@ steps:
|
|||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
|
# OLD rlhf examples
|
||||||
- cd ../examples/offline_inference
|
- cd ../examples/offline_inference
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||||
|
# NEW rlhf examples
|
||||||
|
- cd new_weight_syncing
|
||||||
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||||
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||||
|
|
||||||
- label: Distributed Tests (8 GPUs)(H100)
|
- label: Distributed Tests (8 GPUs)(H100)
|
||||||
timeout_in_minutes: 10
|
timeout_in_minutes: 10
|
||||||
gpu: h100
|
device: h100
|
||||||
num_gpus: 8
|
num_devices: 8
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- examples/offline_inference/torchrun_dp_example.py
|
- examples/offline_inference/torchrun_dp_example.py
|
||||||
@@ -120,9 +125,9 @@ steps:
|
|||||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs)(A100)
|
- label: Distributed Tests (4 GPUs)(A100)
|
||||||
gpu: a100
|
device: a100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
commands:
|
commands:
|
||||||
@@ -133,26 +138,23 @@ steps:
|
|||||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest -v -s -x lora/test_mixtral.py
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs)(H200)
|
- label: Distributed Tests (2 GPUs)(H100)
|
||||||
gpu: h200
|
timeout_in_minutes: 15
|
||||||
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
commands:
|
commands:
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
|
||||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
|
||||||
|
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs)(B200)
|
- label: Distributed Tests (2 GPUs)(B200)
|
||||||
gpu: b200
|
device: b200
|
||||||
optional: true
|
optional: true
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||||
@@ -161,8 +163,10 @@ steps:
|
|||||||
- label: 2 Node Test (4 GPUs)
|
- label: 2 Node Test (4 GPUs)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
num_nodes: 2
|
num_nodes: 2
|
||||||
|
no_plugin: true
|
||||||
|
optional: true # TODO: revert once infra issue solved
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/engine/
|
- vllm/engine/
|
||||||
@@ -171,12 +175,12 @@ steps:
|
|||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
commands:
|
commands:
|
||||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
||||||
|
|
||||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
- tests/v1/kv_connector/nixl_integration/
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
@@ -184,10 +188,32 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
- label: Pipeline + Context Parallelism (4 GPUs))
|
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/engine/
|
- vllm/engine/
|
||||||
@@ -196,4 +222,4 @@ steps:
|
|||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|||||||
@@ -4,39 +4,36 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: h100
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: h100
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: b200
|
device: b200
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
- label: Prime-RL Integration (2 GPUs)
|
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 60
|
||||||
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
soft_fail: true
|
num_devices: 1
|
||||||
num_gpus: 2
|
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||||
|
|
||||||
- label: V1 e2e + engine
|
- label: V1 e2e + engine (1 GPU)
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -23,4 +23,48 @@ steps:
|
|||||||
# TODO: accuracy does not match, whether setting
|
# TODO: accuracy does not match, whether setting
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- pytest -v -s v1/e2e
|
||||||
- pytest -v -s v1/engine
|
# Run this test standalone for now;
|
||||||
|
# need to untangle use (implicit) use of spawn/fork across the tests.
|
||||||
|
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||||
|
# Run the rest of v1/engine tests
|
||||||
|
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/e2e
|
||||||
|
- pytest -v -s v1/engine
|
||||||
|
|
||||||
|
- label: V1 e2e (2 GPUs)
|
||||||
|
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/v1/e2e
|
||||||
|
commands:
|
||||||
|
# Only run tests that need exactly 2 GPUs
|
||||||
|
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_2
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
|
- label: V1 e2e (4 GPUs)
|
||||||
|
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||||
|
optional: true
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/v1/e2e
|
||||||
|
commands:
|
||||||
|
# Only run tests that need 4 GPUs
|
||||||
|
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_4
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|||||||
@@ -24,6 +24,11 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 1)
|
- label: Entrypoints Integration (API Server 1)
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@@ -42,15 +47,13 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/tool_use
|
|
||||||
- tests/entrypoints/sleep
|
|
||||||
- tests/entrypoints/instrumentator
|
|
||||||
- tests/entrypoints/rpc
|
- tests/entrypoints/rpc
|
||||||
|
- tests/entrypoints/instrumentator
|
||||||
|
- tests/tool_use
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
|
||||||
- pytest -v -s entrypoints/instrumentator
|
- pytest -v -s entrypoints/instrumentator
|
||||||
- pytest -v -s entrypoints/sleep
|
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s tool_use
|
||||||
|
|
||||||
- label: Entrypoints Integration (Pooling)
|
- label: Entrypoints Integration (Pooling)
|
||||||
@@ -62,6 +65,11 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/pooling
|
- pytest -v -s entrypoints/pooling
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (Responses API)
|
- label: Entrypoints Integration (Responses API)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
|
|||||||
@@ -14,10 +14,25 @@ steps:
|
|||||||
- label: EPLB Execution
|
- label: EPLB Execution
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/eplb
|
- vllm/distributed/eplb
|
||||||
- tests/distributed/test_eplb_execute.py
|
- tests/distributed/test_eplb_execute.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_eplb_execute.py
|
- pytest -v -s distributed/test_eplb_execute.py
|
||||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||||
|
|
||||||
|
- label: Elastic EP Scaling Test
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/
|
||||||
|
- vllm/engine/
|
||||||
|
- vllm/executor/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/distributed/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s distributed/test_elastic_ep.py
|
||||||
|
|||||||
@@ -15,8 +15,9 @@ steps:
|
|||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/attention/
|
- csrc/attention/
|
||||||
- vllm/attention
|
|
||||||
- vllm/v1/attention
|
- vllm/v1/attention
|
||||||
|
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||||
|
- vllm/model_executor/layers/attention
|
||||||
- tests/kernels/attention
|
- tests/kernels/attention
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
@@ -43,7 +44,8 @@ steps:
|
|||||||
- vllm/envs.py
|
- vllm/envs.py
|
||||||
- vllm/config
|
- vllm/config
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
|
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 2
|
||||||
|
|
||||||
- label: Kernels Mamba Test
|
- label: Kernels Mamba Test
|
||||||
@@ -57,8 +59,8 @@ steps:
|
|||||||
|
|
||||||
- label: Kernels DeepGEMM Test (H100)
|
- label: Kernels DeepGEMM Test (H100)
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
gpu: h100
|
device: h100
|
||||||
num_gpus: 1
|
num_devices: 1
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tools/install_deepgemm.sh
|
- tools/install_deepgemm.sh
|
||||||
- vllm/utils/deep_gemm.py
|
- vllm/utils/deep_gemm.py
|
||||||
@@ -69,7 +71,7 @@ steps:
|
|||||||
- tests/kernels/moe/test_batched_deepgemm.py
|
- tests/kernels/moe/test_batched_deepgemm.py
|
||||||
- tests/kernels/attention/test_deepgemm_attention.py
|
- tests/kernels/attention/test_deepgemm_attention.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
- pytest -v -s kernels/quantization/test_block_fp8.py
|
||||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||||
@@ -77,7 +79,7 @@ steps:
|
|||||||
- label: Kernels (B200)
|
- label: Kernels (B200)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
device: b200
|
||||||
# optional: true
|
# optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
@@ -85,7 +87,7 @@ steps:
|
|||||||
- csrc/quantization/cutlass_w8a8/moe/
|
- csrc/quantization/cutlass_w8a8/moe/
|
||||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||||
@@ -114,4 +116,54 @@ steps:
|
|||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
|
||||||
|
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||||
|
# e2e
|
||||||
|
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||||
|
|
||||||
|
- label: Kernels Helion Test
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
device: h100
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/utils/import_utils.py
|
||||||
|
- tests/kernels/helion/
|
||||||
|
commands:
|
||||||
|
- pip install helion
|
||||||
|
- pytest -v -s kernels/helion/
|
||||||
|
|
||||||
|
|
||||||
|
- label: Kernels FP8 MoE Test (1 H100)
|
||||||
|
timeout_in_minutes: 90
|
||||||
|
device: h100
|
||||||
|
num_devices: 1
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||||
|
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||||
|
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_moe.py
|
||||||
|
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||||
|
- pytest -v -s kernels/moe/test_block_int8.py
|
||||||
|
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||||
|
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||||
|
|
||||||
|
- label: Kernels FP8 MoE Test (2 H100s)
|
||||||
|
timeout_in_minutes: 90
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||||
|
|
||||||
|
- label: Kernels Fp4 MoE Test (B200)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
device: b200
|
||||||
|
num_devices: 1
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||||
|
|||||||
@@ -11,22 +11,22 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||||
|
|
||||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
# - label: LM Eval Large Models (4 GPUs)(A100)
|
||||||
gpu: a100
|
# device: a100
|
||||||
optional: true
|
# optional: true
|
||||||
num_gpus: 4
|
# num_devices: 4
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
# source_file_dependencies:
|
||||||
- csrc/
|
# - csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
# - vllm/model_executor/layers/quantization
|
||||||
commands:
|
# commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||||
gpu: h100
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
@@ -37,10 +37,65 @@ steps:
|
|||||||
|
|
||||||
- label: LM Eval Small Models (B200)
|
- label: LM Eval Small Models (B200)
|
||||||
timeout_in_minutes: 120
|
timeout_in_minutes: 120
|
||||||
gpu: b200
|
device: b200
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||||
|
|
||||||
|
- label: LM Eval Large Models (H200)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
device: h200
|
||||||
|
optional: true
|
||||||
|
num_devices: 8
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
|
||||||
|
|
||||||
|
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
|
||||||
|
device: h100
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
|
||||||
|
|
||||||
|
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
|
||||||
|
|
||||||
|
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
||||||
|
|
||||||
|
- label: GPQA Eval (GPT-OSS) (H100)
|
||||||
|
timeout_in_minutes: 120
|
||||||
|
device: h100
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
- tests/evals/gpt_oss/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
|
||||||
|
|
||||||
|
- label: GPQA Eval (GPT-OSS) (B200)
|
||||||
|
timeout_in_minutes: 120
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
- tests/evals/gpt_oss/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ steps:
|
|||||||
|
|
||||||
- label: LoRA TP (Distributed)
|
- label: LoRA TP (Distributed)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s -m 'not cpu_test' v1/core
|
- pytest -v -s -m 'not cpu_test' v1/core
|
||||||
- pytest -v -s v1/executor
|
- pytest -v -s v1/executor
|
||||||
@@ -16,7 +17,8 @@ steps:
|
|||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/logits_processors
|
- pytest -v -s v1/logits_processors
|
||||||
- pytest -v -s v1/worker
|
- pytest -v -s v1/worker
|
||||||
- pytest -v -s v1/spec_decode
|
# TODO: create another `optional` test group for slow tests
|
||||||
|
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
@@ -25,13 +27,19 @@ steps:
|
|||||||
# Integration test for streaming correctness (requires special branch).
|
# Integration test for streaming correctness (requires special branch).
|
||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: V1 Others (CPU)
|
- label: V1 Others (CPU)
|
||||||
depends_on: ~
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1
|
- tests/v1
|
||||||
no_gpu: true
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s -m 'cpu_test' v1/core
|
- pytest -v -s -m 'cpu_test' v1/core
|
||||||
@@ -71,7 +79,7 @@ steps:
|
|||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
# for pooling models
|
# for pooling models
|
||||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||||
# for features demo
|
# for features demo
|
||||||
- python3 offline_inference/prefix_caching.py
|
- python3 offline_inference/prefix_caching.py
|
||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 offline_inference/llm_engine_example.py
|
||||||
@@ -82,7 +90,7 @@ steps:
|
|||||||
|
|
||||||
- label: Metrics, Tracing (2 GPUs)
|
- label: Metrics, Tracing (2 GPUs)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1/tracing
|
- tests/v1/tracing
|
||||||
@@ -107,19 +115,24 @@ steps:
|
|||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
|
- tests/detokenizer
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
- tests/utils_
|
- tests/utils_
|
||||||
commands:
|
commands:
|
||||||
|
- pytest -v -s detokenizer
|
||||||
- pytest -v -s -m 'not cpu_test' multimodal
|
- pytest -v -s -m 'not cpu_test' multimodal
|
||||||
- pytest -v -s utils_
|
- pytest -v -s utils_
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||||
depends_on: ~
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/test_inputs.py
|
- tests/test_inputs.py
|
||||||
- tests/test_outputs.py
|
- tests/test_outputs.py
|
||||||
|
- tests/test_pooling_params.py
|
||||||
|
- tests/test_ray_env.py
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
- tests/renderers
|
- tests/renderers
|
||||||
- tests/standalone_tests/lazy_imports.py
|
- tests/standalone_tests/lazy_imports.py
|
||||||
@@ -127,11 +140,13 @@ steps:
|
|||||||
- tests/tool_parsers
|
- tests/tool_parsers
|
||||||
- tests/transformers_utils
|
- tests/transformers_utils
|
||||||
- tests/config
|
- tests/config
|
||||||
no_gpu: true
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_imports.py
|
- python3 standalone_tests/lazy_imports.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
- pytest -v -s test_outputs.py
|
||||||
|
- pytest -v -s test_pooling_params.py
|
||||||
|
- pytest -v -s test_ray_env.py
|
||||||
- pytest -v -s -m 'cpu_test' multimodal
|
- pytest -v -s -m 'cpu_test' multimodal
|
||||||
- pytest -v -s renderers
|
- pytest -v -s renderers
|
||||||
- pytest -v -s tokenizers_
|
- pytest -v -s tokenizers_
|
||||||
@@ -139,23 +154,9 @@ steps:
|
|||||||
- pytest -v -s transformers_utils
|
- pytest -v -s transformers_utils
|
||||||
- pytest -v -s config
|
- pytest -v -s config
|
||||||
|
|
||||||
- label: GPT-OSS Eval (B200)
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- tests/evals/gpt_oss
|
|
||||||
- vllm/model_executor/models/gpt_oss.py
|
|
||||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
commands:
|
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
|
||||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
|
||||||
|
|
||||||
- label: Batch Invariance (H100)
|
- label: Batch Invariance (H100)
|
||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
gpu: h100
|
device: h100
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/v1/attention
|
- vllm/v1/attention
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
@@ -164,4 +165,18 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pip install pytest-timeout pytest-forked
|
- pip install pytest-timeout pytest-forked
|
||||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||||
|
|
||||||
|
- label: Acceptance Length Test (Large Models) # optional
|
||||||
|
timeout_in_minutes: 25
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 1
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/spec_decode/
|
||||||
|
- vllm/model_executor/models/mlp_speculator.py
|
||||||
|
- tests/v1/spec_decode/test_acceptance_length.py
|
||||||
|
commands:
|
||||||
|
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||||
|
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: Basic Models Tests (Initialization)
|
- label: Basic Models Tests (Initialization)
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -16,7 +15,6 @@ steps:
|
|||||||
|
|
||||||
- label: Basic Models Tests (Extra Initialization) %N
|
- label: Basic Models Tests (Extra Initialization) %N
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
@@ -33,18 +31,27 @@ steps:
|
|||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
|
- tests/models/test_terratorch.py
|
||||||
- tests/models/test_transformers.py
|
- tests/models/test_transformers.py
|
||||||
- tests/models/test_registry.py
|
- tests/models/test_registry.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
|
|
||||||
- label: Basic Models Test (Other CPU) # 5min
|
- label: Basic Models Test (Other CPU) # 5min
|
||||||
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
timeout_in_minutes: 10
|
timeout_in_minutes: 10
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/test_utils.py
|
- tests/models/test_utils.py
|
||||||
- tests/models/test_vision.py
|
- tests/models/test_vision.py
|
||||||
no_gpu: true
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Distributed Model Tests (2 GPUs)
|
- label: Distributed Model Tests (2 GPUs)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: Language Models Tests (Standard)
|
- label: Language Models Tests (Standard)
|
||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -16,7 +15,6 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Tests (Extra Standard) %N
|
- label: Language Models Tests (Extra Standard) %N
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
@@ -32,7 +30,6 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Tests (Hybrid) %N
|
- label: Language Models Tests (Hybrid) %N
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -40,7 +37,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# Install fast path packages for testing against transformers
|
# Install fast path packages for testing against transformers
|
||||||
# Note: also needed to run plamo2 model in vLLM
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
# Shard hybrid language model tests
|
# Shard hybrid language model tests
|
||||||
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||||
@@ -48,7 +45,6 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Test (Extended Generation) # 80min
|
- label: Language Models Test (Extended Generation) # 80min
|
||||||
timeout_in_minutes: 110
|
timeout_in_minutes: 110
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -56,13 +52,21 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# Install fast path packages for testing against transformers
|
# Install fast path packages for testing against transformers
|
||||||
# Note: also needed to run plamo2 model in vLLM
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
commands:
|
||||||
|
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
|
||||||
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
|
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||||
|
|
||||||
- label: Language Models Test (PPL)
|
- label: Language Models Test (PPL)
|
||||||
timeout_in_minutes: 110
|
timeout_in_minutes: 110
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -72,17 +76,20 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Test (Extended Pooling) # 36min
|
- label: Language Models Test (Extended Pooling) # 36min
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/pooling
|
- tests/models/language/pooling
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Language Models Test (MTEB)
|
- label: Language Models Test (MTEB)
|
||||||
timeout_in_minutes: 110
|
timeout_in_minutes: 110
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
|
|||||||
@@ -14,11 +14,14 @@ steps:
|
|||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
- label: Multi-Modal Processor Test (CPU)
|
- label: Multi-Modal Processor Test (CPU)
|
||||||
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
no_gpu: true
|
- tests/models/registry.py
|
||||||
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||||
@@ -28,6 +31,7 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
|
- tests/models/registry.py
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||||
@@ -68,12 +72,3 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||||
|
|
||||||
# This test is used only in PR development phase to test individual models and should never run on main
|
|
||||||
- label: Custom Models
|
|
||||||
optional: true
|
|
||||||
commands:
|
|
||||||
- echo 'Testing custom models...'
|
|
||||||
# PR authors can temporarily add commands below to test individual models
|
|
||||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
|
||||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Plugin Tests (2 GPUs)
|
- label: Plugin Tests (2 GPUs)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/plugins/
|
- vllm/plugins/
|
||||||
- tests/plugins/
|
- tests/plugins/
|
||||||
@@ -19,6 +19,10 @@ steps:
|
|||||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
|
# test bge_m3_sparse io_processor plugin
|
||||||
|
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||||
|
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
|
||||||
|
- pip uninstall bge_m3_sparse_plugin -y
|
||||||
# end io_processor plugins test
|
# end io_processor plugins test
|
||||||
# begin stat_logger plugins test
|
# begin stat_logger plugins test
|
||||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ depends_on:
|
|||||||
- image-build
|
- image-build
|
||||||
steps:
|
steps:
|
||||||
- label: PyTorch Compilation Unit Tests
|
- label: PyTorch Compilation Unit Tests
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 10
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@@ -17,8 +17,16 @@ steps:
|
|||||||
# (using -0 for proper path handling)
|
# (using -0 for proper path handling)
|
||||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
|
- label: PyTorch Compilation Passes Unit Tests
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/compile/passes
|
||||||
|
commands:
|
||||||
|
- pytest -s -v compile/passes --ignore compile/passes/distributed
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test
|
- label: PyTorch Fullgraph Smoke Test
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 35
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@@ -30,16 +38,13 @@ steps:
|
|||||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph
|
- label: PyTorch Fullgraph
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 30
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
# Limit to no custom ops to reduce running time
|
|
||||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
|
||||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
|
||||||
|
|
||||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||||
# if this test fails, it means the nightly torch version is not compatible with some
|
# if this test fails, it means the nightly torch version is not compatible with some
|
||||||
|
|||||||
@@ -16,14 +16,14 @@ steps:
|
|||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
# TODO(jerryzh168): resolve the above comment
|
# TODO(jerryzh168): resolve the above comment
|
||||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
|
||||||
- uv pip install --system conch-triton-kernels
|
- uv pip install --system conch-triton-kernels
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: Quantized MoE Test (B200)
|
- label: Quantized MoE Test (B200)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
device: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/quantization/test_blackwell_moe.py
|
- tests/quantization/test_blackwell_moe.py
|
||||||
- vllm/model_executor/models/deepseek_v2.py
|
- vllm/model_executor/models/deepseek_v2.py
|
||||||
|
|||||||
16
.buildkite/test_areas/ray_compat.yaml
Normal file
16
.buildkite/test_areas/ray_compat.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
group: Ray Compatibility
|
||||||
|
depends_on:
|
||||||
|
- image-build
|
||||||
|
steps:
|
||||||
|
- label: Ray Dependency Compatibility Check
|
||||||
|
# Informational only — does not block the pipeline.
|
||||||
|
# If this fails, it means the PR introduces a dependency that
|
||||||
|
# conflicts with Ray's dependency constraints.
|
||||||
|
# See https://github.com/vllm-project/vllm/issues/33599
|
||||||
|
soft_fail: true
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
source_file_dependencies:
|
||||||
|
- requirements/
|
||||||
|
- setup.py
|
||||||
|
commands:
|
||||||
|
- bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
|
||||||
@@ -12,3 +12,10 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s samplers
|
- pytest -v -s samplers
|
||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
commands:
|
||||||
|
- pytest -v -s samplers
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Weight Loading Multiple GPU # 33min
|
- label: Weight Loading Multiple GPU # 33min
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -13,13 +13,13 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
# - label: Weight Loading Multiple GPU - Large Models # optional
|
||||||
working_dir: "/vllm-workspace/tests"
|
# working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
# num_devices: 2
|
||||||
gpu: a100
|
# device: a100
|
||||||
optional: true
|
# optional: true
|
||||||
source_file_dependencies:
|
# source_file_dependencies:
|
||||||
- vllm/
|
# - vllm/
|
||||||
- tests/weight_loading
|
# - tests/weight_loading
|
||||||
commands:
|
# commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
# - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|||||||
24
.github/.bc-linter.yml
vendored
24
.github/.bc-linter.yml
vendored
@@ -1,24 +0,0 @@
|
|||||||
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
|
|
||||||
version: 1
|
|
||||||
paths:
|
|
||||||
# We temporarily disable globally, and will only enable with `annotations.include`
|
|
||||||
# include:
|
|
||||||
# - "vllm/v1/attetion/*.py"
|
|
||||||
# - "vllm/v1/core/*.py"
|
|
||||||
exclude:
|
|
||||||
- "**/*.py"
|
|
||||||
|
|
||||||
scan:
|
|
||||||
functions: true # check free functions and methods
|
|
||||||
classes: true # check classes/dataclasses
|
|
||||||
public_only: true # ignore names starting with "_" at any level
|
|
||||||
|
|
||||||
annotations:
|
|
||||||
include: # decorators that force‑include a symbol
|
|
||||||
- name: "bc_linter_include" # matched by simple name or dotted suffix
|
|
||||||
propagate_to_members: false # for classes, include methods/inner classes
|
|
||||||
exclude: # decorators that force‑exclude a symbol
|
|
||||||
- name: "bc_linter_skip" # matched by simple name or dotted suffix
|
|
||||||
propagate_to_members: true # for classes, exclude methods/inner classes
|
|
||||||
|
|
||||||
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
|
|
||||||
63
.github/CODEOWNERS
vendored
63
.github/CODEOWNERS
vendored
@@ -2,43 +2,66 @@
|
|||||||
# for more info about CODEOWNERS file
|
# for more info about CODEOWNERS file
|
||||||
|
|
||||||
# This lists cover the "core" components of vLLM that require careful review
|
# This lists cover the "core" components of vLLM that require careful review
|
||||||
/vllm/attention @LucasWilkinson
|
/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
|
||||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
|
||||||
|
/vllm/lora @jeejeelee
|
||||||
|
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
|
||||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||||
/vllm/model_executor/layers/mamba @tdoublep
|
/vllm/model_executor/layers/mamba @tdoublep
|
||||||
/vllm/model_executor/model_loader @22quinn
|
/vllm/model_executor/model_loader @22quinn
|
||||||
/vllm/model_executor/layers/batch_invariant.py @yewentao256
|
/vllm/model_executor/layers/batch_invariant.py @yewentao256
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
|
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
|
||||||
/vllm/lora @jeejeelee
|
|
||||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
|
||||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
|
||||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
|
||||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
|
||||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
|
||||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||||
|
|
||||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||||
# so spam a lot of people
|
# so spam a lot of people
|
||||||
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||||
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
/vllm/config/cache.py @heheda12345
|
||||||
|
|
||||||
|
# Entrypoints
|
||||||
|
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
|
||||||
|
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
|
||||||
|
/vllm/entrypoints/mcp @heheda12345
|
||||||
|
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
|
||||||
|
/vllm/entrypoints/openai/realtime @njhill
|
||||||
|
/vllm/entrypoints/openai/speech_to_text @NickLucche
|
||||||
|
/vllm/entrypoints/pooling @noooop
|
||||||
|
/vllm/entrypoints/sagemaker @DarkLight1337
|
||||||
|
/vllm/entrypoints/serve @njhill
|
||||||
|
/vllm/entrypoints/*.py @njhill
|
||||||
|
/vllm/entrypoints/chat_utils.py @DarkLight1337
|
||||||
|
/vllm/entrypoints/llm.py @DarkLight1337
|
||||||
|
|
||||||
|
# Input/Output Processing
|
||||||
|
/vllm/sampling_params.py @njhill @NickLucche
|
||||||
|
/vllm/pooling_params.py @noooop @DarkLight1337
|
||||||
|
/vllm/tokenizers @DarkLight1337 @njhill
|
||||||
|
/vllm/renderers @DarkLight1337 @njhill
|
||||||
|
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||||
|
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1/attention @LucasWilkinson
|
/vllm/v1/attention @LucasWilkinson @MatthewBonanni
|
||||||
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
||||||
/vllm/v1/attention/backends/mla @pavanimajety
|
/vllm/v1/attention/backends/mla @pavanimajety
|
||||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||||
/vllm/v1/spec_decode @benchislett @luccafong
|
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
|
||||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||||
/vllm/v1/offloading @ApostaC
|
/vllm/v1/kv_offload @ApostaC @orozery
|
||||||
|
/vllm/v1/engine @njhill
|
||||||
|
/vllm/v1/executor @njhill
|
||||||
|
/vllm/v1/worker @njhill
|
||||||
|
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
|
||||||
|
|
||||||
# Model runner V2
|
# Model runner V2
|
||||||
/vllm/v1/worker/gpu @WoosukKwon
|
/vllm/v1/worker/gpu @WoosukKwon @njhill
|
||||||
|
/vllm/v1/worker/gpu/kv_connector.py @orozery
|
||||||
|
|
||||||
# Test ownership
|
# Test ownership
|
||||||
/.buildkite/lm-eval-harness @mgoin
|
/.buildkite/lm-eval-harness @mgoin
|
||||||
@@ -54,13 +77,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||||
/tests/lora @jeejeelee
|
/tests/lora @jeejeelee
|
||||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||||
/tests/v1/kv_connector @ApostaC
|
/tests/v1/kv_connector @ApostaC @orozery
|
||||||
/tests/v1/offloading @ApostaC
|
/tests/v1/kv_offload @ApostaC @orozery
|
||||||
/tests/v1/determinism @yewentao256
|
/tests/v1/determinism @yewentao256
|
||||||
|
|
||||||
# Transformers modeling backend
|
# Transformers modeling backend
|
||||||
@@ -113,8 +136,8 @@ mkdocs.yaml @hmellor
|
|||||||
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
|
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
|
||||||
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
|
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
|
||||||
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
|
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
|
||||||
|
/vllm/tokenizers/mistral.py @patrickvonplaten
|
||||||
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
|
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
|
||||||
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
|
|
||||||
|
|
||||||
# Kernels
|
# Kernels
|
||||||
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
||||||
@@ -150,9 +173,7 @@ mkdocs.yaml @hmellor
|
|||||||
/examples/pooling @noooop
|
/examples/pooling @noooop
|
||||||
/tests/models/*/pooling* @noooop
|
/tests/models/*/pooling* @noooop
|
||||||
/tests/entrypoints/pooling @noooop
|
/tests/entrypoints/pooling @noooop
|
||||||
/vllm/entrypoints/pooling @noooop
|
|
||||||
/vllm/config/pooler.py @noooop
|
/vllm/config/pooler.py @noooop
|
||||||
/vllm/pooling_params.py @noooop
|
|
||||||
/vllm/model_executor/layers/pooler @noooop
|
/vllm/model_executor/layers/pooler @noooop
|
||||||
|
|
||||||
# Security guide and policies
|
# Security guide and policies
|
||||||
|
|||||||
3
.github/mergify.yml
vendored
3
.github/mergify.yml
vendored
@@ -259,8 +259,7 @@ pull_request_rules:
|
|||||||
- files=benchmarks/run_structured_output_benchmark.sh
|
- files=benchmarks/run_structured_output_benchmark.sh
|
||||||
- files=docs/features/structured_outputs.md
|
- files=docs/features/structured_outputs.md
|
||||||
- files=examples/offline_inference/structured_outputs.py
|
- files=examples/offline_inference/structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
- files=examples/online_serving/structured_outputs/structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
|
||||||
- files~=^tests/v1/structured_output/
|
- files~=^tests/v1/structured_output/
|
||||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||||
- files~=^vllm/v1/structured_output/
|
- files~=^vllm/v1/structured_output/
|
||||||
|
|||||||
29
.github/workflows/bc-lint.yml
vendored
29
.github/workflows/bc-lint.yml
vendored
@@ -1,29 +0,0 @@
|
|||||||
name: BC Lint
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types:
|
|
||||||
- opened
|
|
||||||
- synchronize
|
|
||||||
- reopened
|
|
||||||
- labeled
|
|
||||||
- unlabeled
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
bc_lint:
|
|
||||||
if: github.repository_owner == 'vllm-project'
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Run BC Lint Action
|
|
||||||
uses: pytorch/test-infra/.github/actions/bc-lint@main
|
|
||||||
with:
|
|
||||||
repo: ${{ github.event.pull_request.head.repo.full_name }}
|
|
||||||
base_sha: ${{ github.event.pull_request.base.sha }}
|
|
||||||
head_sha: ${{ github.event.pull_request.head.sha }}
|
|
||||||
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
|
|
||||||
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
|
|
||||||
config_dir: .github
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
1
.github/workflows/cleanup_pr_body.yml
vendored
1
.github/workflows/cleanup_pr_body.yml
vendored
@@ -19,6 +19,7 @@ jobs:
|
|||||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||||
with:
|
with:
|
||||||
python-version: '3.12'
|
python-version: '3.12'
|
||||||
|
cache: 'pip'
|
||||||
|
|
||||||
- name: Install Python dependencies
|
- name: Install Python dependencies
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -3,6 +3,8 @@
|
|||||||
|
|
||||||
# vllm-flash-attn built from source
|
# vllm-flash-attn built from source
|
||||||
vllm/vllm_flash_attn/*
|
vllm/vllm_flash_attn/*
|
||||||
|
!vllm/vllm_flash_attn/__init__.py
|
||||||
|
!vllm/vllm_flash_attn/flash_attn_interface.py
|
||||||
|
|
||||||
# OpenAI triton kernels copied from source
|
# OpenAI triton kernels copied from source
|
||||||
vllm/third_party/triton_kernels/*
|
vllm/third_party/triton_kernels/*
|
||||||
@@ -238,3 +240,6 @@ ep_kernels_workspace/
|
|||||||
vllm/grpc/vllm_engine_pb2.py
|
vllm/grpc/vllm_engine_pb2.py
|
||||||
vllm/grpc/vllm_engine_pb2_grpc.py
|
vllm/grpc/vllm_engine_pb2_grpc.py
|
||||||
vllm/grpc/vllm_engine_pb2.pyi
|
vllm/grpc/vllm_engine_pb2.pyi
|
||||||
|
|
||||||
|
# Ignore generated cpu headers
|
||||||
|
csrc/cpu/cpu_attn_dispatch_generated.h
|
||||||
|
|||||||
@@ -121,24 +121,9 @@ repos:
|
|||||||
name: Update Dockerfile dependency graph
|
name: Update Dockerfile dependency graph
|
||||||
entry: tools/pre_commit/update-dockerfile-graph.sh
|
entry: tools/pre_commit/update-dockerfile-graph.sh
|
||||||
language: script
|
language: script
|
||||||
- id: enforce-import-regex-instead-of-re
|
- id: check-forbidden-imports
|
||||||
name: Enforce import regex as re
|
name: Check for forbidden imports
|
||||||
entry: python tools/pre_commit/enforce_regex_import.py
|
entry: python tools/pre_commit/check_forbidden_imports.py
|
||||||
language: python
|
|
||||||
types: [python]
|
|
||||||
pass_filenames: false
|
|
||||||
additional_dependencies: [regex]
|
|
||||||
# forbid directly import triton
|
|
||||||
- id: forbid-direct-triton-import
|
|
||||||
name: "Forbid direct 'import triton'"
|
|
||||||
entry: python tools/pre_commit/check_triton_import.py
|
|
||||||
language: python
|
|
||||||
types: [python]
|
|
||||||
pass_filenames: false
|
|
||||||
additional_dependencies: [regex]
|
|
||||||
- id: check-pickle-imports
|
|
||||||
name: Prevent new pickle/cloudpickle imports
|
|
||||||
entry: python tools/pre_commit/check_pickle_imports.py
|
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
@@ -154,6 +139,15 @@ repos:
|
|||||||
files: ^docker/(Dockerfile|versions\.json)$
|
files: ^docker/(Dockerfile|versions\.json)$
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
additional_dependencies: [dockerfile-parse]
|
additional_dependencies: [dockerfile-parse]
|
||||||
|
- id: attention-backend-docs
|
||||||
|
name: Check attention backend documentation is up to date
|
||||||
|
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
|
||||||
|
language: python
|
||||||
|
- id: check-boolean-context-manager
|
||||||
|
name: Check for boolean ops in with-statements
|
||||||
|
entry: python tools/pre_commit/check_boolean_context_manager.py
|
||||||
|
language: python
|
||||||
|
types: [python]
|
||||||
# Keep `suggestion` last
|
# Keep `suggestion` last
|
||||||
- id: suggestion
|
- id: suggestion
|
||||||
name: Suggestion
|
name: Suggestion
|
||||||
|
|||||||
@@ -9,13 +9,14 @@ build:
|
|||||||
python: "3.12"
|
python: "3.12"
|
||||||
jobs:
|
jobs:
|
||||||
post_checkout:
|
post_checkout:
|
||||||
- git fetch --unshallow || true
|
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||||
|
pre_create_environment:
|
||||||
|
- pip install uv
|
||||||
|
create_environment:
|
||||||
|
- uv venv $READTHEDOCS_VIRTUALENV_PATH
|
||||||
|
install:
|
||||||
|
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
|
||||||
|
|
||||||
mkdocs:
|
mkdocs:
|
||||||
configuration: mkdocs.yaml
|
configuration: mkdocs.yaml
|
||||||
fail_on_warning: true
|
fail_on_warning: true
|
||||||
|
|
||||||
# Optionally declare the Python requirements required to build your docs
|
|
||||||
python:
|
|
||||||
install:
|
|
||||||
- requirements: requirements/docs.txt
|
|
||||||
|
|||||||
@@ -56,8 +56,8 @@ endif()
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from docker/Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@@ -293,6 +293,7 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/fused_qknorm_rope_kernel.cu"
|
"csrc/fused_qknorm_rope_kernel.cu"
|
||||||
"csrc/layernorm_quant_kernels.cu"
|
"csrc/layernorm_quant_kernels.cu"
|
||||||
"csrc/sampler.cu"
|
"csrc/sampler.cu"
|
||||||
|
"csrc/topk.cu"
|
||||||
"csrc/cuda_view.cu"
|
"csrc/cuda_view.cu"
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/w8a8/int8/scaled_quant.cu"
|
"csrc/quantization/w8a8/int8/scaled_quant.cu"
|
||||||
@@ -433,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_SM75_ARCHS)
|
if (MARLIN_SM75_ARCHS)
|
||||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||||
@@ -445,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_FP8_ARCHS)
|
if (MARLIN_FP8_ARCHS)
|
||||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||||
@@ -458,7 +459,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(MARLIN_SRCS
|
set(MARLIN_SRCS
|
||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
|
||||||
"csrc/quantization/marlin/marlin.cu"
|
"csrc/quantization/marlin/marlin.cu"
|
||||||
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
||||||
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
||||||
@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# CUTLASS MoE kernels
|
# CUTLASS MoE kernels
|
||||||
|
|
||||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
||||||
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
# on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
|
||||||
# if it's possible to compile MoE kernels that use its output.
|
# if it's possible to compile MoE kernels that use its output.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||||
@@ -771,6 +771,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
|
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
else()
|
||||||
|
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||||
|
endif()
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
|
||||||
|
set(SRCS
|
||||||
|
"csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
|
||||||
|
"csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
|
||||||
|
message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
|
||||||
|
AND ES_MXFP8_GROUPED_MM_ARCHS)
|
||||||
|
message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.8.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
|
||||||
|
"in CUDA target architectures.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
|
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
else()
|
||||||
|
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||||
|
endif()
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
|
||||||
|
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
|
||||||
|
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
|
||||||
|
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
|
||||||
|
"in CUDA target architectures.")
|
||||||
|
endif()
|
||||||
|
|
||||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||||
@@ -953,7 +998,8 @@ set(VLLM_MOE_EXT_SRC
|
|||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
list(APPEND VLLM_MOE_EXT_SRC
|
list(APPEND VLLM_MOE_EXT_SRC
|
||||||
"csrc/moe/moe_wna16.cu"
|
"csrc/moe/moe_wna16.cu"
|
||||||
"csrc/moe/grouped_topk_kernels.cu")
|
"csrc/moe/grouped_topk_kernels.cu"
|
||||||
|
"csrc/moe/router_gemm.cu")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
@@ -1043,7 +1089,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_MOE_SM75_ARCHS)
|
if (MARLIN_MOE_SM75_ARCHS)
|
||||||
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_MOE_SM75_SRC}"
|
SRCS "${MARLIN_MOE_SM75_SRC}"
|
||||||
@@ -1082,6 +1128,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
" in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# DeepSeek V3 router GEMM kernel - requires SM90+
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
|
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
else()
|
||||||
|
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||||
|
endif()
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
|
||||||
|
set(DSV3_ROUTER_GEMM_SRC
|
||||||
|
"csrc/moe/dsv3_router_gemm_entry.cu"
|
||||||
|
"csrc/moe/dsv3_router_gemm_float_out.cu"
|
||||||
|
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${DSV3_ROUTER_GEMM_SRC}"
|
||||||
|
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
|
||||||
|
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
|
||||||
|
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
|
||||||
|
" (requires SM90+ and CUDA >= 12.0)")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
message(STATUS "Enabling moe extension.")
|
message(STATUS "Enabling moe extension.")
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
|
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
|
||||||
|
|
||||||
For full CLI reference see:
|
For full CLI reference see:
|
||||||
|
|
||||||
|
|||||||
266
benchmarks/attention_benchmarks/README.md
Normal file
266
benchmarks/attention_benchmarks/README.md
Normal file
@@ -0,0 +1,266 @@
|
|||||||
|
# vLLM Attention Benchmarking Suite
|
||||||
|
|
||||||
|
Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd benchmarks/attention_benchmarks
|
||||||
|
|
||||||
|
# Run a pre-configured benchmark
|
||||||
|
python benchmark.py --config configs/mla_decode.yaml
|
||||||
|
python benchmark.py --config configs/mla_mixed_batch.yaml
|
||||||
|
python benchmark.py --config configs/speculative_decode.yaml
|
||||||
|
python benchmark.py --config configs/standard_attention.yaml
|
||||||
|
python benchmark.py --config configs/reorder_threshold.yaml
|
||||||
|
|
||||||
|
# Or run custom benchmarks
|
||||||
|
python benchmark.py \
|
||||||
|
--backends flash flashinfer \
|
||||||
|
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
|
||||||
|
--output-csv results.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
## Simplified Batch Specification Grammar
|
||||||
|
|
||||||
|
Express workloads concisely using query length and sequence length:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
|
||||||
|
"q1s1k" # Decode: 1 token with 1K sequence
|
||||||
|
"8q1s1k" # 8 decode requests
|
||||||
|
"q4s1k" # 4-token extend (e.g., spec decode)
|
||||||
|
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
|
||||||
|
"16q4s1k" # 16 spec decode (4 tokens each)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Grammar Rule
|
||||||
|
|
||||||
|
```text
|
||||||
|
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
|
||||||
|
- count: Number of identical requests (optional, default=1)
|
||||||
|
- q_len: Query length (number of new tokens)
|
||||||
|
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||||
|
- 'k': Multiplies value by 1024
|
||||||
|
|
||||||
|
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
|
||||||
|
|
||||||
|
## Pre-configured Benchmarks
|
||||||
|
|
||||||
|
The suite includes several pre-configured YAML benchmark configurations:
|
||||||
|
|
||||||
|
### MLA Decode Benchmark
|
||||||
|
|
||||||
|
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/mla_decode.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### MLA Mixed Batch Benchmark
|
||||||
|
|
||||||
|
Tests chunked prefill performance with mixed prefill + decode batches.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/mla_mixed_batch.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Speculative Decoding Benchmark
|
||||||
|
|
||||||
|
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/speculative_decode.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Standard Attention Benchmark
|
||||||
|
|
||||||
|
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/standard_attention.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Reorder Threshold Study
|
||||||
|
|
||||||
|
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
|
||||||
|
|
||||||
|
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/reorder_threshold.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Universal Benchmark
|
||||||
|
|
||||||
|
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
|
||||||
|
|
||||||
|
### Standard Attention (Flash/Triton/FlashInfer)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py \
|
||||||
|
--backends flash triton flashinfer \
|
||||||
|
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
|
||||||
|
--num-layers 10 \
|
||||||
|
--repeats 5 \
|
||||||
|
--output-csv results.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### MLA Backends
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Compare all MLA backends
|
||||||
|
python benchmark.py \
|
||||||
|
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
|
||||||
|
--batch-specs "64q1s1k" "64q1s4k" \
|
||||||
|
--output-csv mla_results.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parameter Sweeps
|
||||||
|
|
||||||
|
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
|
||||||
|
|
||||||
|
#### CUTLASS MLA num-splits Optimization
|
||||||
|
|
||||||
|
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py \
|
||||||
|
--backend cutlass_mla \
|
||||||
|
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
|
||||||
|
--sweep-param num_kv_splits \
|
||||||
|
--sweep-values 1 2 4 8 16 \
|
||||||
|
--output-json optimal_splits.json
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Reorder Batch Threshold Optimization
|
||||||
|
|
||||||
|
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py \
|
||||||
|
--backend flashmla \
|
||||||
|
--batch-specs "q4s1k" "q8s2k" \
|
||||||
|
--sweep-param reorder_batch_threshold \
|
||||||
|
--sweep-values 1 4 16 64 256 512 \
|
||||||
|
--output-csv threshold_sweep.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### All Command-Line Options
|
||||||
|
|
||||||
|
```text
|
||||||
|
--config CONFIG # Path to YAML config file (overrides other args)
|
||||||
|
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
|
||||||
|
# flashinfer_mla, flashattn_mla, flashmla
|
||||||
|
--backend BACKEND # Single backend (alternative to --backends)
|
||||||
|
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
|
||||||
|
|
||||||
|
# Model configuration
|
||||||
|
--num-layers N # Number of layers
|
||||||
|
--head-dim N # Head dimension
|
||||||
|
--num-q-heads N # Query heads
|
||||||
|
--num-kv-heads N # KV heads
|
||||||
|
--block-size N # Block size
|
||||||
|
|
||||||
|
# Benchmark settings
|
||||||
|
--device DEVICE # Device (default: cuda:0)
|
||||||
|
--repeats N # Repetitions
|
||||||
|
--warmup-iters N # Warmup iterations
|
||||||
|
--profile-memory # Profile memory usage
|
||||||
|
|
||||||
|
# Parameter sweeps
|
||||||
|
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
|
||||||
|
# reorder_batch_threshold)
|
||||||
|
--sweep-values N [N ...] # Values to sweep for the parameter
|
||||||
|
|
||||||
|
# Output
|
||||||
|
--output-csv FILE # Save to CSV
|
||||||
|
--output-json FILE # Save to JSON
|
||||||
|
```
|
||||||
|
|
||||||
|
## Hardware Requirements
|
||||||
|
|
||||||
|
| Backend | Hardware |
|
||||||
|
|---------|----------|
|
||||||
|
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||||
|
| CUTLASS MLA | Blackwell (SM100+) |
|
||||||
|
| FlashAttn MLA | Hopper (SM90+) |
|
||||||
|
| FlashMLA | Hopper (SM90+) |
|
||||||
|
| FlashInfer-MLA | Any CUDA GPU |
|
||||||
|
|
||||||
|
## Using MLA Runner Directly
|
||||||
|
|
||||||
|
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mla_runner import run_mla_benchmark
|
||||||
|
from common import BenchmarkConfig
|
||||||
|
|
||||||
|
config = BenchmarkConfig(
|
||||||
|
backend="cutlass_mla",
|
||||||
|
batch_spec="64q1s4k",
|
||||||
|
num_layers=10,
|
||||||
|
head_dim=576,
|
||||||
|
num_q_heads=128,
|
||||||
|
num_kv_heads=1,
|
||||||
|
block_size=128,
|
||||||
|
device="cuda:0",
|
||||||
|
repeats=5,
|
||||||
|
warmup_iters=3,
|
||||||
|
)
|
||||||
|
|
||||||
|
# CUTLASS MLA with specific num_kv_splits
|
||||||
|
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
|
||||||
|
print(f"Time: {result.mean_time:.6f}s")
|
||||||
|
|
||||||
|
# FlashInfer-MLA
|
||||||
|
result = run_mla_benchmark("flashinfer_mla", config)
|
||||||
|
|
||||||
|
# FlashAttn MLA (Hopper SM90+)
|
||||||
|
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
|
||||||
|
|
||||||
|
# FlashMLA (Hopper SM90+)
|
||||||
|
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Python API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
|
||||||
|
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
|
||||||
|
|
||||||
|
# Parse batch specs
|
||||||
|
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
|
||||||
|
print(format_batch_spec(requests))
|
||||||
|
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
|
||||||
|
|
||||||
|
# Get batch statistics
|
||||||
|
stats = get_batch_stats(requests)
|
||||||
|
print(f"Total tokens: {stats['total_tokens']}")
|
||||||
|
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
|
||||||
|
|
||||||
|
# Format results
|
||||||
|
formatter = ResultsFormatter()
|
||||||
|
formatter.save_csv(results, "output.csv")
|
||||||
|
formatter.save_json(results, "output.json")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tips
|
||||||
|
|
||||||
|
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
|
||||||
|
|
||||||
|
**2. Multiple repeats** - Use `--repeats 20` for low variance
|
||||||
|
|
||||||
|
**3. Save results** - Always use `--output-csv` or `--output-json`
|
||||||
|
|
||||||
|
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
|
||||||
|
|
||||||
|
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
|
||||||
|
|
||||||
|
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values
|
||||||
42
benchmarks/attention_benchmarks/__init__.py
Normal file
42
benchmarks/attention_benchmarks/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""vLLM Attention Benchmarking Suite."""
|
||||||
|
|
||||||
|
from .batch_spec import (
|
||||||
|
BatchRequest,
|
||||||
|
format_batch_spec,
|
||||||
|
get_batch_stats,
|
||||||
|
parse_batch_spec,
|
||||||
|
reorder_for_flashinfer,
|
||||||
|
split_by_type,
|
||||||
|
)
|
||||||
|
from .common import (
|
||||||
|
BenchmarkConfig,
|
||||||
|
BenchmarkResult,
|
||||||
|
MockLayer,
|
||||||
|
ResultsFormatter,
|
||||||
|
get_attention_scale,
|
||||||
|
is_mla_backend,
|
||||||
|
setup_mla_dims,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Batch specification
|
||||||
|
"BatchRequest",
|
||||||
|
"parse_batch_spec",
|
||||||
|
"format_batch_spec",
|
||||||
|
"reorder_for_flashinfer",
|
||||||
|
"split_by_type",
|
||||||
|
"get_batch_stats",
|
||||||
|
# Benchmarking infrastructure
|
||||||
|
"BenchmarkConfig",
|
||||||
|
"BenchmarkResult",
|
||||||
|
"ResultsFormatter",
|
||||||
|
# Mock objects
|
||||||
|
"MockLayer",
|
||||||
|
# Utilities
|
||||||
|
"setup_mla_dims",
|
||||||
|
"get_attention_scale",
|
||||||
|
"is_mla_backend",
|
||||||
|
]
|
||||||
268
benchmarks/attention_benchmarks/batch_spec.py
Normal file
268
benchmarks/attention_benchmarks/batch_spec.py
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Simplified batch specification grammar for attention benchmarks.
|
||||||
|
|
||||||
|
Grammar (underscore-separated segments):
|
||||||
|
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
|
||||||
|
- count: Number of identical requests (optional, default=1)
|
||||||
|
- q_len: Query length (number of new tokens)
|
||||||
|
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||||
|
- 'k' suffix: Multiplies value by 1024
|
||||||
|
|
||||||
|
Common patterns:
|
||||||
|
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
|
||||||
|
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
|
||||||
|
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
|
||||||
|
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
|
||||||
|
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
|
||||||
|
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
|
||||||
|
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
|
||||||
|
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BatchRequest:
|
||||||
|
"""Represents a single request in a batch."""
|
||||||
|
|
||||||
|
q_len: int # Query length (number of new tokens)
|
||||||
|
kv_len: int # Total KV cache length
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_decode(self) -> bool:
|
||||||
|
"""True if this is a decode request (q_len == 1)."""
|
||||||
|
return self.q_len == 1
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_prefill(self) -> bool:
|
||||||
|
"""True if this is a pure prefill (q_len == kv_len)."""
|
||||||
|
return self.q_len == self.kv_len
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_extend(self) -> bool:
|
||||||
|
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
|
||||||
|
return self.q_len > 1 and self.kv_len > self.q_len
|
||||||
|
|
||||||
|
@property
|
||||||
|
def context_len(self) -> int:
|
||||||
|
"""Context length (KV cache - query)."""
|
||||||
|
return self.kv_len - self.q_len
|
||||||
|
|
||||||
|
def as_tuple(self) -> tuple[int, int]:
|
||||||
|
"""Return as (q_len, kv_len) tuple for compatibility."""
|
||||||
|
return (self.q_len, self.kv_len)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_size(size_str: str, k_suffix: str) -> int:
|
||||||
|
"""Parse size string with optional 'k' suffix."""
|
||||||
|
size = int(size_str)
|
||||||
|
return size * 1024 if k_suffix == "k" else size
|
||||||
|
|
||||||
|
|
||||||
|
def parse_batch_spec(spec: str) -> list[BatchRequest]:
|
||||||
|
"""
|
||||||
|
Parse batch specification string into list of BatchRequest objects.
|
||||||
|
|
||||||
|
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
|
||||||
|
Args:
|
||||||
|
spec: Batch specification string (see module docstring for grammar)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of BatchRequest objects
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If spec format is invalid
|
||||||
|
"""
|
||||||
|
requests = []
|
||||||
|
|
||||||
|
for seg in spec.split("_"):
|
||||||
|
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
|
||||||
|
if m:
|
||||||
|
cnt = int(m.group(1)) if m.group(1) else 1
|
||||||
|
q_len = _parse_size(m.group(2), m.group(3))
|
||||||
|
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
|
||||||
|
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
|
||||||
|
continue
|
||||||
|
|
||||||
|
raise ValueError(f"Invalid batch spec segment: '{seg}'")
|
||||||
|
|
||||||
|
return requests
|
||||||
|
|
||||||
|
|
||||||
|
def format_batch_spec(requests: list[BatchRequest]) -> str:
|
||||||
|
"""
|
||||||
|
Format list of BatchRequest into human-readable string.
|
||||||
|
|
||||||
|
Groups requests by type and provides counts and sizes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: List of BatchRequest objects
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted string describing the batch
|
||||||
|
"""
|
||||||
|
kinds = {
|
||||||
|
"prefill": [],
|
||||||
|
"extend": [],
|
||||||
|
"decode": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
for req in requests:
|
||||||
|
tup = (req.q_len, req.kv_len)
|
||||||
|
if req.is_prefill:
|
||||||
|
kinds["prefill"].append(tup)
|
||||||
|
elif req.is_extend:
|
||||||
|
kinds["extend"].append(tup)
|
||||||
|
elif req.is_decode:
|
||||||
|
kinds["decode"].append(tup)
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
for kind in ["prefill", "extend", "decode"]:
|
||||||
|
lst = kinds[kind]
|
||||||
|
if not lst:
|
||||||
|
continue
|
||||||
|
|
||||||
|
cnt_total = len(lst)
|
||||||
|
ctr = Counter(lst)
|
||||||
|
inner = []
|
||||||
|
|
||||||
|
for (q, kv), cnt in ctr.items():
|
||||||
|
if kind == "prefill":
|
||||||
|
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||||
|
inner.append(f"{cnt}x{size}")
|
||||||
|
elif kind == "decode":
|
||||||
|
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||||
|
inner.append(f"{cnt}x{size}")
|
||||||
|
else: # extend
|
||||||
|
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||||
|
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||||
|
inner.append(f"{cnt}xq{qstr}kv{kstr}")
|
||||||
|
|
||||||
|
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
|
||||||
|
|
||||||
|
return ", ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
|
||||||
|
"""
|
||||||
|
Reorder requests for FlashInfer: decode first, then prefill.
|
||||||
|
|
||||||
|
FlashInfer expects decode requests before prefill requests for
|
||||||
|
optimal performance.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: Original list of BatchRequest
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Reordered list with decode requests first
|
||||||
|
"""
|
||||||
|
decodes = [r for r in requests if r.is_decode]
|
||||||
|
non_decodes = [r for r in requests if not r.is_decode]
|
||||||
|
return decodes + non_decodes
|
||||||
|
|
||||||
|
|
||||||
|
def split_by_type(
|
||||||
|
requests: list[BatchRequest],
|
||||||
|
) -> dict[str, list[BatchRequest]]:
|
||||||
|
"""
|
||||||
|
Split requests by type for analysis.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: List of BatchRequest
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys: 'decode', 'prefill', 'extend'
|
||||||
|
"""
|
||||||
|
result = {
|
||||||
|
"decode": [],
|
||||||
|
"prefill": [],
|
||||||
|
"extend": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
for req in requests:
|
||||||
|
if req.is_decode:
|
||||||
|
result["decode"].append(req)
|
||||||
|
elif req.is_prefill:
|
||||||
|
result["prefill"].append(req)
|
||||||
|
elif req.is_extend:
|
||||||
|
result["extend"].append(req)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_batch_stats(requests: list[BatchRequest]) -> dict:
|
||||||
|
"""
|
||||||
|
Compute statistics about a batch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: List of BatchRequest
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with batch statistics
|
||||||
|
"""
|
||||||
|
by_type = split_by_type(requests)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total_requests": len(requests),
|
||||||
|
"num_decode": len(by_type["decode"]),
|
||||||
|
"num_prefill": len(by_type["prefill"]),
|
||||||
|
"num_extend": len(by_type["extend"]),
|
||||||
|
"total_tokens": sum(r.q_len for r in requests),
|
||||||
|
"total_kv_cache": sum(r.kv_len for r in requests),
|
||||||
|
"max_q_len": max((r.q_len for r in requests), default=0),
|
||||||
|
"max_kv_len": max((r.kv_len for r in requests), default=0),
|
||||||
|
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
|
||||||
|
"avg_kv_len": (
|
||||||
|
sum(r.kv_len for r in requests) / len(requests) if requests else 0
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
|
||||||
|
"""
|
||||||
|
Classify a batch spec into a type string.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
|
||||||
|
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
|
||||||
|
"""
|
||||||
|
requests = parse_batch_spec(batch_spec)
|
||||||
|
|
||||||
|
# Classify each request
|
||||||
|
types_present = set()
|
||||||
|
for req in requests:
|
||||||
|
if req.is_decode:
|
||||||
|
types_present.add("decode")
|
||||||
|
elif req.is_prefill:
|
||||||
|
types_present.add("prefill")
|
||||||
|
elif req.is_extend:
|
||||||
|
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
|
||||||
|
if req.q_len <= spec_decode_threshold:
|
||||||
|
types_present.add("spec-decode")
|
||||||
|
else:
|
||||||
|
types_present.add("extend")
|
||||||
|
|
||||||
|
if len(types_present) == 1:
|
||||||
|
return types_present.pop()
|
||||||
|
elif len(types_present) > 1:
|
||||||
|
# Sort for consistent output
|
||||||
|
sorted_types = sorted(types_present)
|
||||||
|
return f"mixed ({'+'.join(sorted_types)})"
|
||||||
|
else:
|
||||||
|
return "unknown"
|
||||||
895
benchmarks/attention_benchmarks/benchmark.py
Normal file
895
benchmarks/attention_benchmarks/benchmark.py
Normal file
@@ -0,0 +1,895 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Universal vLLM Attention Benchmark
|
||||||
|
|
||||||
|
Benchmark any attention backend with the extended grammar.
|
||||||
|
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
# Standard attention
|
||||||
|
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
|
||||||
|
|
||||||
|
# MLA backends
|
||||||
|
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
|
||||||
|
|
||||||
|
# Parameter sweep (CLI)
|
||||||
|
python benchmark.py --backend cutlass_mla \
|
||||||
|
--batch-specs "64q1s1k" \
|
||||||
|
--sweep-param num_kv_splits \
|
||||||
|
--sweep-values 1 4 8 16
|
||||||
|
|
||||||
|
# Parameter sweep (YAML config - recommended)
|
||||||
|
python benchmark.py --config configs/cutlass_numsplits.yaml
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from dataclasses import replace
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from rich.console import Console
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
from batch_spec import parse_batch_spec
|
||||||
|
from common import (
|
||||||
|
BenchmarkConfig,
|
||||||
|
BenchmarkResult,
|
||||||
|
ModelParameterSweep,
|
||||||
|
ParameterSweep,
|
||||||
|
ResultsFormatter,
|
||||||
|
batch_spec_sort_key,
|
||||||
|
is_mla_backend,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||||
|
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
||||||
|
from runner import run_attention_benchmark
|
||||||
|
|
||||||
|
return run_attention_benchmark(config)
|
||||||
|
|
||||||
|
|
||||||
|
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||||
|
"""Run MLA benchmark with appropriate backend."""
|
||||||
|
from mla_runner import run_mla_benchmark as run_mla
|
||||||
|
|
||||||
|
return run_mla(config.backend, config, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||||
|
"""
|
||||||
|
Run a single benchmark with proper backend selection.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: BenchmarkConfig with backend, batch_spec, and model params
|
||||||
|
**kwargs: Additional arguments passed to MLA benchmarks
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
BenchmarkResult (may have error field set on failure)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if is_mla_backend(config.backend):
|
||||||
|
return run_mla_benchmark(config, **kwargs)
|
||||||
|
else:
|
||||||
|
return run_standard_attention_benchmark(config)
|
||||||
|
except Exception as e:
|
||||||
|
return BenchmarkResult(
|
||||||
|
config=config,
|
||||||
|
mean_time=float("inf"),
|
||||||
|
std_time=0,
|
||||||
|
min_time=float("inf"),
|
||||||
|
max_time=float("inf"),
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_model_parameter_sweep(
|
||||||
|
backends: list[str],
|
||||||
|
batch_specs: list[str],
|
||||||
|
base_config_args: dict,
|
||||||
|
sweep: ModelParameterSweep,
|
||||||
|
console: Console,
|
||||||
|
) -> list[BenchmarkResult]:
|
||||||
|
"""
|
||||||
|
Run model parameter sweep for given backends and batch specs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
backends: List of backend names
|
||||||
|
batch_specs: List of batch specifications
|
||||||
|
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||||
|
sweep: ModelParameterSweep configuration
|
||||||
|
console: Rich console for output
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of BenchmarkResult objects
|
||||||
|
"""
|
||||||
|
all_results = []
|
||||||
|
|
||||||
|
console.print(
|
||||||
|
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
total = len(backends) * len(batch_specs) * len(sweep.values)
|
||||||
|
|
||||||
|
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||||
|
for backend in backends:
|
||||||
|
for spec in batch_specs:
|
||||||
|
for value in sweep.values:
|
||||||
|
# Create config with modified model parameter
|
||||||
|
config_args = base_config_args.copy()
|
||||||
|
config_args[sweep.param_name] = value
|
||||||
|
|
||||||
|
# Create config with original backend for running
|
||||||
|
clean_config = BenchmarkConfig(
|
||||||
|
backend=backend, batch_spec=spec, **config_args
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run benchmark
|
||||||
|
result = run_benchmark(clean_config)
|
||||||
|
|
||||||
|
# Replace backend with labeled version for display
|
||||||
|
backend_label = sweep.get_label(backend, value)
|
||||||
|
labeled_config = replace(result.config, backend=backend_label)
|
||||||
|
result = replace(result, config=labeled_config)
|
||||||
|
all_results.append(result)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
console.print(
|
||||||
|
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||||
|
f"{value}: {result.error}[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
|
# Display sweep results - create separate table for each parameter value
|
||||||
|
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
|
||||||
|
formatter = ResultsFormatter(console)
|
||||||
|
|
||||||
|
# Group results by parameter value and extract backend mapping
|
||||||
|
by_param_value = {}
|
||||||
|
backend_mapping = {} # Maps labeled backend -> original backend
|
||||||
|
|
||||||
|
for r in all_results:
|
||||||
|
# Extract original backend and param value from labeled backend
|
||||||
|
# The label format is: {backend}_{param_name}_{value}
|
||||||
|
# We need to reverse engineer this
|
||||||
|
labeled_backend = r.config.backend
|
||||||
|
|
||||||
|
# Try each backend to find which one this result belongs to
|
||||||
|
for backend in backends:
|
||||||
|
for value in sweep.values:
|
||||||
|
expected_label = sweep.get_label(backend, value)
|
||||||
|
if labeled_backend == expected_label:
|
||||||
|
backend_mapping[labeled_backend] = backend
|
||||||
|
param_value = str(value)
|
||||||
|
|
||||||
|
if param_value not in by_param_value:
|
||||||
|
by_param_value[param_value] = []
|
||||||
|
by_param_value[param_value].append(r)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Create a table for each parameter value
|
||||||
|
sorted_param_values = sorted(
|
||||||
|
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
|
||||||
|
)
|
||||||
|
|
||||||
|
for param_value in sorted_param_values:
|
||||||
|
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
|
||||||
|
param_results = by_param_value[param_value]
|
||||||
|
|
||||||
|
# Create modified results with original backend names
|
||||||
|
modified_results = []
|
||||||
|
for r in param_results:
|
||||||
|
# Get the original backend name from our mapping
|
||||||
|
original_backend = backend_mapping[r.config.backend]
|
||||||
|
modified_config = replace(r.config, backend=original_backend)
|
||||||
|
modified_result = replace(r, config=modified_config)
|
||||||
|
modified_results.append(modified_result)
|
||||||
|
|
||||||
|
# Print table with original backend names
|
||||||
|
formatter.print_table(modified_results, backends, compare_to_fastest=True)
|
||||||
|
|
||||||
|
# Show optimal backend for each (param_value, batch_spec) combination
|
||||||
|
console.print(
|
||||||
|
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Group by (param_value, batch_spec)
|
||||||
|
by_param_and_spec = {}
|
||||||
|
for r in all_results:
|
||||||
|
if r.success:
|
||||||
|
# Find which (backend, value) this result corresponds to
|
||||||
|
labeled_backend = r.config.backend
|
||||||
|
for backend in backends:
|
||||||
|
for value in sweep.values:
|
||||||
|
expected_label = sweep.get_label(backend, value)
|
||||||
|
if labeled_backend == expected_label:
|
||||||
|
param_value = str(value)
|
||||||
|
spec = r.config.batch_spec
|
||||||
|
key = (param_value, spec)
|
||||||
|
|
||||||
|
if key not in by_param_and_spec:
|
||||||
|
by_param_and_spec[key] = []
|
||||||
|
by_param_and_spec[key].append(r)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Sort by param value then spec (batch_size, q_len, kv_len)
|
||||||
|
sorted_keys = sorted(
|
||||||
|
by_param_and_spec.keys(),
|
||||||
|
key=lambda x: (
|
||||||
|
int(x[0]) if x[0].isdigit() else x[0],
|
||||||
|
batch_spec_sort_key(x[1]),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
current_param_value = None
|
||||||
|
for param_value, spec in sorted_keys:
|
||||||
|
# Print header when param value changes
|
||||||
|
if param_value != current_param_value:
|
||||||
|
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
|
||||||
|
current_param_value = param_value
|
||||||
|
|
||||||
|
results = by_param_and_spec[(param_value, spec)]
|
||||||
|
best = min(results, key=lambda r: r.mean_time)
|
||||||
|
|
||||||
|
# Extract original backend name using the mapping
|
||||||
|
backend_name = backend_mapping[best.config.backend]
|
||||||
|
|
||||||
|
# Show all backends' times for comparison
|
||||||
|
times_str = " | ".join(
|
||||||
|
[
|
||||||
|
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
|
||||||
|
for r in sorted(results, key=lambda r: r.mean_time)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
console.print(
|
||||||
|
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return all_results
|
||||||
|
|
||||||
|
|
||||||
|
def run_parameter_sweep(
|
||||||
|
backends: list[str],
|
||||||
|
batch_specs: list[str],
|
||||||
|
base_config_args: dict,
|
||||||
|
sweep: ParameterSweep,
|
||||||
|
console: Console,
|
||||||
|
) -> list[BenchmarkResult]:
|
||||||
|
"""
|
||||||
|
Run parameter sweep for given backends and batch specs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
backends: List of backend names
|
||||||
|
batch_specs: List of batch specifications
|
||||||
|
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||||
|
sweep: ParameterSweep configuration
|
||||||
|
console: Rich console for output
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of BenchmarkResult objects
|
||||||
|
"""
|
||||||
|
all_results = []
|
||||||
|
|
||||||
|
# Build list of values to sweep (including auto if requested)
|
||||||
|
sweep_values = list(sweep.values)
|
||||||
|
if sweep.include_auto:
|
||||||
|
sweep_values.append("auto")
|
||||||
|
|
||||||
|
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
|
||||||
|
|
||||||
|
total = len(backends) * len(batch_specs) * len(sweep_values)
|
||||||
|
|
||||||
|
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||||
|
for backend in backends:
|
||||||
|
for spec in batch_specs:
|
||||||
|
for value in sweep_values:
|
||||||
|
# Create config with original backend for running
|
||||||
|
config = BenchmarkConfig(
|
||||||
|
backend=backend, batch_spec=spec, **base_config_args
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare kwargs for benchmark runner
|
||||||
|
kwargs = {}
|
||||||
|
if value != "auto":
|
||||||
|
kwargs[sweep.param_name] = value
|
||||||
|
|
||||||
|
# Run benchmark
|
||||||
|
result = run_benchmark(config, **kwargs)
|
||||||
|
|
||||||
|
# Replace backend with labeled version for display
|
||||||
|
backend_label = sweep.get_label(backend, value)
|
||||||
|
labeled_config = replace(result.config, backend=backend_label)
|
||||||
|
result = replace(result, config=labeled_config)
|
||||||
|
all_results.append(result)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
console.print(
|
||||||
|
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||||
|
f"{value}: {result.error}[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
|
# Display sweep results
|
||||||
|
console.print("\n[bold green]Sweep Results:[/]")
|
||||||
|
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
|
||||||
|
formatter = ResultsFormatter(console)
|
||||||
|
formatter.print_table(all_results, backend_labels)
|
||||||
|
|
||||||
|
# Show optimal values
|
||||||
|
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
|
||||||
|
by_spec = {}
|
||||||
|
for r in all_results:
|
||||||
|
if r.success:
|
||||||
|
spec = r.config.batch_spec
|
||||||
|
if spec not in by_spec:
|
||||||
|
by_spec[spec] = []
|
||||||
|
by_spec[spec].append(r)
|
||||||
|
|
||||||
|
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
|
||||||
|
results = by_spec[spec]
|
||||||
|
best = min(results, key=lambda r: r.mean_time)
|
||||||
|
console.print(
|
||||||
|
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
|
||||||
|
)
|
||||||
|
|
||||||
|
return all_results
|
||||||
|
|
||||||
|
|
||||||
|
def load_config_from_yaml(config_path: str) -> dict:
|
||||||
|
"""Load configuration from YAML file."""
|
||||||
|
with open(config_path) as f:
|
||||||
|
return yaml.safe_load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
|
||||||
|
"""
|
||||||
|
Generate batch specs from range specifications.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ranges: List of range specifications, each containing:
|
||||||
|
- template: Batch spec template (e.g., "q{q_len}kv1k")
|
||||||
|
- q_len: Dict with start, stop, step, end_inclusive (optional)
|
||||||
|
- Other parameters can also be ranges
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of generated batch spec strings
|
||||||
|
|
||||||
|
Example:
|
||||||
|
ranges = [
|
||||||
|
{
|
||||||
|
"template": "q{q_len}kv1k",
|
||||||
|
"q_len": {
|
||||||
|
"start": 1,
|
||||||
|
"stop": 16,
|
||||||
|
"step": 1,
|
||||||
|
"end_inclusive": true # Optional, defaults to true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
|
||||||
|
"""
|
||||||
|
all_specs = []
|
||||||
|
|
||||||
|
for range_spec in ranges:
|
||||||
|
template = range_spec.get("template")
|
||||||
|
if not template:
|
||||||
|
raise ValueError("Range specification must include 'template'")
|
||||||
|
|
||||||
|
# Extract all range parameters from the spec
|
||||||
|
range_params = {}
|
||||||
|
for key, value in range_spec.items():
|
||||||
|
if key == "template":
|
||||||
|
continue
|
||||||
|
if isinstance(value, dict) and "start" in value:
|
||||||
|
# This is a range specification
|
||||||
|
start = value["start"]
|
||||||
|
stop = value["stop"]
|
||||||
|
step = value.get("step", 1)
|
||||||
|
# Check if end should be inclusive (default: True)
|
||||||
|
end_inclusive = value.get("end_inclusive", True)
|
||||||
|
|
||||||
|
# Adjust stop based on end_inclusive
|
||||||
|
if end_inclusive:
|
||||||
|
range_params[key] = list(range(start, stop + 1, step))
|
||||||
|
else:
|
||||||
|
range_params[key] = list(range(start, stop, step))
|
||||||
|
else:
|
||||||
|
# This is a fixed value
|
||||||
|
range_params[key] = [value]
|
||||||
|
|
||||||
|
# Generate all combinations (Cartesian product)
|
||||||
|
if range_params:
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
param_names = list(range_params.keys())
|
||||||
|
param_values = [range_params[name] for name in param_names]
|
||||||
|
|
||||||
|
for values in itertools.product(*param_values):
|
||||||
|
params = dict(zip(param_names, values))
|
||||||
|
spec = template.format(**params)
|
||||||
|
all_specs.append(spec)
|
||||||
|
else:
|
||||||
|
# No parameters, just use template as-is
|
||||||
|
all_specs.append(template)
|
||||||
|
|
||||||
|
return all_specs
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point for the universal vLLM attention benchmark.

    Parses command-line arguments, optionally overlays a YAML config
    (CLI-provided backends take precedence), then dispatches to one of
    four modes:
      1. "decode_vs_prefill": batched comparison of decode vs prefill
         pipelines per query length, reporting the crossover threshold.
      2. Model parameter sweep (via ``run_model_parameter_sweep``).
      3. Backend parameter sweep (via ``run_parameter_sweep``).
      4. Normal mode: one benchmark per (batch spec, backend) pair.
    Results are printed as a table and optionally saved to CSV/JSON.
    """
    parser = argparse.ArgumentParser(
        description="Universal vLLM attention benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Config file
    parser.add_argument(
        "--config",
        help="Path to YAML config file (overrides other args)",
    )

    # Backend selection
    parser.add_argument(
        "--backends",
        nargs="+",
        help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
        "flashinfer_mla, flashattn_mla, flashmla)",
    )
    parser.add_argument(
        "--backend",
        help="Single backend (alternative to --backends)",
    )

    # Batch specifications
    parser.add_argument(
        "--batch-specs",
        nargs="+",
        default=["q2k", "8q1s1k"],
        help="Batch specifications using extended grammar",
    )

    # Model config
    parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
    parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
    parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
    parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
    parser.add_argument("--block-size", type=int, default=16, help="Block size")

    # Benchmark settings
    parser.add_argument("--device", default="cuda:0", help="Device")
    parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
    parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
    parser.add_argument("--profile-memory", action="store_true", help="Profile memory")

    # Parameter sweep (use YAML config for advanced sweeps)
    parser.add_argument(
        "--sweep-param",
        help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
    )
    parser.add_argument(
        "--sweep-values",
        type=int,
        nargs="+",
        help="Values to sweep for the parameter",
    )

    # Output
    parser.add_argument("--output-csv", help="Save to CSV")
    parser.add_argument("--output-json", help="Save to JSON")

    args = parser.parse_args()

    console = Console()
    console.print("[bold cyan]vLLM Attention Benchmark[/]")

    # Load config from YAML if provided
    if args.config:
        console.print(f"[yellow]Loading config from: {args.config}[/]")
        yaml_config = load_config_from_yaml(args.config)

        # Show description if available
        if "description" in yaml_config:
            console.print(f"[dim]{yaml_config['description']}[/]")

        # Override args with YAML values, but CLI args take precedence
        # Check if CLI provided backends (they would be non-None and not default)
        cli_backends_provided = args.backends is not None or args.backend is not None

        # Backend(s) - only use YAML if CLI didn't specify
        if not cli_backends_provided:
            if "backend" in yaml_config:
                args.backend = yaml_config["backend"]
                args.backends = None
            elif "backends" in yaml_config:
                args.backends = yaml_config["backends"]
                args.backend = None

        # Check for special modes
        if "mode" in yaml_config:
            args.mode = yaml_config["mode"]
        else:
            args.mode = None

        # Batch specs and sizes
        # Support both explicit batch_specs and generated batch_spec_ranges
        if "batch_spec_ranges" in yaml_config:
            # Generate batch specs from ranges
            generated_specs = generate_batch_specs_from_ranges(
                yaml_config["batch_spec_ranges"]
            )
            # Combine with any explicit batch_specs
            if "batch_specs" in yaml_config:
                args.batch_specs = yaml_config["batch_specs"] + generated_specs
            else:
                args.batch_specs = generated_specs
            console.print(
                f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
            )
        elif "batch_specs" in yaml_config:
            args.batch_specs = yaml_config["batch_specs"]

        # NOTE(review): when the YAML has no "batch_sizes" key this is set to
        # None, and the decode_vs_prefill branch below iterates it directly —
        # confirm configs using that mode always supply batch_sizes.
        if "batch_sizes" in yaml_config:
            args.batch_sizes = yaml_config["batch_sizes"]
        else:
            args.batch_sizes = None

        # Model config
        if "model" in yaml_config:
            model = yaml_config["model"]
            args.num_layers = model.get("num_layers", args.num_layers)
            args.head_dim = model.get("head_dim", args.head_dim)
            args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
            args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
            args.block_size = model.get("block_size", args.block_size)

        # Benchmark settings (top-level keys)
        if "device" in yaml_config:
            args.device = yaml_config["device"]
        if "repeats" in yaml_config:
            args.repeats = yaml_config["repeats"]
        if "warmup_iters" in yaml_config:
            args.warmup_iters = yaml_config["warmup_iters"]
        if "profile_memory" in yaml_config:
            args.profile_memory = yaml_config["profile_memory"]

        # Parameter sweep configuration
        if "parameter_sweep" in yaml_config:
            sweep_config = yaml_config["parameter_sweep"]
            args.parameter_sweep = ParameterSweep(
                param_name=sweep_config["param_name"],
                values=sweep_config["values"],
                include_auto=sweep_config.get("include_auto", False),
                label_format=sweep_config.get(
                    "label_format", "{backend}_{param_name}_{value}"
                ),
            )
        else:
            args.parameter_sweep = None

        # Model parameter sweep configuration
        if "model_parameter_sweep" in yaml_config:
            sweep_config = yaml_config["model_parameter_sweep"]
            args.model_parameter_sweep = ModelParameterSweep(
                param_name=sweep_config["param_name"],
                values=sweep_config["values"],
                label_format=sweep_config.get(
                    "label_format", "{backend}_{param_name}_{value}"
                ),
            )
        else:
            args.model_parameter_sweep = None

        # Output paths from YAML only fill in what the CLI left unset
        if "output" in yaml_config:
            output = yaml_config["output"]
            if "csv" in output and not args.output_csv:
                args.output_csv = output["csv"]
            if "json" in output and not args.output_json:
                args.output_json = output["json"]

    console.print()

    # Handle CLI-based parameter sweep (if not from YAML)
    if (
        (not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
        and args.sweep_param
        and args.sweep_values
    ):
        args.parameter_sweep = ParameterSweep(
            param_name=args.sweep_param,
            values=args.sweep_values,
            include_auto=False,
            label_format="{backend}_{param_name}_{value}",
        )

    # Determine backends: --backends wins, then --backend, then "flash"
    backends = args.backends or ([args.backend] if args.backend else ["flash"])
    console.print(f"Backends: {', '.join(backends)}")
    console.print(f"Batch specs: {', '.join(args.batch_specs)}")
    console.print()

    # Run benchmarks
    all_results = []

    # Handle special mode: decode_vs_prefill comparison
    if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
        console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
        console.print(
            "[dim]For each query length, testing both decode and prefill pipelines[/]"
        )
        console.print("[dim]Using batched execution for optimal performance[/]")

        # Extract batch sizes from config
        batch_sizes = getattr(args, "batch_sizes", [1])
        backend = backends[0]  # Use first backend (should only be one)

        # Calculate total benchmarks
        total = len(batch_sizes)

        with tqdm(total=total, desc="Benchmarking") as pbar:
            for batch_size in batch_sizes:
                # Prepare all configs for this batch size
                configs_with_thresholds = []

                for spec in args.batch_specs:
                    # Parse the batch spec to get query length
                    requests = parse_batch_spec(spec)
                    if not requests:
                        console.print(
                            f"[red]Error: Could not parse batch spec '{spec}'[/]"
                        )
                        continue

                    # Get query length from first request
                    query_length = requests[0].q_len

                    # Create batch spec for this batch size
                    # For batch_size > 1, we need to prepend the count
                    batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec

                    # Create base config (without backend name)
                    base_config = BenchmarkConfig(
                        backend=backend,  # Will be overridden later
                        batch_spec=batch_spec,
                        num_layers=args.num_layers,
                        head_dim=args.head_dim,
                        num_q_heads=args.num_q_heads,
                        num_kv_heads=args.num_kv_heads,
                        block_size=args.block_size,
                        device=args.device,
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
                    )

                    # Add decode pipeline config: threshold == qlen forces
                    # the decode path for this query length
                    decode_threshold = query_length
                    config_decode = replace(
                        base_config,
                        backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
                    )
                    configs_with_thresholds.append((config_decode, decode_threshold))

                    # Add prefill pipeline config if query_length > 1
                    # (threshold == qlen-1 forces the prefill path)
                    if query_length > 1:
                        prefill_threshold = query_length - 1
                        config_prefill = replace(
                            base_config,
                            backend=f"{backend}_prefill_qlen{query_length}"
                            f"_bs{batch_size}",
                        )
                        configs_with_thresholds.append(
                            (config_prefill, prefill_threshold)
                        )

                # Run all benchmarks for this batch size in one go (batched mode)
                try:
                    from mla_runner import run_mla_benchmark as run_mla

                    # Use batched API: pass list of (config, threshold) tuples
                    timing_results = run_mla(backend, configs_with_thresholds)

                    # Create BenchmarkResult objects from timing results
                    for (config, _), timing in zip(
                        configs_with_thresholds, timing_results
                    ):
                        result = BenchmarkResult(
                            config=config,
                            mean_time=timing["mean"],
                            std_time=timing["std"],
                            min_time=timing["min"],
                            max_time=timing["max"],
                            throughput_tokens_per_sec=timing.get("throughput", None),
                        )
                        all_results.append(result)

                except Exception as e:
                    import traceback

                    console.print(
                        f"[red]Error running batched benchmarks for "
                        f"batch_size={batch_size}: {e}[/]"
                    )
                    console.print("[red]Traceback:[/]")
                    traceback.print_exc()
                    # Add error results for all configs
                    for config, _ in configs_with_thresholds:
                        result = BenchmarkResult(
                            config=config,
                            mean_time=float("inf"),
                            std_time=0,
                            min_time=float("inf"),
                            max_time=float("inf"),
                            error=str(e),
                        )
                        all_results.append(result)

                pbar.update(1)

        # Display decode vs prefill results
        console.print("\n[bold green]Decode vs Prefill Results:[/]")

        # Group by batch size (batch size is parsed back out of the
        # "..._bsN" suffix encoded into the backend label above)
        by_batch_size = {}
        for r in all_results:
            if r.success:
                # Extract batch size from backend name
                parts = r.config.backend.split("_")
                bs_part = [p for p in parts if p.startswith("bs")]
                if bs_part:
                    bs = int(bs_part[0][2:])
                    if bs not in by_batch_size:
                        by_batch_size[bs] = []
                    by_batch_size[bs].append(r)

        # For each batch size, analyze crossover point
        for bs in sorted(by_batch_size.keys()):
            console.print(f"\n[bold cyan]Batch size: {bs}[/]")
            results = by_batch_size[bs]

            # Group by query length (parsed from the "qlenN" label part)
            by_qlen = {}
            for r in results:
                parts = r.config.backend.split("_")
                qlen_part = [p for p in parts if p.startswith("qlen")]
                if qlen_part:
                    qlen = int(qlen_part[0][4:])
                    if qlen not in by_qlen:
                        by_qlen[qlen] = {}

                    pipeline = "decode" if "decode" in r.config.backend else "prefill"
                    by_qlen[qlen][pipeline] = r

            # Find crossover point: the largest qlen at which decode
            # was still faster than prefill
            last_decode_faster = None
            for qlen in sorted(by_qlen.keys()):
                pipelines = by_qlen[qlen]
                if "decode" in pipelines and "prefill" in pipelines:
                    decode_time = pipelines["decode"].mean_time
                    prefill_time = pipelines["prefill"].mean_time
                    faster = "decode" if decode_time < prefill_time else "prefill"

                    # Speedup of the faster pipeline over the slower one
                    speedup = (
                        prefill_time / decode_time
                        if decode_time < prefill_time
                        else decode_time / prefill_time
                    )

                    console.print(
                        f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
                        f"prefill={prefill_time:.6f}s -> "
                        f"[bold]{faster}[/] ({speedup:.2f}x)"
                    )

                    if faster == "decode":
                        last_decode_faster = qlen

            if last_decode_faster is not None:
                optimal_threshold = last_decode_faster
                console.print(
                    f"\n [bold green]Optimal threshold for batch_size={bs}: "
                    f"{optimal_threshold}[/]"
                )
                console.print(
                    f" [dim](Use decode pipeline for query_length <= "
                    f"{optimal_threshold})[/]"
                )
            else:
                console.print(
                    f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
                )

    # Handle model parameter sweep mode
    elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
        # Model parameter sweep
        base_config_args = {
            "num_layers": args.num_layers,
            "head_dim": args.head_dim,
            "num_q_heads": args.num_q_heads,
            "num_kv_heads": args.num_kv_heads,
            "block_size": args.block_size,
            "device": args.device,
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
        }
        all_results = run_model_parameter_sweep(
            backends,
            args.batch_specs,
            base_config_args,
            args.model_parameter_sweep,
            console,
        )

    # Handle parameter sweep mode (unified)
    elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
        # Unified parameter sweep
        base_config_args = {
            "num_layers": args.num_layers,
            "head_dim": args.head_dim,
            "num_q_heads": args.num_q_heads,
            "num_kv_heads": args.num_kv_heads,
            "block_size": args.block_size,
            "device": args.device,
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
        }
        all_results = run_parameter_sweep(
            backends, args.batch_specs, base_config_args, args.parameter_sweep, console
        )

    else:
        # Normal mode: compare backends
        total = len(backends) * len(args.batch_specs)

        with tqdm(total=total, desc="Benchmarking") as pbar:
            for spec in args.batch_specs:
                for backend in backends:
                    config = BenchmarkConfig(
                        backend=backend,
                        batch_spec=spec,
                        num_layers=args.num_layers,
                        head_dim=args.head_dim,
                        num_q_heads=args.num_q_heads,
                        num_kv_heads=args.num_kv_heads,
                        block_size=args.block_size,
                        device=args.device,
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
                    )

                    result = run_benchmark(config)
                    all_results.append(result)

                    if not result.success:
                        console.print(f"[red]Error {backend} {spec}: {result.error}[/]")

                    pbar.update(1)

    # Display results
    console.print("\n[bold green]Results:[/]")
    formatter = ResultsFormatter(console)
    formatter.print_table(all_results, backends)

    # Save results
    if all_results:
        formatter = ResultsFormatter(console)
        if args.output_csv:
            formatter.save_csv(all_results, args.output_csv)
        if args.output_json:
            formatter.save_json(all_results, args.output_json)
||||||
|
|
||||||
|
# Run the benchmark CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
475
benchmarks/attention_benchmarks/common.py
Normal file
475
benchmarks/attention_benchmarks/common.py
Normal file
@@ -0,0 +1,475 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""Common utilities for attention benchmarking."""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from batch_spec import get_batch_type, parse_batch_spec
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
|
||||||
|
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
    """Sorting key for a batch spec string: (batch_size, max_q_len, max_kv_len).

    Ordering by this key sorts results by batch size first, then query
    length, then KV sequence length, rather than alphabetically by the
    raw spec string.
    """
    try:
        parsed = parse_batch_spec(spec)
        q_lens = [req.q_len for req in parsed]
        kv_lens = [req.kv_len for req in parsed]
        return (len(parsed), max(q_lens, default=0), max(kv_lens, default=0))
    except Exception:
        # Unparseable specs all collapse to the same lowest key.
        return (0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
# Mock classes for vLLM attention infrastructure
|
||||||
|
|
||||||
|
|
||||||
|
class MockHfConfig:
|
||||||
|
"""Mock HuggingFace config that satisfies vLLM's requirements."""
|
||||||
|
|
||||||
|
def __init__(self, mla_dims: dict, index_topk: int | None = None):
|
||||||
|
self.num_attention_heads = mla_dims["num_q_heads"]
|
||||||
|
self.num_key_value_heads = mla_dims["num_kv_heads"]
|
||||||
|
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
|
||||||
|
self.model_type = "deepseek_v2"
|
||||||
|
self.is_encoder_decoder = False
|
||||||
|
self.kv_lora_rank = mla_dims["kv_lora_rank"]
|
||||||
|
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
|
||||||
|
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
|
||||||
|
self.v_head_dim = mla_dims["v_head_dim"]
|
||||||
|
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
|
||||||
|
if index_topk is not None:
|
||||||
|
self.index_topk = index_topk
|
||||||
|
|
||||||
|
def get_text_config(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
# Import AttentionLayerBase at module level to avoid circular dependencies
|
||||||
|
try:
|
||||||
|
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||||
|
except ImportError:
|
||||||
|
AttentionLayerBase = object # Fallback
|
||||||
|
|
||||||
|
|
||||||
|
class MockKVBProj:
    """Mock KV projection layer for MLA prefill mode.

    Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends:
    maps kv_c_normed to [qk_nope_head_dim + v_head_dim] per head. The
    output is random data of the projected shape (no real weights), which
    is sufficient for timing benchmarks.
    """

    def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
        self.num_heads = num_heads
        self.qk_nope_head_dim = qk_nope_head_dim
        self.v_head_dim = v_head_dim
        # Per-head output width.
        self.out_dim = qk_nope_head_dim + v_head_dim

    def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
        """Produce a mock projection of kv_c_normed.

        Args:
            x: Input tensor [num_tokens, kv_lora_rank]

        Returns:
            One-element tuple (matching the ColumnParallelLinear API)
            holding a tensor of shape
            [num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
            on x's device with x's dtype.
        """
        out_shape = (x.shape[0], self.num_heads, self.out_dim)
        projected = torch.randn(*out_shape, device=x.device, dtype=x.dtype)
        return (projected,)
|
||||||
|
|
||||||
|
|
||||||
|
class MockIndexer:
    """Mock Indexer for sparse MLA backends.

    Exposes the topk_indices_buffer that sparse MLA backends read to
    decide which KV cache slots each token attends to.
    """

    def __init__(
        self,
        max_num_tokens: int,
        topk_tokens: int,
        device: torch.device,
    ):
        self.topk_tokens = topk_tokens
        # One row of top-k slot indices per token, zero-initialized.
        self.topk_indices_buffer = torch.zeros(
            (max_num_tokens, topk_tokens),
            dtype=torch.int32,
            device=device,
        )

    def fill_random_indices(self, num_tokens: int, max_kv_len: int):
        """Fill topk_indices_buffer with random valid indices for benchmarking."""
        buffer = self.topk_indices_buffer
        # Uniform draws in [0, max_kv_len) keep every index a valid KV slot.
        buffer[:num_tokens] = torch.randint(
            0,
            max_kv_len,
            (num_tokens, self.topk_tokens),
            dtype=torch.int32,
            device=buffer.device,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class MockLayer(AttentionLayerBase):
    """Mock attention layer exposing scale parameters and an impl handle.

    Subclasses AttentionLayerBase so it passes isinstance checks in
    get_layers_from_vllm_config when FlashInfer prefill is enabled.
    """

    def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
        # AttentionLayerBase defines no __init__, so super() is not called.
        # Each scale is its own unit tensor, plus a plain-float mirror for
        # kernels that require scalars.
        for attr in ("_q_scale", "_k_scale", "_v_scale"):
            unit = torch.tensor(1.0, device=device)
            setattr(self, attr, unit)
            setattr(self, attr + "_float", float(unit.item()))
        # AttentionImpl for metadata builders to query.
        self.impl = impl
        # KV cache spec returned by get_kv_cache_spec.
        self._kv_cache_spec = kv_cache_spec

    def get_attn_backend(self):
        """Get the attention backend class (required by AttentionLayerBase).

        Always None: this mock carries no real backend.
        """
        return None

    def get_kv_cache_spec(self):
        """Get the KV cache spec (required by AttentionLayerBase)."""
        return self._kv_cache_spec
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ParameterSweep:
    """Configuration for sweeping a backend parameter."""

    # Backend parameter to vary, e.g. "num_kv_splits".
    param_name: str
    # Candidate values to benchmark.
    values: list[Any]
    # When true, also benchmark with the parameter left unset (auto mode).
    include_auto: bool = False
    # Template used to label each (backend, value) result.
    label_format: str = "{backend}_{param_name}_{value}"

    def get_label(self, backend: str, value: Any) -> str:
        """Render the result label for one swept value."""
        fields = {
            "backend": backend,
            "param_name": self.param_name,
            "value": value,
        }
        return self.label_format.format(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ModelParameterSweep:
    """Configuration for sweeping a model configuration parameter."""

    # Model config field to vary, e.g. "num_q_heads".
    param_name: str
    # Candidate values to benchmark.
    values: list[Any]
    # Template used to label each (backend, value) result.
    label_format: str = "{backend}_{param_name}_{value}"

    def get_label(self, backend: str, value: Any) -> str:
        """Render the result label for one swept value."""
        fields = {
            "backend": backend,
            "param_name": self.param_name,
            "value": value,
        }
        return self.label_format.format(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BenchmarkConfig:
    """Configuration for a single benchmark run."""

    backend: str  # attention backend identifier (may carry a label suffix)
    batch_spec: str  # batch specification string, e.g. "q2k" or "8q1s1k"
    num_layers: int  # number of attention layers to simulate
    head_dim: int  # per-head dimension
    num_q_heads: int  # number of query heads
    num_kv_heads: int  # number of key/value heads
    block_size: int  # KV cache block size
    device: str  # torch device string, e.g. "cuda:0"
    dtype: torch.dtype = torch.float16  # compute dtype
    repeats: int = 1  # timed repetitions
    warmup_iters: int = 3  # warmup iterations before timing
    profile_memory: bool = False  # record memory stats when true
    use_cuda_graphs: bool = False  # capture/replay with CUDA graphs

    # MLA-specific
    kv_lora_rank: int | None = None
    qk_nope_head_dim: int | None = None
    qk_rope_head_dim: int | None = None
    v_head_dim: int | None = None

    # Backend-specific tuning
    num_kv_splits: int | None = None  # CUTLASS MLA
    reorder_batch_threshold: int | None = None  # FlashAttn MLA, FlashMLA
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BenchmarkResult:
    """Results from a single benchmark run."""

    config: BenchmarkConfig  # configuration that produced this result
    mean_time: float  # seconds
    std_time: float  # seconds
    min_time: float  # seconds
    max_time: float  # seconds
    throughput_tokens_per_sec: float | None = None
    memory_allocated_mb: float | None = None
    memory_reserved_mb: float | None = None
    error: str | None = None  # error message; None means success

    @property
    def success(self) -> bool:
        """Whether benchmark completed successfully."""
        return self.error is None

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization (e.g. JSON output)."""
        return {
            # Nested config is flattened to plain dicts via dataclasses.asdict.
            "config": asdict(self.config),
            "mean_time": self.mean_time,
            "std_time": self.std_time,
            "min_time": self.min_time,
            "max_time": self.max_time,
            "throughput_tokens_per_sec": self.throughput_tokens_per_sec,
            "memory_allocated_mb": self.memory_allocated_mb,
            "memory_reserved_mb": self.memory_reserved_mb,
            "error": self.error,
        }
|
||||||
|
|
||||||
|
|
||||||
|
class ResultsFormatter:
|
||||||
|
"""Format and display benchmark results."""
|
||||||
|
|
||||||
|
def __init__(self, console: Console | None = None):
|
||||||
|
self.console = console or Console()
|
||||||
|
|
||||||
|
def print_table(
|
||||||
|
self,
|
||||||
|
results: list[BenchmarkResult],
|
||||||
|
backends: list[str],
|
||||||
|
compare_to_fastest: bool = True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Print results as a rich table.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
results: List of BenchmarkResult
|
||||||
|
backends: List of backend names being compared
|
||||||
|
compare_to_fastest: Show percentage comparison to fastest
|
||||||
|
"""
|
||||||
|
# Group by batch spec, preserving first-occurrence order
|
||||||
|
by_spec = {}
|
||||||
|
specs_order = []
|
||||||
|
for r in results:
|
||||||
|
spec = r.config.batch_spec
|
||||||
|
if spec not in by_spec:
|
||||||
|
by_spec[spec] = {}
|
||||||
|
specs_order.append(spec)
|
||||||
|
by_spec[spec][r.config.backend] = r
|
||||||
|
|
||||||
|
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
|
||||||
|
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
|
||||||
|
|
||||||
|
# Create shortened backend names for display
|
||||||
|
def shorten_backend_name(name: str) -> str:
|
||||||
|
"""Shorten long backend names for table display."""
|
||||||
|
# Remove common prefixes
|
||||||
|
name = name.replace("flashattn_mla", "famla")
|
||||||
|
name = name.replace("flashinfer_mla", "fimla")
|
||||||
|
name = name.replace("flashmla", "fmla")
|
||||||
|
name = name.replace("cutlass_mla", "cmla")
|
||||||
|
name = name.replace("numsplits", "ns")
|
||||||
|
return name
|
||||||
|
|
||||||
|
table = Table(title="Attention Benchmark Results")
|
||||||
|
table.add_column("Batch\nSpec", no_wrap=True)
|
||||||
|
table.add_column("Type", no_wrap=True)
|
||||||
|
table.add_column("Batch\nSize", justify="right", no_wrap=True)
|
||||||
|
|
||||||
|
multi = len(backends) > 1
|
||||||
|
for backend in backends:
|
||||||
|
short_name = shorten_backend_name(backend)
|
||||||
|
# Time column
|
||||||
|
col_time = f"{short_name}\nTime (s)"
|
||||||
|
table.add_column(col_time, justify="right", no_wrap=False)
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
# Relative performance column
|
||||||
|
col_rel = f"{short_name}\nvs Best"
|
||||||
|
table.add_column(col_rel, justify="right", no_wrap=False)
|
||||||
|
|
||||||
|
# Add rows
|
||||||
|
for spec in specs_order:
|
||||||
|
spec_results = by_spec[spec]
|
||||||
|
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
|
||||||
|
best_time = min(times.values()) if times else 0.0
|
||||||
|
|
||||||
|
batch_type = get_batch_type(spec)
|
||||||
|
batch_size = len(parse_batch_spec(spec))
|
||||||
|
row = [spec, batch_type, str(batch_size)]
|
||||||
|
for backend in backends:
|
||||||
|
if backend in spec_results:
|
||||||
|
r = spec_results[backend]
|
||||||
|
if r.success:
|
||||||
|
row.append(f"{r.mean_time:.6f}")
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
pct = (
|
||||||
|
(r.mean_time / best_time * 100) if best_time > 0 else 0
|
||||||
|
)
|
||||||
|
pct_str = f"{pct:.1f}%"
|
||||||
|
if r.mean_time == best_time:
|
||||||
|
pct_str = f"[bold green]{pct_str}[/]"
|
||||||
|
row.append(pct_str)
|
||||||
|
else:
|
||||||
|
row.append("[red]ERROR[/]")
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
row.append("-")
|
||||||
|
else:
|
||||||
|
row.append("-")
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
row.append("-")
|
||||||
|
|
||||||
|
table.add_row(*row)
|
||||||
|
|
||||||
|
self.console.print(table)
|
||||||
|
|
||||||
|
def save_csv(self, results: list[BenchmarkResult], path: str):
    """Save benchmark results to a CSV file.

    Creates parent directories as needed. Nothing is written when
    ``results`` is empty.

    Args:
        results: Benchmark results to serialize; one CSV row per result.
        path: Destination file path.
    """
    if not results:
        return

    path_obj = Path(path)
    path_obj.parent.mkdir(parents=True, exist_ok=True)

    # newline="" lets the csv module control line endings; an explicit
    # utf-8 encoding keeps output independent of the platform locale.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "backend",
                "batch_spec",
                "num_layers",
                "mean_time",
                "std_time",
                "throughput",
                "memory_mb",
            ],
        )
        writer.writeheader()
        for r in results:
            writer.writerow(
                {
                    "backend": r.config.backend,
                    "batch_spec": r.config.batch_spec,
                    "num_layers": r.config.num_layers,
                    "mean_time": r.mean_time,
                    "std_time": r.std_time,
                    # Optional metrics fall back to 0 when unavailable.
                    "throughput": r.throughput_tokens_per_sec or 0,
                    "memory_mb": r.memory_allocated_mb or 0,
                }
            )

    self.console.print(f"[green]Saved CSV results to {path}[/]")
|
||||||
|
|
||||||
|
def save_json(self, results: list[BenchmarkResult], path: str):
    """Save benchmark results to a JSON file.

    Creates parent directories as needed. An empty ``results`` list
    writes ``[]`` (unlike ``save_csv``, which writes nothing).

    Args:
        results: Benchmark results; each is serialized via ``to_dict()``.
        path: Destination file path.
    """
    path_obj = Path(path)
    path_obj.parent.mkdir(parents=True, exist_ok=True)

    data = [r.to_dict() for r in results]
    # Explicit utf-8 keeps the JSON file independent of the platform
    # locale encoding; default=str stringifies non-serializable values
    # (e.g. datetimes) instead of raising.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, default=str)

    self.console.print(f"[green]Saved JSON results to {path}[/]")
|
||||||
|
|
||||||
|
|
||||||
|
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
    """
    Get MLA dimensions for known models.

    Args:
        model_name: Model identifier

    Returns:
        Dict with MLA dimension configuration

    Raises:
        ValueError: If ``model_name`` is not a known model.
    """
    # Every known checkpoint shares the same latent/rope geometry; the
    # variants differ only in the number of query heads.
    q_heads_by_model = {
        "deepseek-v2": 128,
        "deepseek-v3": 128,
        "deepseek-v2-lite": 16,
    }

    if model_name not in q_heads_by_model:
        raise ValueError(
            f"Unknown model '{model_name}'. "
            f"Known models: {list(q_heads_by_model.keys())}"
        )

    # Key order matches the historical per-model config layout.
    return {
        "kv_lora_rank": 512,
        "qk_nope_head_dim": 128,
        "qk_rope_head_dim": 64,
        "v_head_dim": 128,
        "num_q_heads": q_heads_by_model[model_name],
        "num_kv_heads": 1,
        "head_dim": 576,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_attention_scale(head_dim: int) -> float:
    """Compute attention scale factor (1/sqrt(d))."""
    # ``x ** 0.5`` is the correctly-rounded square root, identical to
    # math.sqrt(x) for non-negative x.
    root = head_dim ** 0.5
    return 1.0 / root
|
||||||
|
|
||||||
|
|
||||||
|
def is_mla_backend(backend: str) -> bool:
    """
    Check if backend is an MLA backend using the AttentionBackendEnum.

    Args:
        backend: Backend name matching AttentionBackendEnum exactly
            (e.g., "FLASHMLA_SPARSE")

    Returns:
        True if the backend is an MLA backend, False otherwise. Unknown
        names, backends whose class cannot be resolved, and environments
        where vLLM itself cannot be imported all yield False.
    """
    try:
        # The import lives inside the try-block: the except clause below
        # already lists ImportError, but with the import outside the try
        # a missing/broken vllm installation raised instead of returning
        # False as documented.
        from vllm.v1.attention.backends.registry import AttentionBackendEnum

        backend_enum = AttentionBackendEnum[backend]
        backend_class = backend_enum.get_class()
        return backend_class.is_mla()
    except (KeyError, ValueError, ImportError, AttributeError):
        return False
|
||||||
70
benchmarks/attention_benchmarks/configs/mla_decode.yaml
Normal file
70
benchmarks/attention_benchmarks/configs/mla_decode.yaml
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
---
# MLA decode-only benchmark configuration

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128  # Base value, can be swept for TP simulation
  num_kv_heads: 1  # MLA uses single latent KV
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128

# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"

batch_specs:
  # Small batches, varying sequence lengths
  - "16q1s512"  # 16 requests, 512 KV cache
  - "16q1s1k"  # 16 requests, 1k KV cache
  - "16q1s2k"  # 16 requests, 2k KV cache
  - "16q1s4k"  # 16 requests, 4k KV cache

  # Medium batches
  - "32q1s1k"  # 32 requests, 1k KV cache
  - "32q1s2k"  # 32 requests, 2k KV cache
  - "32q1s4k"  # 32 requests, 4k KV cache
  - "32q1s8k"  # 32 requests, 8k KV cache

  # Large batches
  - "64q1s1k"  # 64 requests, 1k KV cache
  - "64q1s2k"  # 64 requests, 2k KV cache
  - "64q1s4k"  # 64 requests, 4k KV cache
  - "64q1s8k"  # 64 requests, 8k KV cache

  # Very large batches
  - "128q1s1k"  # 128 requests, 1k KV cache
  - "128q1s2k"  # 128 requests, 2k KV cache
  - "128q1s4k"  # 128 requests, 4k KV cache
  - "128q1s8k"  # 128 requests, 8k KV cache

  # Long context
  - "32q1s16k"  # 32 requests, 16k KV cache
  - "32q1s32k"  # 32 requests, 32k KV cache

backends:
  - CUTLASS_MLA
  - FLASHINFER_MLA
  - FLASH_ATTN_MLA  # Hopper only
  - FLASHMLA  # Hopper only

device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true

# Backend-specific tuning
CUTLASS_MLA:
  num_kv_splits: auto  # or specific value like 4, 8, 16

FLASH_ATTN_MLA:
  reorder_batch_threshold: 512

FLASHMLA:
  reorder_batch_threshold: 1
|
||||||
60
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
Normal file
60
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
---
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128
  num_kv_heads: 1
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128

batch_specs:
  # Small prefill + decode
  - "1q1k_8q1s1k"  # 1 prefill + 8 decode
  - "2q2k_16q1s1k"  # 2 prefill + 16 decode
  - "4q1k_32q1s2k"  # 4 prefill + 32 decode

  # Medium prefill + decode
  - "2q4k_32q1s2k"  # 2 medium prefill + 32 decode
  - "4q4k_64q1s2k"  # 4 medium prefill + 64 decode
  - "8q2k_64q1s4k"  # 8 prefill + 64 decode

  # Large prefill + decode (chunked prefill stress test)
  - "2q8k_32q1s1k"  # 2 large prefill + 32 decode
  - "1q16k_16q1s2k"  # 1 very large prefill + 16 decode
  - "2q16k_32q1s4k"  # 2 very large prefill + 32 decode

  # Context extension + decode
  - "2q1kkv2k_16q1s1k"  # 2 extend + 16 decode
  - "4q2kkv4k_32q1s2k"  # 4 extend + 32 decode
  - "2q1kkv8k_32q1s2k"  # 2 large extend + 32 decode

  # Explicitly chunked prefill
  - "q8k"  # 8k prefill with chunking hint
  - "q16k"  # 16k prefill with chunking hint
  - "2q8k_32q1s2k"  # 2 chunked prefill + 32 decode

  # High decode ratio (realistic serving)
  - "1q2k_63q1s1k"  # 1 prefill + 63 decode
  - "2q2k_62q1s2k"  # 2 prefill + 62 decode
  - "4q4k_60q1s4k"  # 4 prefill + 60 decode

backends:
  - CUTLASS_MLA
  - FLASHINFER_MLA
  - FLASH_ATTN_MLA  # Hopper only
  - FLASHMLA  # Hopper only

device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true

# Analyze chunked prefill workspace size impact
chunked_prefill:
  test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
|
||||||
62
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
Normal file
62
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
---
# MLA prefill-only benchmark configuration for sparse backends

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128
  num_kv_heads: 1
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128

# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"

batch_specs:
  # Pure prefill
  - "1q512"
  - "1q1k"
  - "1q2k"
  - "1q4k"
  - "1q8k"

  # Batched pure prefill
  - "2q512"
  - "2q1k"
  - "2q2k"
  - "2q4k"
  - "2q8k"
  - "4q512"
  - "4q1k"
  - "4q2k"
  - "4q4k"
  - "4q8k"
  - "8q512"
  - "8q1k"
  - "8q2k"
  - "8q4k"
  - "8q8k"

  # Extend
  - "1q512s4k"
  - "1q512s8k"
  - "1q1ks8k"
  - "1q2ks8k"
  - "1q2ks16k"
  - "1q4ks16k"

backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE

device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user