Compare commits

..

31 Commits

Author SHA1 Message Date
Nick Hill
1892993bc1 [BugFix][Spec Decoding] Fix negative accepted tokens metric crash (#33729)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-03 20:28:32 -05:00
Michael Goin
7d98f09b1c cherry pick
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2026-02-03 20:28:02 -05:00
Michael Goin
daa2784bb9 [Bugfix] Disable RoutingMethodType.[Renormalize,RenormalizeNaive] TRTLLM per-tensor FP8 MoE (#33620)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit e346e2d056)

Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2026-02-03 20:17:37 -05:00
Richard Zou
e4bf6ed90d [torch.compile] Don't do the fast moe cold start optimization if there is speculative decoding (#33624)
Signed-off-by: Richard Zou <zou3519@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
(cherry picked from commit 5eac9a1b34)
2026-02-03 01:16:42 -08:00
Richard Zou
611b18757e [torch.compile] Speed up MOE handling in forward_context (#33184)
Signed-off-by: Richard Zou <zou3519@gmail.com>
(cherry picked from commit d9aa39a3bb)
2026-02-03 00:24:28 -08:00
Kiersten Stokes
eec3546bba [Misc][Build] Lazy load cv2 in nemotron_parse.py (#33189)
Signed-off-by: kiersten-stokes <kierstenstokes@gmail.com>
(cherry picked from commit 9e138cb01d)
2026-02-03 00:03:56 -08:00
zaristei2
7c023baf58 Patch Protobuf for CVE 2026-0994 (#33619)
Signed-off-by: Zachary Aristei <zaristei@nvidia.com>
Co-authored-by: Zachary Aristei <zaristei@nvidia.com>
2026-02-03 00:03:14 -08:00
zaristei2
099a787ee2 Patch aiohttp for CVE-2025-69223 (#33621)
Signed-off-by: Zachary Aristei <zaristei@nvidia.com>
Co-authored-by: Zachary Aristei <zaristei@nvidia.com>
2026-02-03 00:02:39 -08:00
Zhewen Li
31a64c63a8 [Release] Fix format and cherry-pick (#33618)
Signed-off-by: zhewenli <zhewen@inferact.ai>
Co-authored-by: zhewenli <zhewen@inferact.ai>
2026-02-02 16:19:05 -08:00
Zhewen Li
57eae2f891 [Release] patch step3p5 attention class in v0.15.1 release (#33602)
Signed-off-by: zhewenli <zhewen@inferact.ai>
Co-authored-by: zhewenli <zhewen@inferact.ai>
2026-02-02 14:54:08 -08:00
Yifan Qiao
f0d005864a [Fix] prefix cache hit rate == 0 bug with gpt-oss style models (#33524)
Signed-off-by: Yifan Qiao <yifanqiao@berkeley.edu>
(cherry picked from commit a01ef3fa51)
2026-02-02 10:31:50 -08:00
Robert Shaw
94cbe0a328 [Nightly CI] Remove CT Model (#33530)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
(cherry picked from commit 318b120766)
2026-02-02 02:17:42 -08:00
csy0225
8b45c58fe9 [Models] Step-3.5-Flash (#33523)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: i-zhangmingming <i-zhangmingming@stepfun.com>
Co-authored-by: xiewuxun <xiewuxun@stepfun.com>
Co-authored-by: zetaohong <i-hongzetao@stepfun.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
(cherry picked from commit c3b40dc3e7)
2026-02-02 02:16:23 -08:00
Greg Pereira
c7039a80b8 pin LMCache to v0.3.9 or greater with vLLM v0.15.0 (#33440)
Signed-off-by: greg pereira <grpereir@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
(cherry picked from commit d6416fdde9)
2026-02-02 00:17:01 -08:00
René Honig
15ebd0cedf fix: Add SM120 (RTX Blackwell) support for FlashInfer CUTLASS NVFP4 MoE kernels (#33417)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit 079781177a)
2026-02-02 00:15:22 -08:00
Luka Govedič
2915268369 [fix][torch.compile] Fix cold-start compilation time increase by adding kv cache update to splitting ops (#33441)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Richard Zou <zou3519@gmail.com>
(cherry picked from commit 15f40b20aa)
2026-02-02 00:14:07 -08:00
Lucas Wilkinson
d984d664cc [BugFix] Fix whisper FA2 + full cudagraphs (#33360)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
(cherry picked from commit 0a3c71e7e5)
2026-02-02 00:13:57 -08:00
Gregory Shtrasberg
5f45b0b7e0 [Bugfix][ROCm] Fixing the skinny gemm dispatch logic from #32831 (#33366)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
(cherry picked from commit 31aedfe7d6)
2026-02-02 00:13:45 -08:00
Kevin H. Luu
a2dba556db [release] Minor fixes to release annotation and wheel upload (#33129)
Signed-off-by: khluu <khluu000@gmail.com>
(cherry picked from commit 2284461d02)
2026-02-02 00:13:34 -08:00
Michael Goin
6ff16b77f8 [Bugfix] Enable Triton MoE for FP8 per-tensor dynamic (#33300)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit bfb9bdaf3f)
2026-02-02 00:13:23 -08:00
wang.yuqi
1ed963d43a [Bugfix] Fix Qwen3-VL-Reranker load. (#33298)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
(cherry picked from commit abb34ac43a)
2026-02-02 00:13:12 -08:00
Michael Goin
39e8b49378 [Bugfix] Register fp8 cutlass_group_gemm as supported for only SM90+SM100 (#33285)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit 1bd47d6e5a)
2026-02-02 00:12:58 -08:00
TJian
f176443446 [Release] [CI] Optim release pipeline (#33156)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
(cherry picked from commit f9d03599ef)
2026-01-28 22:47:10 -08:00
Or Ozeri
fe18ce4d3f Revert "Enable Cross layers KV cache layout at NIXL Connector (#30207)" (#33241)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit 2e8de86777)
2026-01-28 11:44:59 -08:00
Jeffrey Wang
5f7f9ea884 Relax protobuf library version constraints (#33202)
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
(cherry picked from commit a97b5e206d)
2026-01-28 02:17:19 -08:00
Nick Hill
7779de34da [BugFix] Fix P/D with non-MoE DP (#33037)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 0cd259b2d8)
2026-01-28 02:17:08 -08:00
Nicolò Lucchesi
0d8ce320a2 [Bugfix] Fix DeepseekV32 AssertionError: num_kv_heads == 1 (#33090)
Signed-off-by: NickLucche <nlucches@redhat.com>
(cherry picked from commit 492a7983dd)
2026-01-28 02:16:56 -08:00
Nicolò Lucchesi
d51e1f8b62 [Bugfix] Disable CG for Whisper+FA2 (#33164)
Signed-off-by: NickLucche <nlucches@redhat.com>
(cherry picked from commit 1f3a2c2944)
2026-01-28 02:16:41 -08:00
Roger Wang
5042815ab6 [Models] Kimi-K2.5 (#33131)
Signed-off-by: wanglinian <wanglinian@stu.pku.edu.cn>
Signed-off-by: wangln19 <96399074+wangln19@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: wanglinian <wanglinian@stu.pku.edu.cn>
Co-authored-by: wangln19 <96399074+wangln19@users.noreply.github.com>
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
(cherry picked from commit b539f988e1)
2026-01-28 02:16:28 -08:00
Chauncey
afb390ab02 [CI] Fix AssertionError: MCP tool call not found in output_messages (#33093)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
(cherry picked from commit a2393ed496)
2026-01-28 02:16:14 -08:00
Robert Shaw
cf1167e50b [Bugfix] Fix Dtypes for Pynccl Wrapper (#33030)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
(cherry picked from commit 43a013c3a2)
2026-01-26 12:37:16 -08:00
1710 changed files with 44746 additions and 143495 deletions

View File

@@ -1,8 +1,7 @@
name: vllm_ci
job_dirs:
- ".buildkite/image_build"
- ".buildkite/test_areas"
- ".buildkite/hardware_tests"
- ".buildkite/image_build"
run_all_patterns:
- "docker/Dockerfile"
- "CMakeLists.txt"

View File

@@ -1,30 +0,0 @@
group: Hardware - AMD Build
steps:
- label: "AMD: :docker: build image"
key: image-build-amd
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1

View File

@@ -1,10 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Ascend NPU Test"
soft_fail: true
timeout_in_minutes: 20
no_plugin: true
device: ascend_npu
commands:
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh

View File

@@ -1,100 +0,0 @@
group: CPU
depends_on: []
steps:
- label: CPU-Kernel Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- cmake/cpu_extension.cmake
- CMakeLists.txt
- vllm/_custom_ops.py
- tests/kernels/attention/test_cpu_attn.py
- tests/kernels/moe/test_cpu_fused_moe.py
- tests/kernels/test_onednn.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Language Generation and Pooling Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/
- tests/models/language/generation/
- tests/models/language/pooling/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model"
- label: CPU-Quantization Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/model_executor/layers/quantization/cpu_wna16.py
- vllm/model_executor/layers/quantization/gptq_marlin.py
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
- tests/quantization/test_compressed_tensors.py
- tests/quantization/test_cpu_wna16.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
- label: CPU-Distributed Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/shm.cpp
- vllm/v1/worker/cpu_worker.py
- vllm/v1/worker/gpu_worker.py
- vllm/v1/worker/cpu_model_runner.py
- vllm/v1/worker/gpu_model_runner.py
- vllm/platforms/cpu.py
- vllm/distributed/parallel_state.py
- vllm/distributed/device_communicators/cpu_communicator.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
- label: CPU-Multi-Modal Model Tests %N
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
# - vllm/
- vllm/model_executor/layers/rotary_embedding
- tests/models/multimodal/generation/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
parallelism: 2
- label: "Arm CPU Test"
depends_on: []
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -1,10 +0,0 @@
group: Hardware
steps:
- label: "GH200 Test"
soft_fail: true
device: gh200
no_plugin: true
optional: true
commands:
- nvidia-smi
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh

View File

@@ -1,17 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Intel HPU Test"
soft_fail: true
device: intel_hpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- label: "Intel GPU Test"
depends_on: []
soft_fail: true
device: intel_gpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh

View File

@@ -1,255 +1,56 @@
#!/bin/bash
set -euo pipefail
set -e
# replace invalid characters in Docker image tags and truncate to 128 chars
clean_docker_tag() {
local input="$1"
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
exit 1
}
print_instance_info() {
echo ""
echo "=== Debug: Instance Information ==="
# Get IMDSv2 token
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
echo "AMI ID: ${AMI_ID}"
echo "Instance Type: ${INSTANCE_TYPE}"
echo "Instance ID: ${INSTANCE_ID}"
echo "AZ: ${AZ}"
else
echo "Not running on EC2 or IMDS not available"
fi
# Check for warm cache AMI (marker file baked into custom AMI)
if [[ -f /etc/vllm-ami-info ]]; then
echo "Cache: warm (custom vLLM AMI)"
cat /etc/vllm-ami-info
else
echo "Cache: cold (standard AMI)"
fi
echo "==================================="
echo ""
}
setup_buildx_builder() {
echo "--- :buildkite: Setting up buildx builder"
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
# Custom AMI with standalone buildkitd - use remote driver for warm cache
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
echo "Using remote driver to connect to buildkitd (warm cache available)"
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
echo "Using existing baked-vllm-builder"
docker buildx use baked-vllm-builder
else
echo "Creating baked-vllm-builder with remote driver"
docker buildx create \
--name baked-vllm-builder \
--driver remote \
--use \
"unix://${BUILDKIT_SOCKET}"
fi
docker buildx inspect --bootstrap
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
# Existing builder available
echo "Using existing builder: ${BUILDER_NAME}"
docker buildx use "${BUILDER_NAME}"
docker buildx inspect --bootstrap
else
# No local buildkitd, no existing builder - create new docker-container builder
echo "No local buildkitd found, using docker-container driver"
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
docker buildx inspect --bootstrap
fi
# builder info
echo "Active builder:"
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
check_and_skip_if_image_exists() {
if [[ -n "${IMAGE_TAG:-}" ]]; then
echo "--- :mag: Checking if image exists"
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
echo "Image already exists: ${IMAGE_TAG}"
echo "Skipping build"
exit 0
fi
echo "Image not found, proceeding with build"
fi
}
ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}
prepare_cache_tags() {
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
cache="${MAIN_CACHE_ECR}:latest"
else
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
cache="${TEST_CACHE_ECR}:${clean_branch}"
fi
CACHE_TO="$cache"
CACHE_FROM="$cache"
CACHE_FROM_BASE_BRANCH="$cache"
else
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
else
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
fi
fi
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
resolve_parent_commit() {
if [[ -z "${PARENT_COMMIT:-}" ]]; then
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
if [[ -n "${PARENT_COMMIT}" ]]; then
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
export PARENT_COMMIT
else
echo "Could not determine parent commit (may be first commit in repo)"
fi
else
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
fi
}
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
}
#################################
# Main Script #
#################################
print_instance_info
if [[ $# -lt 5 ]]; then
print_usage_and_exit
if [[ $# -lt 8 ]]; then
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
fi
# input args
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
IMAGE_TAG=$5
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8
# build config
TARGET="test-ci"
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
prepare_cache_tags
ecr_login
# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# Environment info (for docs and human readers)
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
#
# Build configuration (exported as environment variables for bake):
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
check_and_skip_if_image_exists
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
echo "Make sure you're running from the vLLM repository root"
exit 1
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
echo "--- :arrow_down: Downloading ci.hcl"
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
echo "Downloaded to ${CI_HCL_PATH}"
if [[ ! -f "${CI_HCL_PATH}" ]]; then
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
exit 1
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
merge_base_commit_build_args=""
fi
setup_buildx_builder
resolve_parent_commit
export PARENT_COMMIT
print_bake_config
echo "--- :docker: Building ${TARGET}"
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
echo "--- :white_check_mark: Build complete"
# build
docker buildx build --file docker/Dockerfile \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
--cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .
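For reference, a minimal sketch of how the rewritten build script is invoked, matching the eight-argument usage line above; the registry, repo, and cache values are hypothetical placeholders, and only the argument order is taken from the script:

# Hypothetical values; positional order follows the usage line:
# <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>
REGISTRY=public.ecr.aws/example-registry        # hypothetical registry
REPO=vllm-ci-test-repo                          # hypothetical repo name
CACHE_REF=936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache:pr-12345   # hypothetical PR cache tag
bash .buildkite/image_build/image_build.sh \
  "$REGISTRY" "$REPO" "$BUILDKITE_COMMIT" "$BUILDKITE_BRANCH" \
  0 "" "$CACHE_REF" "$CACHE_REF"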

View File

@@ -3,9 +3,8 @@ steps:
- label: ":docker: Build image"
key: image-build
depends_on: []
timeout_in_minutes: 600
commands:
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
retry:
automatic:
- exit_status: -1 # Agent was lost
@@ -41,7 +40,7 @@ steps:
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU arm64 image"
key: cpu-arm64-image-build
depends_on: []

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,13 +24,13 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,10 +24,10 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -25,10 +25,10 @@ fi
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu

View File

@@ -1,15 +0,0 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.695
- name: "exact_match,flexible-extract"
value: 0.447
limit: 1319
num_fewshot: 5
max_model_len: 262144
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -1,19 +0,0 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.7142
- name: "exact_match,flexible-extract"
value: 0.4579
env_vars:
VLLM_USE_FLASHINFER_MOE_FP8: "1"
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
limit: 1319
num_fewshot: 5
max_model_len: 262144
kv_cache_dtype: fp8
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -1,2 +1 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml

View File

@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit "$LIMIT"
--limit $LIMIT

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -20,11 +20,14 @@ usage() {
echo
}
while getopts "m:l:f:t:" OPT; do
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
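The -b option for BATCH_SIZE is the addition in this hunk; a minimal invocation sketch with hypothetical values, since the script's file name and the meaning of the remaining flags are not shown in this view:

# Hypothetical script name and argument values; only the flag letters come from the getopts string above.
bash run-lm-eval-gsm-vllm-baseline.sh \
  -m some-org/Some-FP8-Model \
  -b auto \
  -l 1319 \
  -f 5 \
  -t 1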

View File

@@ -9,10 +9,8 @@ import json
import os
from dataclasses import dataclass
from importlib import util
from pathlib import Path
import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@@ -277,131 +275,6 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
"""
name = "sheet" if name is None else str(name)
name = re.sub(r"[:\\/?*\[\]]", "_", name)
name = name.strip().strip("'")
name = re.sub(r"\s+", " ", name)
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
model = d.get("Model", "model")
model_short = str(model).split("/")[-1]
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
startrow = 0
for title, df in blocks:
pd.DataFrame([[title]]).to_excel(
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
)
startrow += 1
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
startrow += len(df) + 3
def _safe_filename(s: str) -> str:
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
return s[:180] if len(s) > 180 else s
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
@@ -555,6 +428,7 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns:
if c == "Configuration":
continue
@@ -562,10 +436,12 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {}
for c in summary_df.columns:
if c == "Configuration":
continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters)
@@ -584,95 +460,6 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# -----------------------------
# Plot helper
# -----------------------------
@@ -750,21 +537,6 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
# ---- NEW: export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser
@@ -885,6 +657,7 @@ def maybe_write_plot(
markers=True,
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f")
@@ -957,151 +730,87 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items()
}
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
if csv_dir:
csv_dir.mkdir(parents=True, exist_ok=True)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
excel_path = args.excel_out or "perf_comparison.xlsx"
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
main_fh.write(group_header)
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
main_fh.write(group_header)
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
dedup_i = 1
while sheet in xw.sheets:
dedup_i += 1
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
excel_blocks.append(
(metric_label, display_group.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
def main():

View File

@@ -393,7 +393,7 @@ if __name__ == "__main__":
with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/performance-benchmarks/"
"performance-benchmarks-descriptions.md"
+ "performance-benchmarks-descriptions.md"
)
results = results.format(
latency_tests_markdown_table=latency_md_table,

View File

@@ -1,4 +1,6 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
@@ -7,19 +9,14 @@
set -x
set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then
@@ -28,9 +25,9 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g arch_suffix=''
if command -v nvidia-smi; then
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
@@ -47,7 +44,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo "$numa_count"
echo $numa_count
else
echo "Need at least 1 NUMA to run benchmarking."
exit 1
@@ -115,12 +112,13 @@ json2envs() {
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -sf http://localhost:8000/v1/models >/dev/null; do
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done
'
done' && return 0 || return 1
}
kill_processes_launched_by_current_bash() {
@@ -183,20 +181,19 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
# $2: a json file specifying test cases
run_latency_tests() {
# run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases
local test_type=$1
local test_file=$2
local latency_test_file
latency_test_file=$1
# Iterate over tests
jq -c '.[]' "$test_file" | while read -r params; do
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1
fi
@@ -207,15 +204,15 @@ run_benchmark_tests() {
fi
# get arguments
bench_params=$(echo "$params" | jq -r '.parameters')
bench_args=$(json2args "$bench_params")
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
bench_envs=$(json2envs "$bench_environment_variables")
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
latency_envs=$(json2envs "$latency_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -228,42 +225,118 @@ run_benchmark_tests() {
fi
fi
bench_command=" $bench_envs vllm bench $test_type \
latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$bench_args"
$latency_args"
echo "Running test case $test_name"
echo "${test_type^} command: $bench_command"
echo "Latency command: $latency_command"
# recording benchmarking command and GPU command
# recording benchmarking command and GPU command
jq_output=$(jq -n \
--arg command "$bench_command" \
--arg latency "$latency_command" \
--arg gpu "$gpu_type" \
--arg test_type "$test_type" \
'{
($test_type + "_command"): $command,
latency_command: $latency,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$bench_command"
eval "$latency_command"
kill_gpu_processes
done
}
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
run_throughput_tests() {
# run throughput tests using `vllm bench throughput`
# $1: a json file specifying throughput test cases
merge_serving_tests_stream() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
# shellcheck disable=SC2016
local merged='
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
throughput_envs=$(json2envs "$throughput_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi
throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# recording benchmarking command and GPU command
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
if type == "array" then
# Plain format: test cases array
.[]
@@ -285,50 +358,7 @@ merge_serving_tests_stream() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
'
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -397,7 +427,7 @@ run_serving_tests() {
echo "Server command: $server_command"
# support remote vllm server
client_remote_args=""
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -408,9 +438,6 @@ run_serving_tests() {
echo ""
echo "vLLM failed to start within the timeout period."
fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
@@ -420,39 +447,34 @@ run_serving_tests() {
fi
fi
# save the compilation mode and optimization level on the serving results
# whenever they are set
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
--metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
@@ -470,15 +492,12 @@ run_serving_tests() {
done
# clean up
if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill -9 "$server_pid"
kill_gpu_processes
fi
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
local ARCH
ARCH=''
if [[ "$ON_CPU" == "1" ]]; then
@@ -488,13 +507,7 @@ main() {
check_gpus
ARCH="$arch_suffix"
fi
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -515,18 +528,12 @@ main() {
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
# postprocess benchmarking results
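A minimal invocation sketch for the benchmark runner, assuming it is run from the vllm/ directory as the header comment states; the script path and token are hypothetical, while TEST_SELECTOR, SERVING_JSON, and ON_CPU come from the code above:

# Hypothetical path and token; TEST_SELECTOR filters test cases by name,
# *_JSON variables override the default test definition files, ON_CPU selects the CPU code path.
export HF_TOKEN=hf_xxx        # hypothetical; required by check_hf_token
ON_CPU=0 TEST_SELECTOR='serving_llama8B_tp1' \
SERVING_JSON=serving-tests.json \
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh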

View File

@@ -1,41 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,283 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -148,6 +148,136 @@
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -176,6 +176,23 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image for x86_64 ROCm"
key: block-rocm-release-image-build
depends_on: ~
- label: "Build release image - x86_64 - ROCm"
depends_on: block-rocm-release-image-build
id: build-release-image-rocm
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# Build base image first
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
# Build vLLM ROCm image using the base
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
- group: "Publish release images"
key: "publish-release-images"
steps:
@@ -459,7 +476,7 @@ steps:
S3_BUCKET: "vllm-wheels"
# ROCm Job 2: Build vLLM ROCm Wheel
- label: ":python: Build vLLM ROCm Wheel - x86_64"
- label: ":python: Build vLLM ROCm Wheel"
id: build-rocm-vllm-wheel
depends_on:
- step: build-rocm-base-wheels
@@ -649,7 +666,7 @@ steps:
VARIANT: "rocm700"
# ROCm Job 5: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
- label: ":rocm: :docker: Build ROCm Release Docker Image"
id: build-rocm-release-image
depends_on:
- step: build-rocm-base-wheels

View File

@@ -27,7 +27,7 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-
To download and upload the images:
\`\`\`
# Download images:
Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
@@ -35,12 +35,8 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
# Tag and push images:
## CUDA
Tag and push images:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -66,36 +62,19 @@ docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-a
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
## ROCm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
## CPU
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
@@ -107,11 +86,5 @@ docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
\`\`\`
EOF
EOF

View File

@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
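A hedged worked example of the path computation, assuming ROCM_VERSION is a dotted string such as "7.1.0" and using a placeholder commit:
# Illustration only
#   ROCM_VERSION=7.1.0 -> tr -d '.' -> 710 -> ROCM_VERSION_PATH=rocm710
#   ROCM_PATH=rocm/<BUILDKITE_COMMIT>/rocm710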
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases

View File

@@ -83,7 +83,7 @@ case "${1:-}" in
exit 1
fi
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
echo ""
echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo ""
echo "Total: $WHEEL_COUNT wheels"
echo "========================================"

View File

@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file
PR_DATA=$(mktemp)
trap 'rm -f "$PR_DATA"' EXIT
trap "rm -f $PR_DATA" EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \

View File

@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
"""
Generate project list HTML content linking to each project & variant subdirectory.
Generate project list HTML content linking to each project & variant sub-directory.
"""
href_tags = []
for name in sorted(subdir_names):
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
as the default variant index, but the links are adjusted accordingly.
Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant subdirectory
cpu/ # cpu variant sub-directory
index.html
vllm/
index.html
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
vllm/
index.html
metadata.json
cu130/ # cu130 variant subdirectory
cu130/ # cu130 variant sub-directory
index.html
vllm/
index.html

View File

@@ -1,37 +1,25 @@
#!/bin/bash
# This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
# This script runs tests inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
###############################################################################
# Helper Functions
###############################################################################
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
wait_for_clean_gpus() {
local timeout=${1:-300}
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
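A hedged usage note for the helper above: it accepts an optional timeout in seconds (default 300) and fails the job if the GPUs never report clean:
# Illustration only
wait_for_clean_gpus        # poll /opt/amdgpu/etc/gpu_state for up to 300s
wait_for_clean_gpus 600    # allow up to 10 minutes, e.g. right after a reset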
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -40,12 +28,15 @@ cleanup_docker() {
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes and force a system prune of images older than 72 hours.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
@@ -53,259 +44,201 @@ cleanup_docker() {
fi
}
cleanup_network() {
local max_nodes=${NUM_NODES:-2}
for node in $(seq 0 $((max_nodes - 1))); do
if docker ps -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}" || true
fi
done
if docker network ls | grep -q docker-net; then
docker network rm docker-net || true
fi
}
is_multi_node() {
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
return 0
fi
# Fallback: detect the bracket syntax structurally
# Pattern: [...] && [...] (per-node command arrays)
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
return 0
fi
return 1
}
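A hedged usage sketch of the detector above; the command strings are invented for illustration and are not taken from the pipeline:
# Illustration only -- hypothetical command strings
export NUM_NODES=2
is_multi_node 'pytest -v -s distributed/test_basic.py' && echo "multi-node (env signal)"
unset NUM_NODES
is_multi_node '[pytest -v -s a.py, pytest -v -s b.py] && [pytest -v -s a.py, pytest -v -s b.py]' \
  && echo "multi-node (bracket syntax)"
is_multi_node 'pytest -v -s basic_correctness' || echo "single-node"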
###############################################################################
# Pytest marker re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around pytest -m marker expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
# This function detects unquoted multi-word marker expressions and re-quotes
# them so they survive the final bash -c expansion.
###############################################################################
re_quote_pytest_markers() {
local cmds="$1"
# Pattern: -m not <identifier> -> -m 'not <identifier>'
# Handles the common cases: 'not cpu_test', 'not slow_test', etc.
cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
echo "$cmds"
}
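A hedged before/after sketch of the re-quoting; the pytest path is illustrative:
# Illustration only
re_quote_pytest_markers "pytest -v -s -m not cpu_test v1/core"
# prints: pytest -v -s -m 'not cpu_test' v1/core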
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
apply_rocm_test_overrides() {
local cmds="$1"
# --- Model registry filter ---
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
# --- LoRA: disable custom paged attention ---
if [[ $cmds == *"pytest -v -s lora"* ]]; then
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
# --- Kernel ignores ---
if [[ $cmds == *" kernels/core"* ]]; then
cmds="${cmds} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $cmds == *" kernels/attention"* ]]; then
cmds="${cmds} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $cmds == *" kernels/quantization"* ]]; then
cmds="${cmds} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $cmds == *" kernels/mamba"* ]]; then
cmds="${cmds} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $cmds == *" kernels/moe"* ]]; then
cmds="${cmds} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/llm "* ]]; then
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# Clean up escaped newlines from --ignore appends
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
echo "$cmds"
}
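Two hedged examples of what the override function produces; the inputs are made up and the second output is abbreviated:
# Illustration only
apply_rocm_test_overrides "pytest -v -s lora"
# prints: VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora
apply_rocm_test_overrides "pytest -v -s kernels/core"
# prints the same command with --ignore=kernels/core/test_fused_quant_layernorm.py
# and --ignore=kernels/core/test_permute_cols.py appended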
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image ---
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands="$*"
echo "Raw commands: $commands"
commands=$@
echo "Commands:$commands"
# Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
commands=$(apply_rocm_test_overrides "$commands")
echo "Final commands: $commands"
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
# Ignore certain kernel tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
# Ignore certain entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
# Ignore certain entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Verify GPU access
# Test that we're launching on a machine that has proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
echo "--- Multi-node job detected"
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
# BASH_REMATCH[1] = prefix (everything before first bracket)
# BASH_REMATCH[2] = comma-separated node0 commands
# BASH_REMATCH[3] = comma-separated node1 commands
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
saved_IFS=$IFS
IFS=','
read -ra node0 <<< "${BASH_REMATCH[2]}"
read -ra node1 <<< "${BASH_REMATCH[3]}"
IFS=$saved_IFS
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
# If the command contains the shard flag, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
# Wait for all processes to finish and collect exit codes
for pid in "${PIDS[@]}"; do
wait "${pid}"
STATUS+=($?)
done
at_least_one_shard_with_tests=0
for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
elif [[ ${st} -eq 5 ]]; then
echo "Shard exited with status 5 (no tests collected) - treating as success"
else # This means st is 0
at_least_one_shard_with_tests=1
fi
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done
/bin/bash -c "${composite_command}"
cleanup_network
else
echo "Multi-node job detected but failed to parse bracket command syntax."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
echo "Got: $commands"
cleanup_network
exit 111
done
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
echo "All shards reported no tests collected. Failing the build."
exit 1
fi
else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi

View File

@@ -1,26 +0,0 @@
#!/bin/bash
set -euox pipefail
echo "--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &
echo "--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: The test case below, tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base], fails on ppc64le. Disabling it for the time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All CPU tests are expected to finish in less than 40 minutes.

View File

@@ -2,19 +2,119 @@
# This script builds the CPU docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -euox pipefail
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
IMAGE_NAME="cpu-test-$NUMA_NODE"
TIMEOUT_VAL=$1
TEST_COMMAND=$2
# building the docker image
echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallelism.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disabled until V1 is supported
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Run AWQ/GPTQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_cpu_wna16.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/lora/test_qwenvl.py"
# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All CPU tests are expected to finish in less than 40 minutes.
export -f cpu_tests
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@@ -5,9 +5,7 @@
set -exuo pipefail
# Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t "${image_name}" -f - .
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
@@ -17,8 +15,7 @@ WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
# install development dependencies (for testing)
@@ -39,20 +36,15 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f "${container_name}" || true; }
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers
echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name="${container_name}" --network=host \
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
-e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-e PT_HPU_LAZY_MODE=1 \
"${image_name}" \
/bin/bash -c '
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
'
hpu-plugin-v1-test-env \
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then

View File

@@ -41,7 +41,6 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
@@ -49,8 +48,9 @@ get_config() {
# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if ! get_config; then
if [ $? -ne 0 ]; then
exit 1
fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
--progress=plain --load -t "${image_name}" -f - .
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns one argument per line: --device, /dev/davinciX, ...
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
printf '%s\n' "--device"
printf '%s\n' "/dev/davinci${dev_idx}"
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output the devices string so the caller can capture it
printf '%s' "$devices"
}
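A hedged illustration using the agent-name scheme described in the comment above; the host name is the documented example, not a real assignment:
# Illustration only
parse_and_gen_devices "atlas-a2-001-1-2cards"   # agent 1, 2 cards -> /dev/davinci0 /dev/davinci1
parse_and_gen_devices "atlas-a2-001-2-2cards"   # agent 2, 2 cards -> /dev/davinci2 /dev/davinci3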
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p "${model_cache_dir}"
mkdir -p ${model_cache_dir}
docker run \
"${device_args[@]}" \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v "${model_cache_dir}":/root/.cache/modelscope \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"

View File

@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {
@@ -38,18 +38,15 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'

View File

@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest push vllm/vllm-openai:"$TAG_NAME"
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@@ -43,6 +43,7 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
@@ -51,14 +52,13 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \
--trust-remote-code \
--max-model-len 2048 \
--all2all-backend "$BACK" \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,57 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-Lite"
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
vllm serve "$MODEL" \
--max-model-len 2048 \
--offload-group-size 8 \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY
cleanup
SERVER_PID=
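Both GSM8K scripts gate accuracy through an inline Python heredoc that reads the saved results file and asserts against the threshold passed to the script; a self-contained sketch of that same pattern, using a made-up output path and values rather than the real gsm8k_eval.py results, is:

#!/bin/bash
# Sketch only: OUT and THRESHOLD here are illustrative stand-ins for the
# values the real scripts derive from their arguments and eval output.
OUT=/tmp/example_results.json
THRESHOLD=0.25
echo '{"accuracy": 0.31}' > "$OUT"
python3 - <<PY
import json
acc = json.load(open('${OUT}'))['accuracy']
print(f"example: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"accuracy {acc} below threshold ${THRESHOLD}"
PY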

View File

@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
vllm serve "$MODEL" \
--enforce-eager \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -9,11 +9,10 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
# shellcheck source=/dev/null
source "$ENV_FILE"
source $ENV_FILE
remove_docker_container() {
docker rm -f "$CONTAINER_NAME" || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
--env-file "$ENV_FILE" \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
-e MODEL="$MODEL" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name "$CONTAINER_NAME" \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \

View File

@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
vllm serve "$MODEL" \
vllm serve $MODEL \
--seed 42 \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir "$DOWNLOAD_DIR" \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for _ in {1..120}; do
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len "$INPUT_LEN" \
--sonnet-output-len "$OUTPUT_LEN" \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."

View File

@@ -76,15 +76,16 @@ mkdir -p "$INDICES_OUTPUT_DIR"
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
alias_args=()
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
alias_arg=""
fi
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/"
rm -rf "${INDICES_OUTPUT_DIR:?}/*"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi
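The hunk above switches between passing the optional --alias-to-default flag as a quoted array expansion and as an unquoted string variable; a minimal sketch of how those two expansions differ (count_args is a hypothetical helper used only for illustration, not part of the build scripts) is:

#!/bin/bash
# Sketch only: demonstrates empty-array vs empty-string expansion for an optional flag.
count_args() { echo "$# extra arg(s): $*"; }

DEFAULT_VARIANT_ALIAS=""            # pretend the alias is unset
alias_args=()
alias_arg=""
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
  alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
  alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
fi

count_args "${alias_args[@]}"       # 0 extra args: an empty array expands to nothing
count_args $alias_arg               # 0 extra args, but would word-split a multi-word alias
count_args "$alias_arg"             # 1 extra arg: a quoted empty string is still passed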

View File

@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION"
@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1
fi
python3 -m twine check "$PYPI_WHEEL_FILES"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"

View File

@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -4,7 +4,7 @@ depends_on:
steps:
- label: V1 attention (H100)
timeout_in_minutes: 30
device: h100
gpu: h100
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
@@ -15,7 +15,7 @@ steps:
- label: V1 attention (B200)
timeout_in_minutes: 30
device: b200
gpu: b200
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention

View File

@@ -14,8 +14,3 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

View File

@@ -17,15 +17,3 @@ steps:
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- label: Attention Benchmarks Smoke Test (B200)
device: b200
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1

View File

@@ -2,200 +2,56 @@ group: Compile
depends_on:
- image-build
steps:
- label: Sequence Parallel Correctness Tests (2 GPUs)
timeout_in_minutes: 50
- label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
num_devices: 2
source_file_dependencies:
- vllm/model_executor/layers/
- vllm/compilation/
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- tests/compile/correctness_e2e/test_sequence_parallel.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: Sequence Parallel Correctness Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: AsyncTP Correctness Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- vllm/compilation/
- vllm/model_executor/layers
- tests/compile/passes/distributed/
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -s -v tests/compile/passes/distributed
- label: Fusion and Compile Unit Tests (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
- nvidia-smi
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_devices=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
# TODO(luka) move to H100 once pass tests run on H100
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Fusion E2E Quick (H100)
timeout_in_minutes: 15
- label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
- label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
- label: Fusion E2E Config Sweep (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: b200
num_devices: 1
gpu: b200
optional: true
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
num_gpus: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
- label: Fusion E2E TP2 (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"

View File

@@ -9,7 +9,6 @@ steps:
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- pytest -v -s cuda/test_platform_no_cuda_init.py
- label: Cudagraph
timeout_in_minutes: 20

View File

@@ -5,7 +5,7 @@ steps:
- label: Distributed Comm Ops
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
@@ -16,9 +16,9 @@ steps:
- pytest -v -s distributed/test_shm_storage.py
- label: Distributed (2 GPUs)
timeout_in_minutes: 60
timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
@@ -47,13 +47,14 @@ steps:
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_utils
@@ -62,7 +63,6 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
@@ -97,18 +97,14 @@ steps:
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
device: h100
num_devices: 8
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
@@ -124,9 +120,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100)
device: a100
gpu: a100
optional: true
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
@@ -137,23 +133,26 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H100)
timeout_in_minutes: 15
device: h100
- label: Distributed Tests (2 GPUs)(H200)
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_devices: 2
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
device: b200
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_devices: 2
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -162,10 +161,8 @@ steps:
- label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
num_nodes: 2
no_plugin: true
optional: true # TODO: revert once infra issue solved
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -174,12 +171,12 @@ steps:
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
@@ -187,32 +184,10 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -221,4 +196,4 @@ steps:
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- pytest -v -s distributed/test_pipeline_parallel.py

View File

@@ -4,36 +4,39 @@ depends_on:
steps:
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
device: h100
gpu: h100
optional: true
num_devices: 4
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
device: h100
gpu: h100
optional: true
num_devices: 4
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
device: b200
gpu: b200
optional: true
num_devices: 2
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
timeout_in_minutes: 60
device: h100
- label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30
optional: true
num_devices: 1
soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
- bash .buildkite/scripts/run-prime-rl-test.sh

View File

@@ -23,16 +23,4 @@ steps:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- pytest -v -s v1/engine

View File

@@ -24,11 +24,6 @@ steps:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130
@@ -47,13 +42,15 @@ steps:
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/rpc
- tests/entrypoints/instrumentator
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- label: Entrypoints Integration (Pooling)
@@ -65,11 +62,6 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50

View File

@@ -14,7 +14,7 @@ steps:
- label: EPLB Execution
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py

View File

@@ -15,9 +15,8 @@ steps:
timeout_in_minutes: 35
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -58,8 +57,8 @@ steps:
- label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45
device: h100
num_devices: 1
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
@@ -78,7 +77,7 @@ steps:
- label: Kernels (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: b200
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
@@ -86,7 +85,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -115,45 +114,4 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Kernels Helion Test
timeout_in_minutes: 30
device: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
device: h100
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
device: h100
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

View File

@@ -12,9 +12,9 @@ steps:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 GPUs)(A100)
device: a100
gpu: a100
optional: true
num_devices: 4
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
@@ -24,9 +24,9 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100)
device: h100
gpu: h100
optional: true
num_devices: 4
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
@@ -37,65 +37,10 @@ steps:
- label: LM Eval Small Models (B200)
timeout_in_minutes: 120
device: b200
gpu: b200
optional: true
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200
optional: true
num_devices: 8
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
device: h100
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
- label: GPQA Eval (GPT-OSS) (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt

View File

@@ -14,7 +14,7 @@ steps:
- label: LoRA TP (Distributed)
timeout_in_minutes: 30
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/lora
- tests/lora

View File

@@ -16,8 +16,7 @@ steps:
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
# TODO: create another `optional` test group for slow tests
- pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
@@ -26,19 +25,13 @@ steps:
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: V1 Others (CPU)
depends_on:
- image-build-cpu
depends_on: ~
source_file_dependencies:
- vllm/
- tests/v1
device: cpu
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
@@ -78,7 +71,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
@@ -89,7 +82,7 @@ steps:
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/v1/tracing
@@ -114,24 +107,19 @@ steps:
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/detokenizer
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s detokenizer
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on:
- image-build-cpu
depends_on: ~
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/test_pooling_params.py
- tests/test_ray_env.py
- tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py
@@ -139,13 +127,11 @@ steps:
- tests/tool_parsers
- tests/transformers_utils
- tests/config
device: cpu
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s test_ray_env.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_
@@ -153,9 +139,23 @@ steps:
- pytest -v -s transformers_utils
- pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100)
timeout_in_minutes: 25
device: h100
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
@@ -164,18 +164,4 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: Acceptance Length Test (Large Models) # optional
timeout_in_minutes: 25
gpu: h100
optional: true
num_gpus: 1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/model_executor/models/mlp_speculator.py
- tests/v1/spec_decode/test_acceptance_length.py
commands:
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

View File

@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -31,27 +33,18 @@ steps:
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/test_terratorch.py
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
depends_on:
- image-build-cpu
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/models/test_utils.py
- tests/models/test_vision.py
device: cpu
no_gpu: true
commands:
- pytest -v -s models/test_utils.py models/test_vision.py

View File

@@ -5,7 +5,7 @@ steps:
- label: Distributed Model Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/

View File

@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -37,7 +40,7 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -52,21 +56,13 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -76,20 +72,17 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/

View File

@@ -14,13 +14,11 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Processor Test (CPU)
depends_on:
- image-build-cpu
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
device: cpu
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

View File

@@ -5,7 +5,7 @@ steps:
- label: Plugin Tests (2 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/plugins/
- tests/plugins/

View File

@@ -3,7 +3,7 @@ depends_on:
- image-build
steps:
- label: PyTorch Compilation Unit Tests
timeout_in_minutes: 10
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
@@ -17,16 +17,8 @@ steps:
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Compilation Passes Unit Tests
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/compile/passes
commands:
- pytest -s -v compile/passes --ignore compile/passes/distributed
- label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 35
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
@@ -38,13 +30,16 @@ steps:
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph
timeout_in_minutes: 30
timeout_in_minutes: 40
source_file_dependencies:
- vllm/
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some

View File

@@ -16,14 +16,14 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: Quantized MoE Test (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: b200
gpu: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py

View File

@@ -12,10 +12,3 @@ steps:
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s samplers

View File

@@ -5,7 +5,7 @@ steps:
- label: Weight Loading Multiple GPU # 33min
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
@@ -15,8 +15,8 @@ steps:
- label: Weight Loading Multiple GPU - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_devices: 2
device: a100
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/

.github/CODEOWNERS
View File

@@ -2,60 +2,40 @@
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
/vllm/lora @jeejeelee
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @heheda12345
# Entrypoints
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
/vllm/entrypoints/mcp @heheda12345
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
/vllm/entrypoints/openai/realtime @njhill
/vllm/entrypoints/openai/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Input/Output Processing
/vllm/sampling_params.py @njhill @NickLucche
/vllm/pooling_params.py @noooop @DarkLight1337
/vllm/tokenizers @DarkLight1337 @njhill
/vllm/renderers @DarkLight1337 @njhill
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson @MatthewBonanni
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
/vllm/v1/offloading @ApostaC
# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon
@@ -74,13 +54,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC @orozery
/tests/v1/kv_offload @ApostaC @orozery
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
/tests/v1/determinism @yewentao256
# Transformers modeling backend
@@ -133,8 +113,8 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/tokenizers/mistral.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -170,7 +150,9 @@ mkdocs.yaml @hmellor
/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler @noooop
# Security guide and policies

View File

@@ -19,7 +19,6 @@ jobs:
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
cache: 'pip'
- name: Install Python dependencies
run: |

.gitignore
View File

@@ -238,6 +238,3 @@ ep_kernels_workspace/
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi
# Ignore generated cpu headers
csrc/cpu/cpu_attn_dispatch_generated.h

View File

@@ -121,9 +121,24 @@ repos:
name: Update Dockerfile dependency graph
entry: tools/pre_commit/update-dockerfile-graph.sh
language: script
- id: check-forbidden-imports
name: Check for forbidden imports
entry: python tools/pre_commit/check_forbidden_imports.py
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/pre_commit/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/pre_commit/check_triton_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
additional_dependencies: [regex]
@@ -139,15 +154,6 @@ repos:
files: ^docker/(Dockerfile|versions\.json)$
pass_filenames: false
additional_dependencies: [dockerfile-parse]
- id: attention-backend-docs
name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python
- id: check-boolean-context-manager
name: Check for boolean ops in with-statements
entry: python tools/pre_commit/check_boolean_context_manager.py
language: python
types: [python]
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@@ -9,14 +9,13 @@ build:
python: "3.12"
jobs:
post_checkout:
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv
create_environment:
- uv venv $READTHEDOCS_VIRTUALENV_PATH
install:
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yaml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: requirements/docs.txt

View File

@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
#
# Try to find python package with an executable that exactly matches
@@ -293,7 +293,6 @@ set(VLLM_EXT_SRC
"csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/topk.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu"
@@ -434,7 +433,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
endif()
if (MARLIN_SM75_ARCHS)
if (MARLIN_SM75_ARCHS)
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -446,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
endif()
if (MARLIN_FP8_ARCHS)
if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -459,6 +458,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/marlin.cu"
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/marlin/gptq_marlin_repack.cu"
@@ -771,24 +771,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
else()
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
"in CUDA target architectures.")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
@@ -1061,7 +1043,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
endif()
if (MARLIN_MOE_SM75_ARCHS)
if (MARLIN_MOE_SM75_ARCHS)
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SM75_SRC}"
@@ -1100,27 +1082,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures")
endif()
# DeepSeek V3 router GEMM kernel - requires SM90+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
set(DSV3_ROUTER_GEMM_SRC
"csrc/moe/dsv3_router_gemm_entry.cu"
"csrc/moe/dsv3_router_gemm_float_out.cu"
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_ROUTER_GEMM_SRC}"
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
else()
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
" (requires SM90+ and CUDA >= 12.0)")
endif()
endif()
message(STATUS "Enabling moe extension.")

View File

@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
## Usage
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
For full CLI reference see:

View File

@@ -1,266 +0,0 @@
# vLLM Attention Benchmarking Suite
Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
## Quick Start
```bash
cd benchmarks/attention_benchmarks
# Run a pre-configured benchmark
python benchmark.py --config configs/mla_decode.yaml
python benchmark.py --config configs/mla_mixed_batch.yaml
python benchmark.py --config configs/speculative_decode.yaml
python benchmark.py --config configs/standard_attention.yaml
python benchmark.py --config configs/reorder_threshold.yaml
# Or run custom benchmarks
python benchmark.py \
--backends flash flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--output-csv results.csv
```
## Simplified Batch Specification Grammar
Express workloads concisely using query length and sequence length:
```python
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
"q1s1k" # Decode: 1 token with 1K sequence
"8q1s1k" # 8 decode requests
"q4s1k" # 4-token extend (e.g., spec decode)
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
"16q4s1k" # 16 spec decode (4 tokens each)
```
### Grammar Rule
```text
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional; defaults to q_len, i.e. a pure prefill)
- 'k': Multiplies value by 1024
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
```
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
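For intuition, the suite's own parser (covered in the Python API section below) can expand a spec into per-request `(q_len, kv_len)` pairs. A minimal sketch, assuming it is run from `benchmarks/attention_benchmarks`:
```python
# Illustrative only: expand a mixed spec into (q_len, kv_len) tuples.
from batch_spec import parse_batch_spec

requests = parse_batch_spec("2q2k_32q1s1k")
print(len(requests))                         # 34 requests in total
print([r.as_tuple() for r in requests[:3]])
# [(2048, 2048), (2048, 2048), (1, 1024)] -- 2 prefills followed by 32 decodes
```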
## Pre-configured Benchmarks
The suite includes several pre-configured YAML benchmark configurations:
### MLA Decode Benchmark
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
```bash
python benchmark.py --config configs/mla_decode.yaml
```
### MLA Mixed Batch Benchmark
Tests chunked prefill performance with mixed prefill + decode batches.
```bash
python benchmark.py --config configs/mla_mixed_batch.yaml
```
### Speculative Decoding Benchmark
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
```bash
python benchmark.py --config configs/speculative_decode.yaml
```
### Standard Attention Benchmark
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
```bash
python benchmark.py --config configs/standard_attention.yaml
```
### Reorder Threshold Study
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
```bash
python benchmark.py --config configs/reorder_threshold.yaml
```
---
## Universal Benchmark
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
### Standard Attention (Flash/Triton/FlashInfer)
```bash
python benchmark.py \
--backends flash triton flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--num-layers 10 \
--repeats 5 \
--output-csv results.csv
```
### MLA Backends
```bash
# Compare all MLA backends
python benchmark.py \
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
--batch-specs "64q1s1k" "64q1s4k" \
--output-csv mla_results.csv
```
### Parameter Sweeps
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
#### CUTLASS MLA num-splits Optimization
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
```bash
python benchmark.py \
--backend cutlass_mla \
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
--sweep-param num_kv_splits \
--sweep-values 1 2 4 8 16 \
--output-json optimal_splits.json
```
#### Reorder Batch Threshold Optimization
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
```bash
python benchmark.py \
--backend flashmla \
--batch-specs "q4s1k" "q8s2k" \
--sweep-param reorder_batch_threshold \
--sweep-values 1 4 16 64 256 512 \
--output-csv threshold_sweep.csv
```
### All Command-Line Options
```text
--config CONFIG # Path to YAML config file (overrides other args)
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
# flashinfer_mla, flashattn_mla, flashmla
--backend BACKEND # Single backend (alternative to --backends)
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
# Model configuration
--num-layers N # Number of layers
--head-dim N # Head dimension
--num-q-heads N # Query heads
--num-kv-heads N # KV heads
--block-size N # Block size
# Benchmark settings
--device DEVICE # Device (default: cuda:0)
--repeats N # Repetitions
--warmup-iters N # Warmup iterations
--profile-memory # Profile memory usage
# Parameter sweeps
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
# reorder_batch_threshold)
--sweep-values N [N ...] # Values to sweep for the parameter
# Output
--output-csv FILE # Save to CSV
--output-json FILE # Save to JSON
```
## Hardware Requirements
| Backend | Hardware |
|---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) |
| FlashMLA | Hopper (SM90+) |
| FlashInfer-MLA | Any CUDA GPU |
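Which of these you can actually run depends on the local GPU. A quick capability check, illustrative only and assuming PyTorch with CUDA is available (the SM thresholds simply mirror the table above, not vLLM's own backend selection):
```python
# Pick backends to benchmark based on the hardware table above.
import torch

major, minor = torch.cuda.get_device_capability()
sm = major * 10 + minor  # e.g. 90 on Hopper, 100 on Blackwell

backends = ["flash", "triton", "flashinfer", "flashinfer_mla"]  # any CUDA GPU
if sm >= 90:
    backends += ["flashattn_mla", "flashmla"]  # Hopper (SM90) and newer
if sm >= 100:
    backends += ["cutlass_mla"]  # Blackwell (SM100) and newer
print("Backends to try:", " ".join(backends))
```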
## Using MLA Runner Directly
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
```python
from mla_runner import run_mla_benchmark
from common import BenchmarkConfig
config = BenchmarkConfig(
backend="cutlass_mla",
batch_spec="64q1s4k",
num_layers=10,
head_dim=576,
num_q_heads=128,
num_kv_heads=1,
block_size=128,
device="cuda:0",
repeats=5,
warmup_iters=3,
)
# CUTLASS MLA with specific num_kv_splits
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
print(f"Time: {result.mean_time:.6f}s")
# FlashInfer-MLA
result = run_mla_benchmark("flashinfer_mla", config)
# FlashAttn MLA (Hopper SM90+)
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
# FlashMLA (Hopper SM90+)
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
```
## Python API
```python
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
# Parse batch specs
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
print(format_batch_spec(requests))
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
# Get batch statistics
stats = get_batch_stats(requests)
print(f"Total tokens: {stats['total_tokens']}")
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
# Format results
formatter = ResultsFormatter()
formatter.save_csv(results, "output.csv")
formatter.save_json(results, "output.json")
```
## Tips
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
**2. Multiple repeats** - Use `--repeats 20` for low variance
**3. Save results** - Always use `--output-csv` or `--output-json`
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values

View File

@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM Attention Benchmarking Suite."""
from .batch_spec import (
BatchRequest,
format_batch_spec,
get_batch_stats,
parse_batch_spec,
reorder_for_flashinfer,
split_by_type,
)
from .common import (
BenchmarkConfig,
BenchmarkResult,
MockLayer,
MockModelConfig,
ResultsFormatter,
get_attention_scale,
is_mla_backend,
setup_mla_dims,
)
__all__ = [
# Batch specification
"BatchRequest",
"parse_batch_spec",
"format_batch_spec",
"reorder_for_flashinfer",
"split_by_type",
"get_batch_stats",
# Benchmarking infrastructure
"BenchmarkConfig",
"BenchmarkResult",
"ResultsFormatter",
# Mock objects
"MockLayer",
"MockModelConfig",
# Utilities
"setup_mla_dims",
"get_attention_scale",
"is_mla_backend",
]

View File

@@ -1,268 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simplified batch specification grammar for attention benchmarks.
Grammar (underscore-separated segments):
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k' suffix: Multiplies value by 1024
Common patterns:
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
Examples:
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
"""
from collections import Counter
from dataclasses import dataclass
import regex as re
@dataclass
class BatchRequest:
"""Represents a single request in a batch."""
q_len: int # Query length (number of new tokens)
kv_len: int # Total KV cache length
@property
def is_decode(self) -> bool:
"""True if this is a decode request (q_len == 1)."""
return self.q_len == 1
@property
def is_prefill(self) -> bool:
"""True if this is a pure prefill (q_len == kv_len)."""
return self.q_len == self.kv_len
@property
def is_extend(self) -> bool:
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
return self.q_len > 1 and self.kv_len > self.q_len
@property
def context_len(self) -> int:
"""Context length (KV cache - query)."""
return self.kv_len - self.q_len
def as_tuple(self) -> tuple[int, int]:
"""Return as (q_len, kv_len) tuple for compatibility."""
return (self.q_len, self.kv_len)
def _parse_size(size_str: str, k_suffix: str) -> int:
"""Parse size string with optional 'k' suffix."""
size = int(size_str)
return size * 1024 if k_suffix == "k" else size
def parse_batch_spec(spec: str) -> list[BatchRequest]:
"""
Parse batch specification string into list of BatchRequest objects.
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
Args:
spec: Batch specification string (see module docstring for grammar)
Returns:
List of BatchRequest objects
Raises:
ValueError: If spec format is invalid
"""
requests = []
for seg in spec.split("_"):
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
if m:
cnt = int(m.group(1)) if m.group(1) else 1
q_len = _parse_size(m.group(2), m.group(3))
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
continue
raise ValueError(f"Invalid batch spec segment: '{seg}'")
return requests
def format_batch_spec(requests: list[BatchRequest]) -> str:
"""
Format list of BatchRequest into human-readable string.
Groups requests by type and provides counts and sizes.
Args:
requests: List of BatchRequest objects
Returns:
Formatted string describing the batch
"""
kinds = {
"prefill": [],
"extend": [],
"decode": [],
}
for req in requests:
tup = (req.q_len, req.kv_len)
if req.is_prefill:
kinds["prefill"].append(tup)
elif req.is_extend:
kinds["extend"].append(tup)
elif req.is_decode:
kinds["decode"].append(tup)
parts = []
for kind in ["prefill", "extend", "decode"]:
lst = kinds[kind]
if not lst:
continue
cnt_total = len(lst)
ctr = Counter(lst)
inner = []
for (q, kv), cnt in ctr.items():
if kind == "prefill":
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
inner.append(f"{cnt}x{size}")
elif kind == "decode":
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}x{size}")
else: # extend
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}xq{qstr}kv{kstr}")
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
return ", ".join(parts)
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
"""
Reorder requests for FlashInfer: decode first, then prefill.
FlashInfer expects decode requests before prefill requests for
optimal performance.
Args:
requests: Original list of BatchRequest
Returns:
Reordered list with decode requests first
"""
decodes = [r for r in requests if r.is_decode]
non_decodes = [r for r in requests if not r.is_decode]
return decodes + non_decodes
def split_by_type(
requests: list[BatchRequest],
) -> dict[str, list[BatchRequest]]:
"""
Split requests by type for analysis.
Args:
requests: List of BatchRequest
Returns:
Dict with keys: 'decode', 'prefill', 'extend'
"""
result = {
"decode": [],
"prefill": [],
"extend": [],
}
for req in requests:
if req.is_decode:
result["decode"].append(req)
elif req.is_prefill:
result["prefill"].append(req)
elif req.is_extend:
result["extend"].append(req)
return result
def get_batch_stats(requests: list[BatchRequest]) -> dict:
"""
Compute statistics about a batch.
Args:
requests: List of BatchRequest
Returns:
Dict with batch statistics
"""
by_type = split_by_type(requests)
return {
"total_requests": len(requests),
"num_decode": len(by_type["decode"]),
"num_prefill": len(by_type["prefill"]),
"num_extend": len(by_type["extend"]),
"total_tokens": sum(r.q_len for r in requests),
"total_kv_cache": sum(r.kv_len for r in requests),
"max_q_len": max((r.q_len for r in requests), default=0),
"max_kv_len": max((r.kv_len for r in requests), default=0),
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
"avg_kv_len": (
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
"""
Classify a batch spec into a type string.
Args:
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
Returns:
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
"""
requests = parse_batch_spec(batch_spec)
# Classify each request
types_present = set()
for req in requests:
if req.is_decode:
types_present.add("decode")
elif req.is_prefill:
types_present.add("prefill")
elif req.is_extend:
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
if req.q_len <= spec_decode_threshold:
types_present.add("spec-decode")
else:
types_present.add("extend")
if len(types_present) == 1:
return types_present.pop()
elif len(types_present) > 1:
# Sort for consistent output
sorted_types = sorted(types_present)
return f"mixed ({'+'.join(sorted_types)})"
else:
return "unknown"

View File

@@ -1,895 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Universal vLLM Attention Benchmark
Benchmark any attention backend with the extended grammar.
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
Examples:
# Standard attention
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
# MLA backends
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
# Parameter sweep (CLI)
python benchmark.py --backend cutlass_mla \
--batch-specs "64q1s1k" \
--sweep-param num_kv_splits \
--sweep-values 1 4 8 16
# Parameter sweep (YAML config - recommended)
python benchmark.py --config configs/cutlass_numsplits.yaml
"""
import argparse
import sys
from dataclasses import replace
from pathlib import Path
import yaml
from rich.console import Console
from tqdm import tqdm
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from batch_spec import parse_batch_spec
from common import (
BenchmarkConfig,
BenchmarkResult,
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
batch_spec_sort_key,
is_mla_backend,
)
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
from runner import run_attention_benchmark
return run_attention_benchmark(config)
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""
Run a single benchmark with proper backend selection.
Args:
config: BenchmarkConfig with backend, batch_spec, and model params
**kwargs: Additional arguments passed to MLA benchmarks
Returns:
BenchmarkResult (may have error field set on failure)
"""
try:
if is_mla_backend(config.backend):
return run_mla_benchmark(config, **kwargs)
else:
return run_standard_attention_benchmark(config)
except Exception as e:
return BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
def run_model_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ModelParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run model parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ModelParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
console.print(
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
)
total = len(backends) * len(batch_specs) * len(sweep.values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep.values:
# Create config with modified model parameter
config_args = base_config_args.copy()
config_args[sweep.param_name] = value
# Create config with original backend for running
clean_config = BenchmarkConfig(
backend=backend, batch_spec=spec, **config_args
)
# Run benchmark
result = run_benchmark(clean_config)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results - create separate table for each parameter value
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
formatter = ResultsFormatter(console)
# Group results by parameter value and extract backend mapping
by_param_value = {}
backend_mapping = {} # Maps labeled backend -> original backend
for r in all_results:
# Extract original backend and param value from labeled backend
# The label format is: {backend}_{param_name}_{value}
# We need to reverse engineer this
labeled_backend = r.config.backend
# Try each backend to find which one this result belongs to
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
backend_mapping[labeled_backend] = backend
param_value = str(value)
if param_value not in by_param_value:
by_param_value[param_value] = []
by_param_value[param_value].append(r)
break
# Create a table for each parameter value
sorted_param_values = sorted(
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
)
for param_value in sorted_param_values:
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
param_results = by_param_value[param_value]
# Create modified results with original backend names
modified_results = []
for r in param_results:
# Get the original backend name from our mapping
original_backend = backend_mapping[r.config.backend]
modified_config = replace(r.config, backend=original_backend)
modified_result = replace(r, config=modified_config)
modified_results.append(modified_result)
# Print table with original backend names
formatter.print_table(modified_results, backends, compare_to_fastest=True)
# Show optimal backend for each (param_value, batch_spec) combination
console.print(
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
)
# Group by (param_value, batch_spec)
by_param_and_spec = {}
for r in all_results:
if r.success:
# Find which (backend, value) this result corresponds to
labeled_backend = r.config.backend
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
param_value = str(value)
spec = r.config.batch_spec
key = (param_value, spec)
if key not in by_param_and_spec:
by_param_and_spec[key] = []
by_param_and_spec[key].append(r)
break
# Sort by param value then spec (batch_size, q_len, kv_len)
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
)
current_param_value = None
for param_value, spec in sorted_keys:
# Print header when param value changes
if param_value != current_param_value:
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
current_param_value = param_value
results = by_param_and_spec[(param_value, spec)]
best = min(results, key=lambda r: r.mean_time)
# Extract original backend name using the mapping
backend_name = backend_mapping[best.config.backend]
# Show all backends' times for comparison
times_str = " | ".join(
[
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
for r in sorted(results, key=lambda r: r.mean_time)
]
)
console.print(
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
)
return all_results
def run_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
# Build list of values to sweep (including auto if requested)
sweep_values = list(sweep.values)
if sweep.include_auto:
sweep_values.append("auto")
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
total = len(backends) * len(batch_specs) * len(sweep_values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep_values:
# Create config with original backend for running
config = BenchmarkConfig(
backend=backend, batch_spec=spec, **base_config_args
)
# Prepare kwargs for benchmark runner
kwargs = {}
if value != "auto":
kwargs[sweep.param_name] = value
# Run benchmark
result = run_benchmark(config, **kwargs)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results
console.print("\n[bold green]Sweep Results:[/]")
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backend_labels)
# Show optimal values
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
by_spec = {}
for r in all_results:
if r.success:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
)
return all_results
def load_config_from_yaml(config_path: str) -> dict:
"""Load configuration from YAML file."""
with open(config_path) as f:
return yaml.safe_load(f)
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
"""
Generate batch specs from range specifications.
Args:
ranges: List of range specifications, each containing:
- template: Batch spec template (e.g., "q{q_len}kv1k")
- q_len: Dict with start, stop, step, end_inclusive (optional)
- Other parameters can also be ranges
Returns:
List of generated batch spec strings
Example:
ranges = [
{
"template": "q{q_len}kv1k",
"q_len": {
"start": 1,
"stop": 16,
"step": 1,
"end_inclusive": true # Optional, defaults to true
}
}
]
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
"""
all_specs = []
for range_spec in ranges:
template = range_spec.get("template")
if not template:
raise ValueError("Range specification must include 'template'")
# Extract all range parameters from the spec
range_params = {}
for key, value in range_spec.items():
if key == "template":
continue
if isinstance(value, dict) and "start" in value:
# This is a range specification
start = value["start"]
stop = value["stop"]
step = value.get("step", 1)
# Check if end should be inclusive (default: True)
end_inclusive = value.get("end_inclusive", True)
# Adjust stop based on end_inclusive
if end_inclusive:
range_params[key] = list(range(start, stop + 1, step))
else:
range_params[key] = list(range(start, stop, step))
else:
# This is a fixed value
range_params[key] = [value]
# Generate all combinations (Cartesian product)
if range_params:
import itertools
param_names = list(range_params.keys())
param_values = [range_params[name] for name in param_names]
for values in itertools.product(*param_values):
params = dict(zip(param_names, values))
spec = template.format(**params)
all_specs.append(spec)
else:
# No parameters, just use template as-is
all_specs.append(template)
return all_specs
def main():
parser = argparse.ArgumentParser(
description="Universal vLLM attention benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Config file
parser.add_argument(
"--config",
help="Path to YAML config file (overrides other args)",
)
# Backend selection
parser.add_argument(
"--backends",
nargs="+",
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
# Batch specifications
parser.add_argument(
"--batch-specs",
nargs="+",
default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar",
)
# Model config
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
parser.add_argument("--block-size", type=int, default=16, help="Block size")
# Benchmark settings
parser.add_argument("--device", default="cuda:0", help="Device")
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
# Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument(
"--sweep-param",
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
)
parser.add_argument(
"--sweep-values",
type=int,
nargs="+",
help="Values to sweep for the parameter",
)
# Output
parser.add_argument("--output-csv", help="Save to CSV")
parser.add_argument("--output-json", help="Save to JSON")
args = parser.parse_args()
console = Console()
console.print("[bold cyan]vLLM Attention Benchmark[/]")
# Load config from YAML if provided
if args.config:
console.print(f"[yellow]Loading config from: {args.config}[/]")
yaml_config = load_config_from_yaml(args.config)
# Show description if available
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
args.mode = yaml_config["mode"]
else:
args.mode = None
# Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"]
else:
args.batch_sizes = None
# Model config
if "model" in yaml_config:
model = yaml_config["model"]
args.num_layers = model.get("num_layers", args.num_layers)
args.head_dim = model.get("head_dim", args.head_dim)
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings (top-level keys)
if "device" in yaml_config:
args.device = yaml_config["device"]
if "repeats" in yaml_config:
args.repeats = yaml_config["repeats"]
if "warmup_iters" in yaml_config:
args.warmup_iters = yaml_config["warmup_iters"]
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:
sweep_config = yaml_config["parameter_sweep"]
args.parameter_sweep = ParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
include_auto=sweep_config.get("include_auto", False),
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.parameter_sweep = None
# Model parameter sweep configuration
if "model_parameter_sweep" in yaml_config:
sweep_config = yaml_config["model_parameter_sweep"]
args.model_parameter_sweep = ModelParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.model_parameter_sweep = None
# Output
if "output" in yaml_config:
output = yaml_config["output"]
if "csv" in output and not args.output_csv:
args.output_csv = output["csv"]
if "json" in output and not args.output_json:
args.output_json = output["json"]
console.print()
# Handle CLI-based parameter sweep (if not from YAML)
if (
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
and args.sweep_param
and args.sweep_values
):
args.parameter_sweep = ParameterSweep(
param_name=args.sweep_param,
values=args.sweep_values,
include_auto=False,
label_format="{backend}_{param_name}_{value}",
)
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
console.print(f"Backends: {', '.join(backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
# Run benchmarks
all_results = []
# Handle special mode: decode_vs_prefill comparison
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
console.print(
"[dim]For each query length, testing both decode and prefill pipelines[/]"
)
console.print("[dim]Using batched execution for optimal performance[/]")
# Extract batch sizes from config
batch_sizes = getattr(args, "batch_sizes", [1])
backend = backends[0] # Use first backend (should only be one)
# Calculate total benchmarks
total = len(batch_sizes)
with tqdm(total=total, desc="Benchmarking") as pbar:
for batch_size in batch_sizes:
# Prepare all configs for this batch size
configs_with_thresholds = []
for spec in args.batch_specs:
# Parse the batch spec to get query length
requests = parse_batch_spec(spec)
if not requests:
console.print(
f"[red]Error: Could not parse batch spec '{spec}'[/]"
)
continue
# Get query length from first request
query_length = requests[0].q_len
# Create batch spec for this batch size
# For batch_size > 1, we need to prepend the count
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
# Create base config (without backend name)
base_config = BenchmarkConfig(
backend=backend, # Will be overridden later
batch_spec=batch_spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
# Add decode pipeline config
decode_threshold = query_length
config_decode = replace(
base_config,
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
)
configs_with_thresholds.append((config_decode, decode_threshold))
# Add prefill pipeline config if query_length > 1
if query_length > 1:
prefill_threshold = query_length - 1
config_prefill = replace(
base_config,
backend=f"{backend}_prefill_qlen{query_length}"
f"_bs{batch_size}",
)
configs_with_thresholds.append(
(config_prefill, prefill_threshold)
)
# Run all benchmarks for this batch size in one go (batched mode)
try:
from mla_runner import run_mla_benchmark as run_mla
# Use batched API: pass list of (config, threshold) tuples
timing_results = run_mla(backend, configs_with_thresholds)
# Create BenchmarkResult objects from timing results
for (config, _), timing in zip(
configs_with_thresholds, timing_results
):
result = BenchmarkResult(
config=config,
mean_time=timing["mean"],
std_time=timing["std"],
min_time=timing["min"],
max_time=timing["max"],
throughput_tokens_per_sec=timing.get("throughput", None),
)
all_results.append(result)
except Exception as e:
import traceback
console.print(
f"[red]Error running batched benchmarks for "
f"batch_size={batch_size}: {e}[/]"
)
console.print("[red]Traceback:[/]")
traceback.print_exc()
# Add error results for all configs
for config, _ in configs_with_thresholds:
result = BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
all_results.append(result)
pbar.update(1)
# Display decode vs prefill results
console.print("\n[bold green]Decode vs Prefill Results:[/]")
# Group by batch size
by_batch_size = {}
for r in all_results:
if r.success:
# Extract batch size from backend name
parts = r.config.backend.split("_")
bs_part = [p for p in parts if p.startswith("bs")]
if bs_part:
bs = int(bs_part[0][2:])
if bs not in by_batch_size:
by_batch_size[bs] = []
by_batch_size[bs].append(r)
# For each batch size, analyze crossover point
for bs in sorted(by_batch_size.keys()):
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
results = by_batch_size[bs]
# Group by query length
by_qlen = {}
for r in results:
parts = r.config.backend.split("_")
qlen_part = [p for p in parts if p.startswith("qlen")]
if qlen_part:
qlen = int(qlen_part[0][4:])
if qlen not in by_qlen:
by_qlen[qlen] = {}
pipeline = "decode" if "decode" in r.config.backend else "prefill"
by_qlen[qlen][pipeline] = r
# Find crossover point
last_decode_faster = None
for qlen in sorted(by_qlen.keys()):
pipelines = by_qlen[qlen]
if "decode" in pipelines and "prefill" in pipelines:
decode_time = pipelines["decode"].mean_time
prefill_time = pipelines["prefill"].mean_time
faster = "decode" if decode_time < prefill_time else "prefill"
speedup = (
prefill_time / decode_time
if decode_time < prefill_time
else decode_time / prefill_time
)
console.print(
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
f"prefill={prefill_time:.6f}s -> "
f"[bold]{faster}[/] ({speedup:.2f}x)"
)
if faster == "decode":
last_decode_faster = qlen
if last_decode_faster is not None:
optimal_threshold = last_decode_faster
console.print(
f"\n [bold green]Optimal threshold for batch_size={bs}: "
f"{optimal_threshold}[/]"
)
console.print(
f" [dim](Use decode pipeline for query_length <= "
f"{optimal_threshold})[/]"
)
else:
console.print(
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
)
# Handle model parameter sweep mode
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
# Model parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_model_parameter_sweep(
backends,
args.batch_specs,
base_config_args,
args.model_parameter_sweep,
console,
)
# Handle parameter sweep mode (unified)
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
# Unified parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
)
else:
# Normal mode: compare backends
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
pbar.update(1)
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:
formatter = ResultsFormatter(console)
if args.output_csv:
formatter.save_csv(all_results, args.output_csv)
if args.output_json:
formatter.save_json(all_results, args.output_json)
if __name__ == "__main__":
main()

View File

@@ -1,568 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common utilities for attention benchmarking."""
import csv
import json
import math
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import numpy as np
import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console
from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
"""
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
This ensures results are sorted by batch size first, then query length,
then sequence length, rather than alphabetically.
"""
try:
requests = parse_batch_spec(spec)
batch_size = len(requests)
max_q_len = max(r.q_len for r in requests) if requests else 0
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparseable specs
return (0, 0, 0)
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict, index_topk: int | None = None):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
self.model_type = "deepseek_v2"
self.is_encoder_decoder = False
self.kv_lora_rank = mla_dims["kv_lora_rank"]
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self):
return self
# Import AttentionLayerBase at module level to avoid circular dependencies
try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
_HAS_ATTENTION_LAYER_BASE = True
except ImportError:
_HAS_ATTENTION_LAYER_BASE = False
AttentionLayerBase = object # Fallback
class MockKVBProj:
"""Mock KV projection layer for MLA prefill mode.
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
"""
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
self.num_heads = num_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
Project kv_c_normed to output space.
Args:
x: Input tensor [num_tokens, kv_lora_rank]
Returns:
Tuple containing output tensor
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
"""
num_tokens = x.shape[0]
result = torch.randn(
num_tokens,
self.num_heads,
self.out_dim,
device=x.device,
dtype=x.dtype,
)
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
"""Mock Indexer for sparse MLA backends.
Provides topk_indices_buffer that sparse MLA backends use to determine
which KV cache slots to attend to for each token.
"""
def __init__(
self,
max_num_tokens: int,
topk_tokens: int,
device: torch.device,
):
self.topk_tokens = topk_tokens
self.topk_indices_buffer = torch.zeros(
(max_num_tokens, topk_tokens),
dtype=torch.int32,
device=device,
)
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
indices = torch.randint(
0,
max_kv_len,
(num_tokens, self.topk_tokens),
dtype=torch.int32,
device=self.topk_indices_buffer.device,
)
self.topk_indices_buffer[:num_tokens] = indices
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
Inherits from AttentionLayerBase so it passes isinstance checks
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
"""
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
self._k_scale = torch.tensor(1.0, device=device)
self._v_scale = torch.tensor(1.0, device=device)
self._q_scale = torch.tensor(1.0, device=device)
# Scalar floats for kernels that need them
self._k_scale_float = float(self._k_scale.item())
self._v_scale_float = float(self._v_scale.item())
self._q_scale_float = float(self._q_scale.item())
# AttentionImpl for metadata builders to query
self.impl = impl
# KV cache spec for get_kv_cache_spec
self._kv_cache_spec = kv_cache_spec
def get_attn_backend(self):
"""Get the attention backend class (required by AttentionLayerBase)."""
# Return None as this is just a mock layer for benchmarking
return None
def get_kv_cache_spec(self):
"""Get the KV cache spec (required by AttentionLayerBase)."""
return self._kv_cache_spec
class MockModelConfig:
"""Mock model configuration."""
def __init__(
self,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.float16,
max_model_len: int = 32768,
):
self._n_q = num_q_heads
self._n_kv = num_kv_heads
self._d = head_dim
self.dtype = dtype
self.max_model_len = max_model_len
def get_num_attention_heads(self, _=None) -> int:
return self._n_q
def get_num_kv_heads(self, _=None) -> int:
return self._n_kv
def get_head_size(self) -> int:
return self._d
def get_num_layers(self) -> int:
"""Mock method for layer count queries."""
return 1
def get_sliding_window_for_layer(self, _layer_idx: int):
"""Mock method for sliding window queries."""
return None
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
"""Mock method for logits soft cap queries."""
return None
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
"""Mock method for SM scale queries."""
return 1.0 / (self.get_head_size() ** 0.5)
class MockParallelConfig:
"""Mock parallel configuration."""
pass
class MockCompilationConfig:
"""Mock compilation configuration."""
def __init__(self):
self.full_cuda_graph = False
self.static_forward_context = {}
class MockVLLMConfig:
"""Mock VLLM configuration."""
def __init__(self):
self.compilation_config = MockCompilationConfig()
class MockRunner:
"""Mock GPU runner for metadata builders."""
def __init__(
self,
seq_lens: np.ndarray,
query_start_locs: np.ndarray,
device: torch.device,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype,
):
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
self.parallel_config = MockParallelConfig()
self.vllm_config = MockVLLMConfig()
self.seq_lens_np = seq_lens
self.query_start_loc_np = query_start_locs
self.device = device
self.attention_chunk_size = None
self.num_query_heads = num_q_heads
self.num_kv_heads = num_kv_heads
self.dtype = dtype
@dataclass
class ParameterSweep:
"""Configuration for sweeping a backend parameter."""
param_name: str # Name of the backend parameter to sweep
values: list[Any] # List of values to test
include_auto: bool = False # Also test with param unset (auto mode)
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class ModelParameterSweep:
"""Configuration for sweeping a model configuration parameter."""
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
values: list[Any] # List of values to test
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark run."""
backend: str
batch_spec: str
num_layers: int
head_dim: int
num_q_heads: int
num_kv_heads: int
block_size: int
device: str
dtype: torch.dtype = torch.float16
repeats: int = 1
warmup_iters: int = 3
profile_memory: bool = False
use_cuda_graphs: bool = False
# MLA-specific
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None
v_head_dim: int | None = None
# Backend-specific tuning
num_kv_splits: int | None = None # CUTLASS MLA
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
@dataclass
class BenchmarkResult:
"""Results from a single benchmark run."""
config: BenchmarkConfig
mean_time: float # seconds
std_time: float # seconds
min_time: float # seconds
max_time: float # seconds
throughput_tokens_per_sec: float | None = None
memory_allocated_mb: float | None = None
memory_reserved_mb: float | None = None
error: str | None = None
@property
def success(self) -> bool:
"""Whether benchmark completed successfully."""
return self.error is None
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"config": asdict(self.config),
"mean_time": self.mean_time,
"std_time": self.std_time,
"min_time": self.min_time,
"max_time": self.max_time,
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
"memory_allocated_mb": self.memory_allocated_mb,
"memory_reserved_mb": self.memory_reserved_mb,
"error": self.error,
}
class ResultsFormatter:
"""Format and display benchmark results."""
def __init__(self, console: Console | None = None):
self.console = console or Console()
def print_table(
self,
results: list[BenchmarkResult],
backends: list[str],
compare_to_fastest: bool = True,
):
"""
Print results as a rich table.
Args:
results: List of BenchmarkResult
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group by batch spec, preserving first-occurrence order
by_spec = {}
specs_order = []
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
# Remove common prefixes
name = name.replace("flashattn_mla", "famla")
name = name.replace("flashinfer_mla", "fimla")
name = name.replace("flashmla", "fmla")
name = name.replace("cutlass_mla", "cmla")
name = name.replace("numsplits", "ns")
return name
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
short_name = shorten_backend_name(backend)
# Time column
col_time = f"{short_name}\nTime (s)"
table.add_column(col_time, justify="right", no_wrap=False)
if multi and compare_to_fastest:
# Relative performance column
col_rel = f"{short_name}\nvs Best"
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in specs_order:
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
batch_type = get_batch_type(spec)
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
if r.success:
row.append(f"{r.mean_time:.6f}")
if multi and compare_to_fastest:
pct = (
(r.mean_time / best_time * 100) if best_time > 0 else 0
)
pct_str = f"{pct:.1f}%"
if r.mean_time == best_time:
pct_str = f"[bold green]{pct_str}[/]"
row.append(pct_str)
else:
row.append("[red]ERROR[/]")
if multi and compare_to_fastest:
row.append("-")
else:
row.append("-")
if multi and compare_to_fastest:
row.append("-")
table.add_row(*row)
self.console.print(table)
def save_csv(self, results: list[BenchmarkResult], path: str):
"""Save results to CSV file."""
if not results:
return
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", newline="") as f:
writer = csv.DictWriter(
f,
fieldnames=[
"backend",
"batch_spec",
"num_layers",
"mean_time",
"std_time",
"throughput",
"memory_mb",
],
)
writer.writeheader()
for r in results:
writer.writerow(
{
"backend": r.config.backend,
"batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers,
"mean_time": r.mean_time,
"std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0,
"memory_mb": r.memory_allocated_mb or 0,
}
)
self.console.print(f"[green]Saved CSV results to {path}[/]")
def save_json(self, results: list[BenchmarkResult], path: str):
"""Save results to JSON file."""
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
data = [r.to_dict() for r in results]
with open(path, "w") as f:
json.dump(data, f, indent=2, default=str)
self.console.print(f"[green]Saved JSON results to {path}[/]")
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
"""
Get MLA dimensions for known models.
Args:
model_name: Model identifier
Returns:
Dict with MLA dimension configuration
"""
configs = {
"deepseek-v2": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v3": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v2-lite": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 16,
"num_kv_heads": 1,
"head_dim": 576,
},
}
if model_name not in configs:
raise ValueError(
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
)
return configs[model_name]
def get_attention_scale(head_dim: int) -> float:
"""Compute attention scale factor (1/sqrt(d))."""
return 1.0 / math.sqrt(head_dim)
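# Illustrative combination of the two helpers above (a sketch, not called by the
# benchmarks themselves): for DeepSeek-V3 MLA the query head dim seen by the
# kernel is qk_nope_head_dim + qk_rope_head_dim = 128 + 64 = 192, so:
#
#     dims = setup_mla_dims("deepseek-v3")
#     scale = get_attention_scale(dims["qk_nope_head_dim"] + dims["qk_rope_head_dim"])
#     # scale == 1 / sqrt(192) ~= 0.0722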
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
True if the backend is an MLA backend, False otherwise
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError, AttributeError):
return False
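# Example behaviour (hedged: exact membership depends on the AttentionBackendEnum
# of the installed vLLM build): is_mla_backend("FLASHMLA") and
# is_mla_backend("CUTLASS_MLA") are expected to return True, while
# is_mla_backend("FLASH_ATTN") returns False and an unrecognised name falls into
# the except clause and also returns False.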

View File

@@ -1,70 +0,0 @@
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
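# Illustrative expansion (assuming {backend} and {value} are substituted verbatim):
# with the CUTLASS_MLA backend this sweep produces the result labels
# "CUTLASS_MLA_128h", "CUTLASS_MLA_64h", "CUTLASS_MLA_32h" and "CUTLASS_MLA_16h".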
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true
# Backend-specific tuning
CUTLASS_MLA:
num_kv_splits: auto # or specific value like 4, 8, 16
FLASH_ATTN_MLA:
reorder_batch_threshold: 512
FLASHMLA:
reorder_batch_threshold: 1

View File

@@ -1,60 +0,0 @@
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
batch_specs:
# Small prefill + decode
- "1q1k_8q1s1k" # 1 prefill + 8 decode
- "2q2k_16q1s1k" # 2 prefill + 16 decode
- "4q1k_32q1s2k" # 4 prefill + 32 decode
# Medium prefill + decode
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
- "8q2k_64q1s4k" # 8 prefill + 64 decode
# Large prefill + decode (chunked prefill stress test)
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint
- "q16k" # 16k prefill with chunking hint
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
# High decode ratio (realistic serving)
- "1q2k_63q1s1k" # 1 prefill + 63 decode
- "2q2k_62q1s2k" # 2 prefill + 62 decode
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Analyze chunked prefill workspace size impact
chunked_prefill:
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]

View File

@@ -1,62 +0,0 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -1,87 +0,0 @@
# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
# Question: At what query length does the prefill pipeline become faster than the decode pipeline?
# Methodology: For each query length, compare decode vs prefill performance to find the crossover point
# Applies to: FlashAttn MLA, FlashMLA
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: FLASH_ATTN_MLA
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
mode: "decode_vs_prefill"
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
# Each query length will be tested with BOTH decode and prefill pipelines:
# - decode: threshold >= query_length (forces decode pipeline)
# - prefill: threshold < query_length (forces prefill pipeline)
#
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
# This tests different query lengths with fixed sequence length context
#
# Using batch_spec_ranges for automatic generation:
batch_spec_ranges:
- template: "q{q_len}s1k"
q_len:
start: 1
stop: 16
step: 1
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 16
stop: 64
step: 2
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 64
stop: 1024
step: 4
end_inclusive: true
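# Illustrative expansion of the three ranges above (assuming q_len is substituted
# directly into the template):
#   q1s1k, q2s1k, ..., q15s1k          (step 1, stop 16 exclusive)
#   q16s1k, q18s1k, ..., q62s1k        (step 2, stop 64 exclusive)
#   q64s1k, q68s1k, ..., q1024s1k      (step 4, stop 1024 inclusive)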
# Batch sizes to test (from old script)
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
# Model configuration (DeepSeek V2/V3 defaults)
model:
num_layers: 10
head_dim: 576
num_q_heads: 128
num_kv_heads: 1
block_size: 128
# Benchmark settings
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:
csv: "reorder_threshold_results.csv"
json: "reorder_threshold_results.json"
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
# - For each batch size, find the crossover point where prefill becomes faster than decode
# - Show decode vs prefill performance across all query lengths
# - Determine the optimal reorder_batch_threshold from the last query length where decode is still faster
# - Understand how the crossover point varies with batch size
# - Provide data-driven guidance for the default threshold value
#
# Methodology (from old script):
# - Each query length tested with BOTH pipelines:
# * decode: threshold >= query_length (forces decode pipeline)
# * prefill: threshold < query_length (forces prefill pipeline)
# - Compare which is faster to find crossover point
#

View File

@@ -1,61 +0,0 @@
# Speculative decoding benchmark configuration
# Tests reorder_batch_threshold optimization
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
batch_specs:
# Pure speculative decode (K-token verification)
- "q2s1k" # 2-token spec, 1k KV
- "q4s1k" # 4-token spec, 1k KV
- "q8s1k" # 8-token spec, 1k KV
- "q16s1k" # 16-token spec, 1k KV
# Speculative with different context lengths
- "q4s2k" # 4-token spec, 2k KV
- "q4s4k" # 4-token spec, 4k KV
- "q8s2k" # 8-token spec, 2k KV
- "q8s4k" # 8-token spec, 4k KV
# Mixed: speculative + regular decode
- "32q4s1k" # 32 spec requests
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
# Mixed: speculative + prefill + decode
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
# Large batches with speculation
- "64q4s1k" # 64 spec requests
- "32q8s2k" # 32 spec (8-token)
- "16q16s4k" # 16 spec (16-token)
# Backends that support query length > 1
backends:
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with a different mechanism
# - FLASHINFER_MLA
# Benchmark settings
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:
param_name: "reorder_batch_threshold"
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
include_auto: false
label_format: "{backend}_threshold_{value}"

View File

@@ -1,48 +0,0 @@
# Standard attention backend benchmark configuration
model:
num_layers: 32
num_q_heads: 32
num_kv_heads: 8 # GQA with 4:1 ratio
head_dim: 128
block_size: 16
batch_specs:
# Pure prefill
- "q512" # Small prefill (512 tokens)
- "q2k" # Medium prefill (2048 tokens)
- "q4k" # Large prefill (4096 tokens)
- "q8k" # Very large prefill (8192 tokens)
# Pure decode
- "8q1s1k" # 8 requests, 1k KV cache each
- "16q1s2k" # 16 requests, 2k KV cache each
- "32q1s1k" # 32 requests, 1k KV cache each
- "64q1s4k" # 64 requests, 4k KV cache each
# Mixed prefill/decode
- "2q2k_8q1s1k" # 2 prefill + 8 decode
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Speculative decode (q <= 8)
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends:
- FLASH_ATTN
- TRITON_ATTN
- FLASHINFER
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: false

View File

@@ -1,891 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MLA benchmark runner - shared utilities for MLA benchmarks.
This module provides helpers for running MLA backends against a minimal,
mocked-up VllmConfig, without needing a full engine or model setup.
"""
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockIndexer,
MockKVBProj,
MockLayer,
setup_mla_dims,
)
from vllm.config import (
CacheConfig,
CompilationConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
# ============================================================================
# VllmConfig Creation
# ============================================================================
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
"""
Add mock methods for layer-specific queries to ModelConfig.
Metadata builders call these layer-specific accessors, but the minimal
ModelConfig built for benchmarking cannot derive sensible values for them,
so we attach lightweight mocks that return fixed defaults.
"""
import types
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
)
def create_minimal_vllm_config(
model_name: str = "deepseek-v3",
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
Args:
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
provided
block_size: KV cache block size
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
Returns:
VllmConfig for benchmarking
"""
# Get MLA dimensions - use provided or load from model name
if mla_dims is None:
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
import json
import os
import shutil
import tempfile
minimal_config = {
"architectures": ["DeepseekV2ForCausalLM"],
"model_type": "deepseek_v2",
"num_attention_heads": mla_dims["num_q_heads"],
"num_key_value_heads": mla_dims["num_kv_heads"],
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
"torch_dtype": "bfloat16",
"max_position_embeddings": 163840, # DeepSeek V3 default
"rope_theta": 10000.0,
"vocab_size": 128256,
}
# Create temporary directory with config.json
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
config_path = os.path.join(temp_dir, "config.json")
with open(config_path, "w") as f:
json.dump(minimal_config, f)
try:
# Create model config using local path - no HF downloads
model_config = ModelConfig(
model=temp_dir, # Use local temp directory
tokenizer=None,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
seed=0,
max_model_len=32768,
quantization=None,
enforce_eager=False,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
config_format="auto",
)
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Override with our mock config
model_config.hf_config = mock_hf_config
model_config.hf_text_config = mock_hf_config
# Add mock methods for layer-specific queries
_add_mock_methods_to_model_config(model_config)
# Create sub-configs
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=8192,
max_model_len=32768,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
parallel_config = ParallelConfig(
tensor_parallel_size=1,
)
compilation_config = CompilationConfig()
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
compilation_config=compilation_config,
)
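# Typical usage within this module (see _run_mla_benchmark_batched below): the
# config is built once and installed as the current vLLM config so that backend
# builders can read it, e.g.
#
#     vllm_config = create_minimal_vllm_config("deepseek-v3", block_size=128)
#     with set_current_vllm_config(vllm_config):
#         ...  # create impl / builder and run the benchmarks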
# ============================================================================
# Backend Configuration
# ============================================================================
# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
_BACKEND_PROPERTIES = {
"FLASHMLA": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
},
"FLASHMLA_SPARSE": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Uses the registry to get the backend class and extract configuration
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
Dict with backend configuration
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if hasattr(block_size, "value"):
# Not a concrete integer (e.g. a MultipleOf wrapper); fall back to the config's block size
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
props = _BACKEND_PROPERTIES.get(backend, {})
return {
"backend_class": backend_class,
"impl_class": backend_class.get_impl_cls(),
"builder_class": backend_class.get_builder_cls(),
"query_format": props.get("query_format", "tuple"),
"block_size": block_size,
"is_sparse": is_sparse,
}
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_attention_metadata(
requests: list,
block_size: int,
device: torch.device,
builder_instance,
) -> tuple:
"""
Build attention metadata from batch requests.
Args:
requests: List of BatchRequest objects
block_size: KV cache block size
device: Target device
builder_instance: Metadata builder instance
Returns:
Tuple of (metadata, kv_cache_num_blocks)
"""
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
# Build query start locations
q_start_cpu = torch.tensor(
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
dtype=torch.int32,
)
q_start_gpu = q_start_cpu.to(device)
# Build sequence lengths
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
seq_lens_gpu = seq_lens_cpu.to(device)
# Build num_computed_tokens (context length for each request)
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
# Build block table
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
max_num_blocks = max(num_blocks_per_req)
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
current_block = 0
for i, num_blocks in enumerate(num_blocks_per_req):
for j in range(num_blocks):
block_table_cpu[i, j] = current_block
current_block += 1
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
# Build slot mapping
slot_mapping_list = []
for i, (q_len, kv_len, num_blocks) in enumerate(
zip(q_lens, kv_lens, num_blocks_per_req)
):
context_len = kv_len - q_len
for j in range(q_len):
token_kv_idx = context_len + j
block_idx = token_kv_idx // block_size
offset_in_block = token_kv_idx % block_size
global_block_id = block_table_cpu[i, block_idx]
slot_id = global_block_id * block_size + offset_in_block
slot_mapping_list.append(slot_id)
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
# Create CommonAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
common_attn_metadata = CommonAttentionMetadata(
num_reqs=len(requests),
max_query_len=max(q_lens),
max_seq_len=max_kv,
num_actual_tokens=total_q,
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
block_table_tensor=block_table_gpu,
dcp_local_seq_lens=None,
)
# Use the production build() method
metadata = builder_instance.build(
common_prefix_len=0,
common_attn_metadata=common_attn_metadata,
fast_build=False,
)
return metadata, current_block
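# Worked example of the slot-mapping arithmetic above: with block_size=128, a
# decode request with kv_len=1024 and q_len=1 has context_len=1023, so its single
# new token has token_kv_idx=1023 -> block_idx=7, offset_in_block=127, and
# slot_id = block_table[i, 7] * 128 + 127. Because blocks are handed out
# sequentially per request, slots from different requests never collide.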
def _create_input_tensors(
total_q: int,
mla_dims: dict,
query_format: str,
device: torch.device,
dtype: torch.dtype,
):
"""
Create input tensors for both decode and prefill modes.
MLA requires different tensor formats for decode vs prefill:
- Decode: Uses kv_lora_rank (512) dimension
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
Args:
total_q: Total number of query tokens
mla_dims: MLA dimension configuration
query_format: Either "tuple" or "concat"
device: Target device
dtype: Tensor dtype
Returns:
Tuple of (decode_inputs, prefill_inputs)
- decode_inputs: Query tensor(s) for decode mode
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill
"""
if query_format == "tuple":
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
q_nope_decode = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
q_pe = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
decode_inputs = (q_nope_decode, q_pe)
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
q_nope_prefill = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"],
device=device,
dtype=dtype,
)
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
else: # concat
decode_inputs = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# For prefill with concat format
prefill_q = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# Create additional inputs needed for prefill forward
k_c_normed = torch.randn(
total_q,
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
k_pe = torch.randn(
total_q,
1, # Single head for MLA
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
k_scale = torch.ones(1, device=device, dtype=torch.float32)
output = torch.zeros(
total_q,
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
device=device,
dtype=dtype,
)
prefill_inputs = {
"q": prefill_q,
"k_c_normed": k_c_normed,
"k_pe": k_pe,
"k_scale": k_scale,
"output": output,
}
return decode_inputs, prefill_inputs
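# Shape example for the DeepSeek-V3 dims in "tuple" format: decode_inputs is
# (q_nope, q_pe) with shapes (total_q, 128, 512) and (total_q, 128, 64), while
# prefill_inputs["q"] is (total_q, 128, 192) (qk_nope_head_dim + qk_rope_head_dim)
# and prefill_inputs["output"] is (total_q, 128 * 128) = (total_q, 16384).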
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict from _get_backend_config()
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns:
Tuple of (impl, layer, builder_instance, indexer)
"""
# Get classes from backend config (already resolved by _get_backend_config)
impl_class = backend_cfg["impl_class"]
builder_class = backend_cfg["builder_class"]
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
# Create mock kv_b_proj layer for prefill mode
mock_kv_b_proj = MockKVBProj(
num_heads=mla_dims["num_q_heads"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
)
# Create indexer for sparse backends
indexer = None
if backend_cfg.get("is_sparse", False):
if index_topk is None:
index_topk = 2048 # Default topk for sparse MLA
indexer = MockIndexer(
max_num_tokens=max_num_tokens,
topk_tokens=index_topk,
device=device,
)
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": "auto",
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl
impl = impl_class(**impl_kwargs)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
impl.dcp_world_size = 1
impl.dcp_rank = 0
# Create KV cache spec for MockLayer
from vllm.v1.kv_cache_interface import FullAttentionSpec
kv_cache_spec = FullAttentionSpec(
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
num_kv_heads=1, # MLA uses 1 KV head
head_size=576, # MLA head dim
dtype=torch.bfloat16,
)
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
# Create builder instance if needed
builder_instance = None
if builder_class:
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
builder_instance = builder_class(
kv_cache_spec=kv_cache_spec,
layer_names=["placeholder"],
vllm_config=vllm_config,
device=device,
)
return impl, layer, builder_instance, indexer
# ============================================================================
# Config Helpers
# ============================================================================
def _extract_mla_dims_from_config(config) -> dict | None:
"""
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
Args:
config: BenchmarkConfig instance
Returns:
Dict with MLA dimensions if all fields are provided, None otherwise
"""
# Check if all MLA-specific fields are provided
if all(
[
config.kv_lora_rank is not None,
config.qk_nope_head_dim is not None,
config.qk_rope_head_dim is not None,
config.v_head_dim is not None,
]
):
return {
"kv_lora_rank": config.kv_lora_rank,
"qk_nope_head_dim": config.qk_nope_head_dim,
"qk_rope_head_dim": config.qk_rope_head_dim,
"v_head_dim": config.v_head_dim,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
# Fallback: if MLA fields not fully specified, try to construct from basic fields
elif config.head_dim == 576:
# This looks like a DeepSeek MLA config, use standard dimensions with custom
# head count
return {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
return None
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
indexer=None,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
Args:
config: BenchmarkConfig instance
impl: Backend implementation instance
layer: MockLayer instance
builder_instance: Metadata builder instance
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
indexer: Optional MockIndexer for sparse backends
Returns:
BenchmarkResult with timing statistics
"""
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
# Build metadata
metadata, num_blocks = _build_attention_metadata(
requests, block_size, device, builder_instance
)
# Create KV cache
kv_cache = torch.zeros(
num_blocks,
block_size,
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=torch.bfloat16,
)
# Create input tensors for both decode and prefill modes
decode_inputs, prefill_inputs = _create_input_tensors(
total_q,
mla_dims,
backend_cfg["query_format"],
device,
torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else:
raise RuntimeError("Metadata has neither decode nor prefill metadata")
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(config.num_layers):
forward_fn()
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
mean_time = float(np.mean(times))
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=float(np.std(times)),
min_time=float(np.min(times)),
max_time=float(np.max(times)),
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
)
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: FLASH_ATTN_MLA, FLASHMLA, FLASHINFER_MLA, CUTLASS_MLA,
FLASHINFER_MLA_SPARSE, FLASHMLA_SPARSE
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
Args:
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
List of BenchmarkResult objects
"""
if not configs_with_params:
return []
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
block_size = backend_cfg["block_size"] or config_block_size
# Extract MLA dimensions from the first config
first_config = configs_with_params[0][0]
mla_dims = _extract_mla_dims_from_config(first_config)
# If config didn't provide MLA dims, fall back to default model
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
mla_dims,
vllm_config,
device,
index_topk=index_topk if is_sparse else None,
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
original_threshold = None
if threshold is not None and builder_instance:
original_threshold = builder_instance.reorder_batch_threshold
builder_instance.reorder_batch_threshold = threshold
# Set num_splits for CUTLASS
original_num_splits = None
if num_splits is not None and hasattr(impl, "_num_kv_splits"):
original_num_splits = impl._num_kv_splits
impl._num_kv_splits = num_splits
try:
result = _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg,
mla_dims,
device,
indexer=indexer,
)
results.append(result)
finally:
# Restore original threshold
if original_threshold is not None:
builder_instance.reorder_batch_threshold = original_threshold
# Restore original num_splits
if original_num_splits is not None:
impl._num_kv_splits = original_num_splits
return results
# ============================================================================
# Public API
# ============================================================================
def run_mla_benchmark(
backend: str,
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: FLASH_ATTN_MLA, FLASHMLA, FLASHINFER_MLA, CUTLASS_MLA,
FLASHINFER_MLA_SPARSE, FLASHMLA_SPARSE
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name matching AttentionBackendEnum (e.g. FLASH_ATTN_MLA,
FLASHMLA, FLASHINFER_MLA, CUTLASS_MLA, FLASHINFER_MLA_SPARSE, FLASHMLA_SPARSE)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
"""
# Normalize to batched mode: (config, threshold, num_splits)
if isinstance(config, list):
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla, flashinfer_mla, or sparse backends
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
configs_with_params = [(cfg, None, None) for cfg in config]
return_single = False
else:
# Single config: convert to batched format
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
# Return single result or list based on input
return results[0] if return_single else results
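# Hypothetical single-config invocation (a sketch; BenchmarkConfig lives in
# common.py and is assumed here to accept at least the fields this module reads):
#
#     cfg = BenchmarkConfig(backend="FLASHMLA", batch_spec="32q1s4k",
#                           num_layers=1, block_size=128, device="cuda:0",
#                           warmup_iters=3, repeats=10)
#     result = run_mla_benchmark("FLASHMLA", cfg, reorder_batch_threshold=1)
#     print(result.mean_time, result.throughput_tokens_per_sec)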

View File

@@ -1,539 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import logging
import types
from contextlib import contextmanager
import numpy as np
import torch
from batch_spec import parse_batch_spec, reorder_for_flashinfer
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
from vllm.config import (
CacheConfig,
CompilationConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
get_kv_cache_layout,
set_kv_cache_layout,
)
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
# Backend Configuration
# ============================================================================
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
Returns:
Dict with backend_class
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. Valid backends: {valid_backends}"
) from e
return {"backend_class": backend_class}
@contextmanager
def log_warnings_and_errors_only():
"""Temporarily set vLLM logger to WARNING level."""
logger = logging.getLogger("vllm")
old_level = logger.level
logger.setLevel(logging.WARNING)
try:
yield
finally:
logger.setLevel(old_level)
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_common_attn_metadata(
q_lens: list[int],
kv_lens: list[int],
block_size: int,
device: torch.device,
) -> CommonAttentionMetadata:
"""Build CommonAttentionMetadata from query/kv lengths."""
batch_size = len(q_lens)
total_tokens = sum(q_lens)
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
0
)
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
max_seq_len = int(seq_lens.max().item())
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
block_table_tensor = torch.arange(
num_blocks, dtype=torch.int32, device=device
).view(batch_size, max_blocks)
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
max_query_len = max(q_lens)
return CommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
causal=True,
)
def _create_vllm_config(
config: BenchmarkConfig,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
model_config = ModelConfig(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype="auto", # Use model's native dtype
seed=0,
max_model_len=1024,
)
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
parallel_config = ParallelConfig(tensor_parallel_size=1)
scheduler_config = SchedulerConfig(
max_num_seqs=256,
max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
device_config = DeviceConfig()
load_config = LoadConfig()
compilation_config = CompilationConfig()
# Add mock methods for benchmark config values
model_config.get_num_layers = types.MethodType(
lambda self: config.num_layers, model_config
)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, i: 0.0, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, i: 1.0 / config.head_dim**0.5, model_config
)
model_config.get_num_attention_heads = types.MethodType(
lambda self, parallel_config=None: config.num_q_heads, model_config
)
model_config.get_num_kv_heads = types.MethodType(
lambda self, parallel_config=None: config.num_kv_heads, model_config
)
model_config.get_head_size = types.MethodType(
lambda self: config.head_dim, model_config
)
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
load_config=load_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
dtype: torch.dtype,
):
"""Create backend implementation instance."""
backend_class = backend_cfg["backend_class"]
scale = get_attention_scale(config.head_dim)
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
head_size=config.head_dim,
scale=scale,
num_kv_heads=config.num_kv_heads,
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer
def _create_metadata_builder(
backend_class,
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
backend_name: str = "",
):
"""Create metadata builder instance."""
layer_names = ["layer_0"]
builder_cls = backend_class.get_builder_cls()
# Flashinfer needs get_per_layer_parameters mocked since we don't have
# real model layers registered
if backend_name == "FLASHINFER":
import unittest.mock
from vllm.v1.attention.backends.utils import PerLayerParameters
def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
head_size = vllm_config.model_config.get_head_size()
return {
layer_name: PerLayerParameters(
window_left=-1, # No sliding window
logits_soft_cap=0.0, # No soft cap
sm_scale=1.0 / (head_size**0.5), # Standard scale
)
for layer_name in layer_names
}
with unittest.mock.patch(
"vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
mock_get_per_layer_parameters,
):
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
# ============================================================================
# Tensor Creation Helpers
# ============================================================================
def _create_input_tensors(
config: BenchmarkConfig,
total_q: int,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Create Q, K, V input tensors for all layers."""
q_list = [
torch.randn(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
k_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
v_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
return q_list, k_list, v_list
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
backend_class,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers using the backend's methods.
Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
to create the cache with the correct shape and memory layout.
"""
# Get the logical shape from the backend
cache_shape = backend_class.get_kv_cache_shape(
num_blocks=max_num_blocks,
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
)
# Get the stride order for custom memory layout
try:
stride_order = backend_class.get_kv_cache_stride_order()
assert len(stride_order) == len(cache_shape)
except (AttributeError, NotImplementedError):
stride_order = tuple(range(len(cache_shape)))
# Permute shape to physical layout order
physical_shape = tuple(cache_shape[i] for i in stride_order)
# Compute inverse permutation to get back to logical view
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
cache_list = []
for _ in range(config.num_layers):
# Allocate in physical layout order (contiguous in memory)
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
# Permute to logical view
cache = cache.permute(*inv_order)
cache_list.append(cache)
return cache_list
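# Worked example of the layout handling above: if the backend reports
# cache_shape = (2, num_blocks, block_size, num_kv_heads, head_dim) and
# stride_order = (0, 1, 3, 2, 4), memory is allocated contiguously as
# (2, num_blocks, num_kv_heads, block_size, head_dim); inv_order is then
# [0, 1, 3, 2, 4], so the permuted view exposes the logical shape while the
# underlying storage follows the backend's preferred (e.g. HND) layout.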
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config: BenchmarkConfig,
impl,
layer,
q_list: list,
k_list: list,
v_list: list,
cache_list: list,
attn_metadata,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Run single benchmark iteration with warmup and timing loop."""
total_q = q_list[0].shape[0]
out = torch.empty(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
# Warmup
for _ in range(config.warmup_iters):
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
# ============================================================================
# Public API
# ============================================================================
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
Args:
config: Benchmark configuration
Returns:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)
requests = parse_batch_spec(config.batch_spec)
if config.backend == "FLASHINFER":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
batch_size = len(q_lens)
# Calculate total blocks needed: batch_size * max_blocks_per_request
max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
max_num_blocks = batch_size * max_blocks_per_request
# Suppress vLLM logs during setup to reduce spam
with log_warnings_and_errors_only():
# Create vllm_config first - uses model's native dtype via "auto"
vllm_config = _create_vllm_config(config, max_num_blocks)
dtype = vllm_config.model_config.dtype
# Wrap everything in set_current_vllm_config context
# This is required for backends like flashinfer that need global config
with set_current_vllm_config(vllm_config):
backend_class, impl, layer = _create_backend_impl(
backend_cfg, config, device, dtype
)
# Set KV cache layout if the backend requires a specific one
# (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
required_layout = backend_class.get_required_kv_cache_layout()
if required_layout is not None:
set_kv_cache_layout(required_layout)
get_kv_cache_layout.cache_clear()
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device, config.backend
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(
config, total_q, device, dtype
)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_class, device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=np.std(times),
min_time=np.min(times),
max_time=np.max(times),
throughput_tokens_per_sec=throughput,
memory_allocated_mb=mem_stats.get("allocated_mb"),
memory_reserved_mb=mem_stats.get("reserved_mb"),
)
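# Hypothetical invocation (a sketch; BenchmarkConfig is defined in common.py and
# the field names below mirror the attributes this module reads from it):
#
#     cfg = BenchmarkConfig(backend="FLASH_ATTN", batch_spec="2q2k_8q1s1k",
#                           num_layers=32, num_q_heads=32, num_kv_heads=8,
#                           head_dim=128, block_size=16, device="cuda:0",
#                           warmup_iters=3, repeats=5, profile_memory=False)
#     result = run_attention_benchmark(cfg)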

View File

@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf "$LOG_FOLDER"
rm -rf "$PROFILE_PATH"
mkdir -p "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
cd "$BASE/vllm"
@@ -114,7 +114,7 @@ start_server() {
# wait for 10 minutes...
server_started=0
for _ in {1..60}; do
for i in {1..60}; do
# This line checks whether the server is still alive or not,
# since we should always have permission to send a signal to the server process.
kill -0 $server_pid 2> /dev/null || break
@@ -145,12 +145,12 @@ run_benchmark() {
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f "$vllm_log"
rm -f $vllm_log
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -168,15 +168,15 @@ run_benchmark() {
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
@@ -195,20 +195,20 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument
start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
@@ -274,7 +274,7 @@ fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done
done
echo "finish permutations"
@@ -285,7 +285,7 @@ echo "finish permutations"
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate "$best_request_rate" \
--request-rate $best_request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \

View File

@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")

View File

@@ -1,471 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing Triton vs PyTorch sort-based top-k/top-p implementations.
Compares:
- apply_top_k_top_p_triton (Triton binary search)
- apply_top_k_top_p (PyTorch sort-based)
Scenarios:
- top_k only (whole batch, partial batch)
- top_p only (whole batch, partial batch)
- mix of top_k and top_p
"""
import argparse
import gc
from dataclasses import dataclass
import torch
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch
from vllm.v1.sample.ops.topk_topp_triton import (
apply_top_k_top_p_triton,
reset_buffer_cache,
)
@dataclass
class BenchmarkConfig:
"""Configuration for a benchmark run."""
name: str
batch_size: int
vocab_size: int
# k and p can be tensors or None
k_values: torch.Tensor | None # [batch_size] or None
p_values: torch.Tensor | None # [batch_size] or None
description: str
ops_pct: float = 0.0 # Percentage of ops relative to batch size
def calculate_ops_pct(
k_values: torch.Tensor | None,
p_values: torch.Tensor | None,
vocab_size: int,
batch_size: int,
) -> float:
"""
Calculate the percentage of active top-k and top-p operations.
Returns percentage where 100% = batch_size ops.
E.g., if all rows have both top-k and top-p active, returns 200%.
"""
active_ops = 0
if k_values is not None:
# Count rows where k < vocab_size (active top-k filtering)
active_ops += (k_values < vocab_size).sum().item()
if p_values is not None:
# Count rows where p < 1.0 (active top-p filtering)
active_ops += (p_values < 1.0).sum().item()
return (active_ops / batch_size) * 100 if batch_size > 0 else 0.0
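# Worked example (values are illustrative, not one of the benchmark configs below):
# batch_size=4, k_values=[50, 50, vocab, vocab], p_values=[0.9, 1.0, 0.9, 1.0]
# -> 2 active top-k rows + 2 active top-p rows = 4 ops, i.e. 4 / 4 * 100 = 100%.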
def create_logits(
batch_size: int, vocab_size: int, device: str = "cuda"
) -> torch.Tensor:
"""Create random logits mimicking a realistic LLM distribution.
Uses a Zipf-like probability distribution (rank^-1.1) converted to logits
via log, then randomly permuted per row. This produces a peaked distribution
where a small number of tokens capture most probability mass, similar to
real model outputs.
"""
# Create Zipf-like probabilities: p(rank) ~ rank^(-alpha)
ranks = torch.arange(1, vocab_size + 1, dtype=torch.float32, device=device)
probs = ranks.pow(-1.1)
probs = probs / probs.sum()
# Convert to logits (log-probabilities, unnormalized is fine)
base_logits = probs.log()
# Broadcast to batch and randomly permute each row
logits = base_logits.unsqueeze(0).expand(batch_size, -1).clone()
for i in range(batch_size):
logits[i] = logits[i, torch.randperm(vocab_size, device=device)]
return logits


def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.cuda.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()


def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
gc.collect()


def benchmark_function(
func,
logits: torch.Tensor,
k: torch.Tensor | None,
p: torch.Tensor | None,
warmup_iters: int = 5,
benchmark_iters: int = 20,
) -> tuple[float, int]:
"""
Benchmark a function and return (avg_time_ms, peak_memory_bytes).
Returns average time in milliseconds and peak memory usage.
"""
# Warmup
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.cuda.synchronize()
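# Ensure all warmup kernels have finished before memory stats are reset and timing begins.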
# Reset memory stats before benchmark
reset_memory_stats()
# Benchmark
start_events = [
torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)
]
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)]
for i in range(benchmark_iters):
logits_copy = logits.clone()
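# The clone is enqueued before the start event is recorded, so the copy itself
# is excluded from the measured interval.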
start_events[i].record()
func(logits_copy, k, p)
end_events[i].record()
torch.cuda.synchronize()
# Calculate timing
times = [
start_events[i].elapsed_time(end_events[i]) for i in range(benchmark_iters)
]
avg_time = sum(times) / len(times)
# Get peak memory
_, peak_memory = measure_memory()
return avg_time, peak_memory


def create_benchmark_configs(
batch_sizes: list[int],
vocab_sizes: list[int],
device: str = "cuda",
) -> list[BenchmarkConfig]:
"""Create all benchmark configurations."""
configs = []
for vocab_size in vocab_sizes:
for batch_size in batch_sizes:
# 1. Top-k only - whole batch (all rows have k < vocab_size)
k_all = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
configs.append(
BenchmarkConfig(
name=f"topk_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_all,
p_values=None,
description=f"Top-k only (whole batch, k=50), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_all, None, vocab_size, batch_size),
)
)
# 2. Top-k only - partial batch (half have k=50, half have k=vocab_size)
k_partial = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
k_partial[batch_size // 2 :] = vocab_size # No filtering for second half
configs.append(
BenchmarkConfig(
name=f"topk_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_partial,
p_values=None,
description=f"Top-k only (partial batch, 50% k=50, 50% k=vocab), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_partial, None, vocab_size, batch_size),
)
)
# 3. Top-p only - whole batch (all rows have p < 1.0)
p_all = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
configs.append(
BenchmarkConfig(
name=f"topp_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=None,
p_values=p_all,
description=f"Top-p only (whole batch, p=0.9), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(None, p_all, vocab_size, batch_size),
)
)
# 4. Top-p only - partial batch (half have p=0.9, half have p=1.0)
p_partial = torch.full(
(batch_size,), 0.9, dtype=torch.float32, device=device
)
p_partial[batch_size // 2 :] = 1.0 # No filtering for second half
configs.append(
BenchmarkConfig(
name=f"topp_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=None,
p_values=p_partial,
description=f"Top-p only (partial batch, 50% p=0.9, 50% p=1.0), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(None, p_partial, vocab_size, batch_size),
)
)
# 5. Mix of top-k and top-p (both applied to whole batch)
k_mix = torch.full((batch_size,), 100, dtype=torch.int32, device=device)
p_mix = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
configs.append(
BenchmarkConfig(
name=f"topk_topp_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_mix,
p_values=p_mix,
description=f"Top-k + Top-p (whole batch, k=100, p=0.9), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_mix, p_mix, vocab_size, batch_size),
)
)
# 6. Mix with partial application (some rows k only, some p only, some both)
k_mixed = torch.full(
(batch_size,), vocab_size, dtype=torch.int32, device=device
)
p_mixed = torch.full((batch_size,), 1.0, dtype=torch.float32, device=device)
# First third: k only
third = batch_size // 3
k_mixed[:third] = 50
# Second third: p only
p_mixed[third : 2 * third] = 0.5
# Last third: both k and p
k_mixed[2 * third :] = 100
p_mixed[2 * third :] = 0.9
configs.append(
BenchmarkConfig(
name=f"mixed_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_mixed,
p_values=p_mixed,
description=f"Mixed partial (1/3 k=50, 1/3 p=0.9, 1/3 both), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_mixed, p_mixed, vocab_size, batch_size),
)
)
return configs


def format_memory(bytes_val: int) -> str:
"""Format memory in human-readable form."""
if bytes_val >= 1024**3:
return f"{bytes_val / (1024**3):.2f} GB"
elif bytes_val >= 1024**2:
return f"{bytes_val / (1024**2):.2f} MB"
elif bytes_val >= 1024:
return f"{bytes_val / 1024:.2f} KB"
return f"{bytes_val} B"
def run_benchmark(
configs: list[BenchmarkConfig],
warmup_iters: int = 5,
benchmark_iters: int = 20,
verbose: bool = True,
):
"""Run all benchmarks and print results."""
results = []
print("=" * 100)
print("Top-k/Top-p Benchmark: Triton vs PyTorch Sort-based")
print("=" * 100)
print()
for config in configs:
if verbose:
print(f"Running: {config.description}")
# Create fresh logits for this config
logits = create_logits(config.batch_size, config.vocab_size)
# Benchmark Triton
reset_memory_stats()
triton_time, triton_mem = benchmark_function(
apply_top_k_top_p_triton,
logits,
config.k_values,
config.p_values,
warmup_iters,
benchmark_iters,
)
# Benchmark PyTorch
reset_memory_stats()
pytorch_time, pytorch_mem = benchmark_function(
apply_top_k_top_p_pytorch,
logits,
config.k_values,
config.p_values,
warmup_iters,
benchmark_iters,
)
speedup = pytorch_time / triton_time if triton_time > 0 else float("inf")
mem_ratio = pytorch_mem / triton_mem if triton_mem > 0 else float("inf")
result = {
"config": config,
"triton_time_ms": triton_time,
"pytorch_time_ms": pytorch_time,
"triton_mem": triton_mem,
"pytorch_mem": pytorch_mem,
"speedup": speedup,
"mem_ratio": mem_ratio,
}
results.append(result)
if verbose:
print(f" Triton: {triton_time:.3f} ms, {format_memory(triton_mem)}")
print(f" PyTorch: {pytorch_time:.3f} ms, {format_memory(pytorch_mem)}")
print(f" Speedup: {speedup:.2f}x, Memory ratio: {mem_ratio:.2f}x")
print()
# Clean up
del logits
reset_memory_stats()
return results


def print_summary_table(results: list[dict]):
"""Print a summary table of results."""
print()
print("=" * 130)
print("SUMMARY TABLE")
print("=" * 130)
print()
# Header
header = (
f"{'Scenario':<40} {'Batch':>6} {'Vocab':>7} {'Ops%':>6} "
f"{'Triton (ms)':>12} {'PyTorch (ms)':>13} {'Speedup':>8} "
f"{'Tri Mem':>10} {'Pyt Mem':>10}"
)
print(header)
print("-" * 130)
# Group by scenario type
current_vocab = None
for result in results:
config = result["config"]
# Add separator between vocab sizes
if current_vocab != config.vocab_size:
if current_vocab is not None:
print("-" * 130)
current_vocab = config.vocab_size
scenario = config.name.split("_b")[0] # Extract scenario name
print(
f"{scenario:<40} {config.batch_size:>6} {config.vocab_size:>7} "
f"{config.ops_pct:>5.0f}% "
f"{result['triton_time_ms']:>12.3f} {result['pytorch_time_ms']:>13.3f} "
f"{result['speedup']:>7.2f}x "
f"{format_memory(result['triton_mem']):>10} "
f"{format_memory(result['pytorch_mem']):>10}"
)
print("=" * 130)
def main():
parser = argparse.ArgumentParser(
description="Benchmark Triton vs PyTorch sort-based top-k/top-p implementations"
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=[1, 4, 16, 64, 128, 512, 1024, 2048],
help="Batch sizes to test (default: 1 4 16 64)",
)
parser.add_argument(
"--vocab-sizes",
type=int,
nargs="+",
default=[32768, 131072], # 32k, 128k
help="Vocabulary sizes to test (default: 32768 131072)",
)
parser.add_argument(
"--warmup-iters",
type=int,
default=5,
help="Number of warmup iterations (default: 5)",
)
parser.add_argument(
"--benchmark-iters",
type=int,
default=20,
help="Number of benchmark iterations (default: 20)",
)
parser.add_argument(
"--quiet",
action="store_true",
help="Only print summary table",
)
args = parser.parse_args()
# Print configuration
print(f"Batch sizes: {args.batch_sizes}")
print(f"Vocab sizes: {args.vocab_sizes}")
print(f"Warmup iterations: {args.warmup_iters}")
print(f"Benchmark iterations: {args.benchmark_iters}")
print()
# Check CUDA
if not torch.cuda.is_available():
print("ERROR: CUDA is not available. This benchmark requires a GPU.")
return
device_name = torch.cuda.get_device_name(0)
print(f"GPU: {device_name}")
print()
# Create configs
configs = create_benchmark_configs(
args.batch_sizes,
args.vocab_sizes,
)
# Run benchmarks
results = run_benchmark(
configs,
warmup_iters=args.warmup_iters,
benchmark_iters=args.benchmark_iters,
verbose=not args.quiet,
)
# Print summary
print_summary_table(results)


if __name__ == "__main__":
main()
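# Example invocation (the script's file name is not shown in this diff, so the path
# below is illustrative):
#   python benchmark_topk_topp.py --batch-sizes 1 16 256 --vocab-sizes 32768 --quiet
# The summary table then lists Triton vs PyTorch latency, speedup, and peak memory
# per scenario, batch size, and vocab size.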

Some files were not shown because too many files have changed in this diff.