[CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032)

(cherry picked from commit 8e61425ee6)
[Frontend] Standardize use of create_error_response (#32319)
2026-01-16 21:04:48 -08:00 · 2026-01-16 11:35:10 +00:00 · 2026-01-16 10:50:00 +08:00 · 2026-01-15 18:00:21 -08:00 · 2026-01-15 17:59:58 -08:00 · 2026-01-15 17:55:20 -08:00
2513 changed files with 85704 additions and 277719 deletions
--- a/.buildkite/.pipeline_gen_v2
+++ b/.buildkite/.pipeline_gen_v2
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -1,8 +1,7 @@
 name: vllm_ci
 job_dirs:
-  - ".buildkite/image_build"
  - ".buildkite/test_areas"
-  - ".buildkite/hardware_tests"
+  - ".buildkite/image_build"
 run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -1,30 +0,0 @@
-group: Hardware - AMD Build 
-steps:
-  - label: "AMD: :docker: build image"
-    key: image-build-amd
-    depends_on: []
-    device: amd_cpu
-    no_plugin: true
-    commands:
-    - >
-      docker build
-      --build-arg max_jobs=16
-      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
-      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
-      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-      -f docker/Dockerfile.rocm
-      --target test
-      --no-cache
-      --progress plain .
-    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 1
-        - exit_status: -10  # Agent was lost
-          limit: 1
-        - exit_status: 1  # Machine occasionally fail
-          limit: 1
--- a/.buildkite/hardware_tests/ascend_npu.yaml
+++ b/.buildkite/hardware_tests/ascend_npu.yaml
@@ -1,10 +0,0 @@
-group: Hardware
-depends_on: ~
-steps:
-  - label: "Ascend NPU Test"
-    soft_fail: true
-    timeout_in_minutes: 20
-    no_plugin: true
-    device: ascend_npu
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -1,114 +0,0 @@
-group: CPU
-depends_on: []
-steps:
- label: CPU-Kernel Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - cmake/cpu_extension.cmake
-  - CMakeLists.txt
-  - vllm/_custom_ops.py
-  - tests/kernels/attention/test_cpu_attn.py
-  - tests/kernels/moe/test_cpu_fused_moe.py
-  - tests/kernels/test_onednn.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-      pytest -x -v -s tests/kernels/test_onednn.py"
-
- label: CPU-Compatibility Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - cmake/cpu_extension.cmake
-  - setup.py
-  - vllm/platforms/cpu.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
-
- label: CPU-Language Generation and Pooling Model Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - vllm/
-  - tests/models/language/generation/
-  - tests/models/language/pooling/
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
-      pytest -x -v -s tests/models/language/generation -m cpu_model
-      pytest -x -v -s tests/models/language/pooling -m cpu_model"
-
- label: CPU-Quantization Model Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - vllm/model_executor/layers/quantization/cpu_wna16.py
-  - vllm/model_executor/layers/quantization/gptq_marlin.py
-  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
-  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
-  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
-  - tests/quantization/test_compressed_tensors.py
-  - tests/quantization/test_cpu_wna16.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
-      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
-      
- label: CPU-Distributed Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/shm.cpp
-  - vllm/v1/worker/cpu_worker.py
-  - vllm/v1/worker/gpu_worker.py
-  - vllm/v1/worker/cpu_model_runner.py
-  - vllm/v1/worker/gpu_model_runner.py
-  - vllm/platforms/cpu.py
-  - vllm/distributed/parallel_state.py
-  - vllm/distributed/device_communicators/cpu_communicator.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
-
- label: CPU-Multi-Modal Model Tests %N
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  # - vllm/
-  - vllm/model_executor/layers/rotary_embedding
-  - tests/models/multimodal/generation/
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
-      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
-  parallelism: 2
-
- label: "Arm CPU Test"
-  depends_on: []
-  soft_fail: true
-  device: arm_cpu
-  no_plugin: true
-  commands: 
-  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/gh200.yaml
+++ b/.buildkite/hardware_tests/gh200.yaml
@@ -1,10 +0,0 @@
-group: Hardware
-steps:
-  - label: "GH200 Test"
-    soft_fail: true
-    device: gh200
-    no_plugin: true
-    optional: true
-    commands: 
-    - nvidia-smi 
-    - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -1,17 +0,0 @@
-group: Hardware
-depends_on: ~
-steps:
-  - label: "Intel HPU Test"
-    soft_fail: true
-    device: intel_hpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
-
-  - label: "Intel GPU Test"
-    depends_on: []
-    soft_fail: true
-    device: intel_gpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -1,255 +1,56 @@
 #!/bin/bash
-set -euo pipefail
+set -e

-# replace invalid characters in Docker image tags and truncate to 128 chars
-clean_docker_tag() {
-    local input="$1"
-    echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
-}
-
-print_usage_and_exit() {
-    echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
-    exit 1
-}
-
-print_instance_info() {
-    echo ""
-    echo "=== Debug: Instance Information ==="
-    # Get IMDSv2 token
-    if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
-        AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
-        INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
-        INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
-        AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
-        echo "AMI ID:        ${AMI_ID}"
-        echo "Instance Type: ${INSTANCE_TYPE}"
-        echo "Instance ID:   ${INSTANCE_ID}"
-        echo "AZ:            ${AZ}"
-    else
-        echo "Not running on EC2 or IMDS not available"
-    fi
-    # Check for warm cache AMI (marker file baked into custom AMI)
-    if [[ -f /etc/vllm-ami-info ]]; then
-        echo "Cache:         warm (custom vLLM AMI)"
-        cat /etc/vllm-ami-info
-    else
-        echo "Cache:         cold (standard AMI)"
-    fi
-    echo "==================================="
-    echo ""
-}
-
-setup_buildx_builder() {
-    echo "--- :buildkite: Setting up buildx builder"
-    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
-        # Custom AMI with standalone buildkitd - use remote driver for warm cache
-        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
-        echo "Using remote driver to connect to buildkitd (warm cache available)"
-        if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
-            echo "Using existing baked-vllm-builder"
-            docker buildx use baked-vllm-builder
-        else
-            echo "Creating baked-vllm-builder with remote driver"
-            docker buildx create \
-                --name baked-vllm-builder \
-                --driver remote \
-                --use \
-                "unix://${BUILDKIT_SOCKET}"
-        fi
-        docker buildx inspect --bootstrap
-    elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
-        # Existing builder available
-        echo "Using existing builder: ${BUILDER_NAME}"
-        docker buildx use "${BUILDER_NAME}"
-        docker buildx inspect --bootstrap
-    else
-        # No local buildkitd, no existing builder - create new docker-container builder
-        echo "No local buildkitd found, using docker-container driver"
-        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
-        docker buildx inspect --bootstrap
-    fi
-
-    # builder info
-    echo "Active builder:"
-    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
-}
-
-check_and_skip_if_image_exists() {
-    if [[ -n "${IMAGE_TAG:-}" ]]; then
-        echo "--- :mag: Checking if image exists"
-        if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
-            echo "Image already exists: ${IMAGE_TAG}"
-            echo "Skipping build"
-            exit 0
-        fi
-        echo "Image not found, proceeding with build"
-    fi
-}
-
-ecr_login() {
-    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
-}
-
-prepare_cache_tags() {
-    # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
-    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
-    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
-
-    if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-            cache="${MAIN_CACHE_ECR}:latest"
-        else
-            clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
-            cache="${TEST_CACHE_ECR}:${clean_branch}"
-        fi
-        CACHE_TO="$cache"
-        CACHE_FROM="$cache"
-        CACHE_FROM_BASE_BRANCH="$cache"
-    else
-        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
-        CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
-        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
-            CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
-        else
-            clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
-            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
-        fi
-    fi
-
-    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
-    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
-}
-
-resolve_parent_commit() {
-    if [[ -z "${PARENT_COMMIT:-}" ]]; then
-        PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
-        if [[ -n "${PARENT_COMMIT}" ]]; then
-            echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
-            export PARENT_COMMIT
-        else
-            echo "Could not determine parent commit (may be first commit in repo)"
-        fi
-    else
-        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
-    fi
-}
-
-print_bake_config() {
-    echo "--- :page_facing_up: Resolved bake configuration"
-    # Write to a temp directory to avoid polluting the repo root (which is the
-    # Docker build context). Files left in the repo root get COPY'd into the
-    # image and can cause duplicate artifact uploads from downstream steps.
-    local bake_tmp
-    bake_tmp="$(mktemp -d)"
-    BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
-    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
-    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
-    echo "--- :arrow_down: Uploading bake config to Buildkite"
-    (cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
-}
-
-#################################
-#         Main Script           #
-#################################
-print_instance_info
-
-if [[ $# -lt 5 ]]; then
-    print_usage_and_exit
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+  exit 1
 fi

-# input args
 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
-IMAGE_TAG=$5
-IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8

-# build config
-TARGET="test-ci"
-VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
-BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
-CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
-CI_HCL_PATH="/tmp/ci.hcl"
-BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

-prepare_cache_tags
-ecr_login
+# docker buildx 
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls

-# Environment info (for docs and human readers)
-#   VLLM_CI_BRANCH      - ci-infra branch to use (default: main)
-#   VLLM_BAKE_FILE_PATH      - Path to vLLM's bake file (default: docker/docker-bake.hcl)
-#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
-#
-# Build configuration (exported as environment variables for bake):
-export BUILDKITE_COMMIT
-export PARENT_COMMIT
-export IMAGE_TAG
-export IMAGE_TAG_LATEST
-export CACHE_FROM
-export CACHE_FROM_BASE_BRANCH
-export CACHE_FROM_MAIN
-export CACHE_TO
-
-# print args
-echo "--- :mag: Arguments"
-echo "REGISTRY: ${REGISTRY}"
-echo "REPO: ${REPO}"
-echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
-echo "BRANCH: ${BRANCH}"
-echo "IMAGE_TAG: ${IMAGE_TAG}"
-echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
-
-# print build configuration
-echo "--- :mag: Build configuration"
-echo "TARGET: ${TARGET}"
-echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
-echo "BUILDER_NAME: ${BUILDER_NAME}"
-echo "CI_HCL_URL: ${CI_HCL_URL}"
-echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
-
-echo "--- :mag: Cache tags"
-echo "CACHE_TO: ${CACHE_TO}"
-echo "CACHE_FROM: ${CACHE_FROM}"
-echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
-echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
-
-check_and_skip_if_image_exists
-
-echo "--- :docker: Setting up Docker buildx bake"
-echo "Target: ${TARGET}"
-echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
-echo "CI HCL path: ${CI_HCL_PATH}"
-
-if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
-    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
-    echo "Make sure you're running from the vLLM repository root"
-    exit 1
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
 fi

-echo "--- :arrow_down: Downloading ci.hcl"
-curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
-echo "Downloaded to ${CI_HCL_PATH}"
-
-if [[ ! -f "${CI_HCL_PATH}" ]]; then
-    echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
-    exit 1
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+  merge_base_commit_build_args=""
 fi

-setup_buildx_builder
-
-resolve_parent_commit
-export PARENT_COMMIT
-
-print_bake_config
-
-echo "--- :docker: Building ${TARGET}"
-docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
-
-echo "--- :white_check_mark: Build complete"
+# build
+docker buildx build --file docker/Dockerfile \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  ${merge_base_commit_build_args} \
+  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+  --cache-to type=registry,ref=${CACHE_TO},mode=max \
+  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+  --push \
+  --target test \
+  --progress plain .
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -3,9 +3,8 @@ steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
-    timeout_in_minutes: 600
    commands:
-    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
+    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
@@ -41,7 +40,7 @@ steps:
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
-
+  
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,11 +24,13 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --build-arg VLLM_CPU_X86=true \
-  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --build-arg VLLM_CPU_AMXBF16=true \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
  --target vllm-test \
  --progress plain .

 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,10 +24,10 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
  --target vllm-test \
  --progress plain .

 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -25,10 +25,10 @@ fi
 docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git

 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -1,15 +0,0 @@
-model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.695
-  - name: "exact_match,flexible-extract"
-    value: 0.447
-limit: 1319
-num_fewshot: 5
-max_model_len: 262144
-enforce_eager: false
-apply_chat_template: true
-fewshot_as_multiturn: true
-trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
@@ -1,19 +0,0 @@
-model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.7142
-  - name: "exact_match,flexible-extract"
-    value: 0.4579
-env_vars:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
-limit: 1319
-num_fewshot: 5
-max_model_len: 262144
-kv_cache_dtype: fp8
-enforce_eager: false
-apply_chat_template: true
-fewshot_as_multiturn: true
-trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -1,2 +1 @@
 Qwen3-235B-A22B-Instruct-2507-FP8.yaml
-NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
@@ -1,5 +0,0 @@
-Qwen2.5-1.5B-Instruct.yaml
-Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
-  --limit "$LIMIT"
+  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
@@ -20,11 +20,14 @@ usage() {
    echo
 }

-while getopts "m:l:f:t:" OPT; do
+while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
    l )
        LIMIT="$OPTARG"
        ;;
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -13,10 +13,9 @@ import os
 from contextlib import contextmanager

 import lm_eval
+import numpy as np
 import yaml

-from vllm.platforms import current_platform
-
 DEFAULT_RTOL = 0.08


@@ -64,9 +63,6 @@ def launch_lm_eval(eval_config, tp_size):
        "allow_deprecated_quantization=True,"
    )

-    if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
-        model_args += "attention_backend=TRITON_ATTN"
-
    env_vars = eval_config.get("env_vars", None)
    with scoped_env_vars(env_vars):
        results = lm_eval.simple_evaluate(
@@ -106,8 +102,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
                f"ground_truth={ground_truth:.3f} | "
                f"measured={measured_value:.3f} | rtol={rtol}"
            )
-
-            min_acceptable = ground_truth * (1 - rtol)
-            success = success and measured_value >= min_acceptable
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)

    assert success
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -83,6 +83,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
+            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -7,10 +7,8 @@ import argparse
 import html as _html
 import json
 import os
-from contextlib import nullcontext
 from dataclasses import dataclass
 from importlib import util
-from pathlib import Path

 import pandas as pd

@@ -33,45 +31,6 @@ pd.set_option("display.precision", 2)
 pd.set_option("display.float_format", lambda x: f"{x:.2f}")


-# -----------------------------
-# Concurrency normalization (NEW, small)
-# -----------------------------
-def _find_concurrency_col(df: pd.DataFrame) -> str:
-    for c in [
-        "# of max concurrency.",
-        "# of max concurrency",
-        "Max Concurrency",
-        "max_concurrency",
-        "Concurrency",
-    ]:
-        if c in df.columns:
-            return c
-
-    for c in df.columns:
-        if "concurr" in str(c).lower():
-            s = df[c]
-            if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
-                return c
-
-    raise ValueError(
-        "Cannot infer concurrency column. "
-        "Please rename the column to one of the known names "
-        "or add an explicit override (e.g., --concurrency-col)."
-    )
-
-
-def _normalize_concurrency_in_df(
-    df: pd.DataFrame, canonical: str = "# of max concurrency."
-) -> pd.DataFrame:
-    if canonical in df.columns:
-        return df
-    detected = _find_concurrency_col(df)
-    if detected in df.columns and detected != canonical:
-        return df.rename(columns={detected: canonical})
-    df[canonical] = pd.NA
-    return df
-
-
 # -----------------------------
 # Core data compare
 # -----------------------------
@@ -91,25 +50,19 @@ def compare_data_columns(
    - Concat along axis=1 (indexes align), then reset_index so callers can
      group by columns.
    - If --debug, add a <file_label>_name column per file.
-
-    Minimal fix to support different max_concurrency lists across files:
-      - normalize concurrency column naming to "# of max concurrency."
-      - align on UNION of keys (missing points become NaN)
-      - BUGFIX: don't drop throughput rows based on P99/Median presence
    """
    print("\ncompare_data_column:", data_column)

    frames = []
    raw_data_cols: list[str] = []
+    compare_frames = []

-    # Determine key cols after normalizing concurrency
    cols_per_file: list[set] = []
    for f in files:
        try:
            df_tmp = pd.read_json(f, orient="records")
        except Exception as err:
            raise ValueError(f"Failed to read {f}") from err
-        df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
        cols_per_file.append(set(df_tmp.columns))

    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -120,25 +73,12 @@ def compare_data_columns(
            "No common key columns found from info_cols across the input files."
        )

-    union_index = None
-    metas: list[pd.DataFrame] = []
-    staged: list[tuple[str, pd.Series, pd.Series | None]] = []
+    meta_added = False

    for file in files:
        df = pd.read_json(file, orient="records")
-        df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")

-        # BUGFIX: only drop rows for latency-like metrics; throughput rows may have
-        # NaN in P99/Median columns even if the column exists in the JSON.
-        metric_lc = str(data_column).lower()
-        is_latency_metric = (
-            "ttft" in metric_lc
-            or "tpot" in metric_lc
-            or "p99" in metric_lc
-            or "median" in metric_lc
-            or metric_lc.strip() in {"p99", "median"}
-        )
-        if is_latency_metric and drop_column in df.columns:
+        if drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

        for c in (
@@ -163,61 +103,35 @@ def compare_data_columns(
            meta = meta.groupby(level=key_cols, dropna=False).first()

        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-
-        if data_column in df_idx.columns:
-            s = df_idx[data_column]
-            if not s.index.is_unique:
-                s = s.groupby(level=key_cols, dropna=False).mean()
-        else:
-            # keep NA series to preserve meta keys for union_index
-            s = pd.Series(pd.NA, index=meta.index)
+        s = df_idx[data_column]
+        if not s.index.is_unique:
+            s = s.groupby(level=key_cols, dropna=False).mean()
        s.name = file_label

-        name_s = None
+        if not meta_added:
+            frames.append(meta)
+            meta_added = True
+
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
+            frames.append(name_s)

-        if union_index is None:
-            union_index = meta.index
-        else:
-            union_index = union_index.union(meta.index)
-        metas.append(meta)
-
-        staged.append((file_label, s, name_s))
-
-    if union_index is None:
-        raise ValueError("No data found after loading inputs.")
-
-    # meta first (union-aligned): build UNION meta across all files
-    if metas:
-        meta_union = pd.concat(metas, axis=0)
-        # Collapse duplicates on the MultiIndex; keep first non-null per column
-        meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
-        frames.append(meta_union.reindex(union_index))
-
-    # values + ratios (union-aligned)
-    metric_series_aligned: list[pd.Series] = []
-    for file_label, s, name_s in staged:
-        s_aligned = s.reindex(union_index)
-        frames.append(s_aligned)
+        frames.append(s)
        raw_data_cols.append(file_label)
-        metric_series_aligned.append(s_aligned)
+        compare_frames.append(s)

-        if debug and name_s is not None:
-            frames.append(name_s.reindex(union_index))
-
-        if len(metric_series_aligned) >= 2:
-            base = metric_series_aligned[0]
-            current = metric_series_aligned[-1]
-            if "P99" in str(data_column) or "Median" in str(data_column):
+        if len(compare_frames) >= 2:
+            base = compare_frames[0]
+            current = compare_frames[-1]
+            if "P99" in data_column or "Median" in data_column:
                ratio = base / current
            else:
                ratio = current / base
            ratio = ratio.mask(base == 0)
-            ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
+            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)

    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
@@ -288,10 +202,24 @@ def split_json_by_tp_pp(
 # -----------------------------
 # Styling helpers
 # -----------------------------
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
 def _highlight_threshold(
-    df: pd.DataFrame,
-    threshold: float,
-    slack_pct: float = 0.0,
+    df: pd.DataFrame, threshold: float
 ) -> pd.io.formats.style.Styler:
    conc_col = _find_concurrency_col(df)
    key_cols = [
@@ -304,24 +232,12 @@ def _highlight_threshold(
    ]
    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]

-    try:
-        slack_pct = float(slack_pct or 0.0)
-    except Exception:
-        slack_pct = 0.0
-    slack_limit = threshold * (1.0 + slack_pct / 100.0)
-
-    def _cell(v):
-        if pd.isna(v):
-            return ""
-        if v <= threshold:
-            # Strict SLA
-            return "background-color:#e6ffe6;font-weight:bold;"
-        if v <= slack_limit:
-            # Within slack range
-            return "background-color:#ffe5cc;font-weight:bold;"
-        return ""
-
-    return df.style.map(_cell, subset=conf_cols)
+    return df.style.map(
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,
+    )


 def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -359,177 +275,6 @@ def _apply_two_decimals(
    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")


-# -----------------------------
-# Export helpers (Excel + CSV)
-# -----------------------------
-def _sanitize_sheet_name(name: str) -> str:
-    """
-    Excel sheet constraints:
-      - max 31 chars
-      - cannot contain: : \ / ? * [ ]
-      - cannot be empty
-
-    NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
-    module's compile overhead/edge-cases on some systems.
-    """
-    name = "sheet" if name is None else str(name)
-
-    # Replace illegal characters with underscore.
-    trans = str.maketrans(
-        {
-            ":": "_",
-            "\\": "_",
-            "/": "_",
-            "?": "_",
-            "*": "_",
-            "[": "_",
-            "]": "_",
-        }
-    )
-    name = name.translate(trans)
-
-    # Strip quotes/spaces and collapse whitespace.
-    name = name.strip().strip("'")
-    name = " ".join(name.split())
-
-    if not name:
-        name = "sheet"
-    return name[:31]
-
-
-def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
-    d = dict(zip(group_cols, gkey_tuple))
-
-    # Always keep input/output lengths (these are important).
-    ilen = d.get("Input Len", "")
-    olen = d.get("Output Len", "")
-    lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
-
-    # Shorten model name aggressively to make room for lens.
-    model = d.get("Model", "model")
-    leaf = str(model).split("/")[-1]
-
-    max_model_len = max(1, 31 - len(lens))
-    model_short = leaf[:max_model_len]
-
-    return _sanitize_sheet_name(f"{model_short}{lens}")
-
-
-def _write_tables_to_excel_sheet(
-    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
-):
-    """Write all blocks to a sheet with a single to_excel() call.
-
-    Pandas+openpyxl can be extremely slow when called many times per sheet.
-    We flatten blocks into one table with a 'Section' column to keep structure
-    while making Excel generation fast and deterministic.
-    """
-    if not blocks:
-        pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
-        return
-
-    combined_parts: list[pd.DataFrame] = []
-    for title, df in blocks:
-        df2 = df.copy()
-        # Put the section label as the first column for readability.
-        df2.insert(0, "Section", title)
-        combined_parts.append(df2)
-
-    combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
-    combined.to_excel(writer, sheet_name=sheet, index=False)
-
-
-def _safe_filename(s: str) -> str:
-    # Fast path without the third-party `regex` module.
-    s = " ".join(str(s).strip().split())
-    allowed = []
-    for ch in s:
-        if ch.isalnum() or ch in "._-":
-            allowed.append(ch)
-        else:
-            allowed.append("_")
-    out = "".join(allowed)
-    return out[:180] if len(out) > 180 else out
-
-
-# -----------------------------
-# vLLM environment export helper
-# -----------------------------
-def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
-    """Parse vllm_env.txt into a flat table (Section, Key, Value).
-
-    Supports:
-      - section headers as standalone lines (no ':' or '=')
-      - key-value lines like 'OS: Ubuntu ...'
-      - env var lines like 'HF_HOME=/data/hf'
-    """
-    lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
-    section = "General"
-    rows: list[dict] = []
-
-    def set_section(s: str):
-        nonlocal section
-        s = (s or "").strip()
-        if s:
-            section = s
-
-    for raw in lines:
-        stripped = raw.strip()
-        if not stripped:
-            continue
-        # divider lines like =====
-        if set(stripped) <= {"="}:
-            continue
-
-        # section header heuristic: short standalone line
-        if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
-            if stripped.lower().startswith("collecting environment information"):
-                continue
-            set_section(stripped)
-            continue
-
-        # env var style: KEY=VALUE (and not a URL with :)
-        if "=" in stripped and ":" not in stripped:
-            k, v = stripped.split("=", 1)
-            k = k.strip()
-            v = v.strip()
-            if k:
-                rows.append({"Section": section, "Key": k, "Value": v})
-            continue
-
-        # key: value
-        if ":" in stripped:
-            k, v = stripped.split(":", 1)
-            k = k.strip()
-            v = v.strip()
-            if k:
-                rows.append({"Section": section, "Key": k, "Value": v})
-            continue
-
-    return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
-
-
-def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
-    """Load vllm_env.txt next to the *original* input JSON file.
-
-    Note: when only one -f is provided, the script may split JSON into ./splits/...,
-    but vllm_env.txt typically lives next to the original benchmark_results.json.
-    """
-    base_dir: Path | None = None
-    if getattr(args, "file", None):
-        base_dir = Path(args.file[0]).resolve().parent
-    elif files:
-        base_dir = Path(files[0]).resolve().parent
-    if base_dir is None:
-        return None
-
-    env_path = base_dir / "vllm_env.txt"
-    if not env_path.exists():
-        return None
-    df = _parse_vllm_env_txt(env_path)
-    return df
-
-
 # -----------------------------
 # Valid max concurrency summary helpers
 # -----------------------------
@@ -556,11 +301,7 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:


 def _max_concurrency_ok(
-    df: pd.DataFrame,
-    conc_col: str,
-    cfg_col: str,
-    threshold: float,
-    slack_pct: float = 0.0,
+    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
 ):
    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
        return pd.NA
@@ -573,14 +314,7 @@ def _max_concurrency_ok(
    if d.empty:
        return pd.NA

-    # Accept values up to (1 + slack_pct%) above the SLA.
-    try:
-        slack_pct = float(slack_pct or 0.0)
-    except Exception:
-        slack_pct = 0.0
-    effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
-
-    ok = d[d[cfg_col] <= effective_limit]
+    ok = d[d[cfg_col] <= threshold]
    if ok.empty:
        return pd.NA

@@ -646,25 +380,15 @@ def build_valid_max_concurrency_summary_html(
    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

-    # Display SLA ranges in the table header (SLA .. SLA*(1+slack))
-    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
-    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
-    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
-    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
-
    rows = []
    for cfg in cfg_cols:
        ttft_max = (
-            _max_concurrency_ok(
-                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
-            )
+            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
-            _max_concurrency_ok(
-                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
-            )
+            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
            if tpot_group_df is not None
            else pd.NA
        )
@@ -693,8 +417,8 @@ def build_valid_max_concurrency_summary_html(
        rows.append(
            {
                "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
+                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
@@ -704,6 +428,7 @@ def build_valid_max_concurrency_summary_html(

    summary_df = pd.DataFrame(rows)

+    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
    for c in summary_df.columns:
        if c == "Configuration":
            continue
@@ -711,10 +436,12 @@ def build_valid_max_concurrency_summary_html(

    both_col = f"Max {conc_col} (Both)"

+    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
    formatters = {}
    for c in summary_df.columns:
        if c == "Configuration":
            continue
+        # same formatter for every column: blank for NaN, else two decimals
        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"

    styler = summary_df.style.format(formatters)
@@ -733,104 +460,6 @@ def build_valid_max_concurrency_summary_html(
    return title + styler.to_html(table_attributes='border="1" class="dataframe"')


-def build_valid_max_concurrency_summary_df(
-    tput_group_df: pd.DataFrame | None,
-    ttft_group_df: pd.DataFrame | None,
-    tpot_group_df: pd.DataFrame | None,
-    conc_col: str,
-    args,
-) -> pd.DataFrame | None:
-    if ttft_group_df is None and tpot_group_df is None:
-        return None
-
-    ttft_cols = (
-        _config_value_columns(ttft_group_df, conc_col)
-        if ttft_group_df is not None
-        else []
-    )
-    tpot_cols = (
-        _config_value_columns(tpot_group_df, conc_col)
-        if tpot_group_df is not None
-        else []
-    )
-    tput_cols = (
-        _config_value_columns(tput_group_df, conc_col)
-        if tput_group_df is not None
-        else []
-    )
-
-    if ttft_group_df is not None and tpot_group_df is not None:
-        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
-        if tput_group_df is not None:
-            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
-    else:
-        cfg_cols = ttft_cols or tpot_cols
-
-    if not cfg_cols:
-        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
-
-    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
-    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
-    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
-    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
-
-    rows = []
-    for cfg in cfg_cols:
-        ttft_max = (
-            _max_concurrency_ok(
-                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
-            )
-            if ttft_group_df is not None
-            else pd.NA
-        )
-        tpot_max = (
-            _max_concurrency_ok(
-                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
-            )
-            if tpot_group_df is not None
-            else pd.NA
-        )
-        both = (
-            pd.NA
-            if (pd.isna(ttft_max) or pd.isna(tpot_max))
-            else min(ttft_max, tpot_max)
-        )
-
-        tput_at_both = (
-            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
-            if tput_group_df is not None
-            else pd.NA
-        )
-        ttft_at_both = (
-            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
-            if ttft_group_df is not None
-            else pd.NA
-        )
-        tpot_at_both = (
-            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
-            if tpot_group_df is not None
-            else pd.NA
-        )
-
-        rows.append(
-            {
-                "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
-                f"Max {conc_col} (Both)": both,
-                "Output Tput @ Both (tok/s)": tput_at_both,
-                "TTFT @ Both (ms)": ttft_at_both,
-                "TPOT @ Both (ms)": tpot_at_both,
-            }
-        )
-
-    df = pd.DataFrame(rows)
-    for c in df.columns:
-        if c != "Configuration":
-            df[c] = pd.to_numeric(df[c], errors="coerce")
-    return df
-
-
 # -----------------------------
 # Plot helper
 # -----------------------------
@@ -908,35 +537,6 @@ def build_parser() -> argparse.ArgumentParser:
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )
-
-    # ---- SLA tolerance (slack) options ----
-    parser.add_argument(
-        "--ttft-slack-pct",
-        type=float,
-        default=5.0,
-        help="Allowed percentage above TTFT SLA (default: 5).",
-    )
-    parser.add_argument(
-        "--tpot-slack-pct",
-        type=float,
-        default=5.0,
-        help="Allowed percentage above TPOT SLA (default: 5).",
-    )
-
-    # ---- export options ----
-    parser.add_argument(
-        "--excel-out",
-        type=str,
-        default="perf_comparison.xlsx",
-        help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
-    )
-    parser.add_argument(
-        "--csv-out-dir",
-        type=str,
-        default="",
-        help="If set, write per-group per-metric CSVs into this directory.",
-    )
-
    return parser


@@ -1015,13 +615,9 @@ def render_metric_table_html(

    metric_name = metric_label.lower()
    if "ttft" in metric_name:
-        styler = _highlight_threshold(
-            display_group, args.ttft_max_ms, args.ttft_slack_pct
-        )
+        styler = _highlight_threshold(display_group, args.ttft_max_ms)
    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
-        styler = _highlight_threshold(
-            display_group, args.tpot_max_ms, args.tpot_slack_pct
-        )
+        styler = _highlight_threshold(display_group, args.tpot_max_ms)
    else:
        styler = display_group.style

@@ -1061,6 +657,7 @@ def maybe_write_plot(
        markers=True,
    )

+    # Ensure plot hover + y tick labels are also 2 decimals.
    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
    fig.update_yaxes(tickformat=".2f")

@@ -1133,186 +730,87 @@ def write_report_group_first(
        for metric_label, (df, _) in metric_cache.items()
    }

-    csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
-    if csv_dir:
-        csv_dir.mkdir(parents=True, exist_ok=True)
+    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+        main_fh.write('<meta charset="utf-8">\n')
+        for gkey in group_keys:
+            gkey_tuple = normalize_group_key(gkey)
+            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+            sub_path = group_filename(gkey_tuple)
+            group_header = (
+                '<div style="font-size: 1.4em; font-weight: 700; '
+                'margin: 18px 0 10px 0;">'
+                f"{_html.escape(suffix)}"
+                "</div>\n"
+            )

-    excel_path = args.excel_out or "perf_comparison.xlsx"
-    disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
+            main_fh.write(group_header)
+            with open(sub_path, "w", encoding="utf-8") as sub_fh:
+                sub_fh.write('<meta charset="utf-8">\n')
+                sub_fh.write(group_header)
+                tput_group_df = None
+                ttft_group_df = None
+                tpot_group_df = None
+                conc_col = args.xaxis

-    # Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
-    excel_engine = (
-        os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
-    )
-    if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
-        excel_engine = "openpyxl"
+                for metric_label in plan.data_cols:
+                    gb = metric_groupbys[metric_label]
+                    df_sorted, raw_data_cols = metric_cache[metric_label]

-    excel_engine_kwargs = {}
-    if excel_engine == "xlsxwriter":
-        # Reduce memory pressure & usually faster writes.
-        excel_engine_kwargs = {"options": {"constant_memory": True}}
+                    try:
+                        group_df = gb.get_group(gkey)
+                    except KeyError:
+                        missing = (
+                            '<div style="font-size: 1.1em; font-weight: 600; '
+                            'margin: 10px 0;">'
+                            f"{_html.escape(metric_label)} — missing for this group"
+                            "</div>\n"
+                        )

-    xw_ctx = (
-        nullcontext(None)
-        if disable_excel
-        else pd.ExcelWriter(
-            excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
-        )
-    )
-    with xw_ctx as xw:
-        used_sheets: set[str] = set()
-        # ---- Environment sheet (first) ----
-        env_sheet = _sanitize_sheet_name("Environment")
-        env_df = _load_env_df_for_inputs(args, files)
-        if xw is not None:
-            if env_df is None or env_df.empty:
-                pd.DataFrame(
-                    [
-                        {
-                            "Section": "Environment",
-                            "Key": "vllm_env.txt",
-                            "Value": "NOT FOUND (or empty)",
-                        }
-                    ]
-                ).to_excel(xw, sheet_name=env_sheet, index=False)
-            else:
-                env_df.to_excel(xw, sheet_name=env_sheet, index=False)
-            used_sheets.add(env_sheet)
-        with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
-            main_fh.write('<meta charset="utf-8">\n')
-            for gkey in group_keys:
-                gkey_tuple = normalize_group_key(gkey)
-                suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
-                sub_path = group_filename(gkey_tuple)
-                group_header = (
-                    '<div style="font-size: 1.4em; font-weight: 700; '
-                    'margin: 18px 0 10px 0;">'
-                    f"{_html.escape(suffix)}"
-                    "</div>\n"
+                        main_fh.write(missing)
+                        sub_fh.write(missing)
+                        continue
+
+                    if conc_col not in group_df.columns:
+                        conc_col = _find_concurrency_col(group_df)
+
+                    mn = metric_label.lower().strip()
+                    if "tok/s" in mn:
+                        tput_group_df = group_df
+                    elif "ttft" in mn:
+                        ttft_group_df = group_df
+                    elif mn in ("p99", "median") or "tpot" in mn:
+                        tpot_group_df = group_df
+
+                    display_group = group_df.drop(
+                        columns=group_cols_canonical, errors="ignore"
+                    )
+
+                    html = render_metric_table_html(
+                        display_group, metric_label, suffix, args
+                    )
+                    main_fh.write(html)
+                    sub_fh.write(html)
+
+                    maybe_write_plot(
+                        main_fh,
+                        sub_fh,
+                        group_df=group_df,
+                        raw_data_cols=raw_data_cols,
+                        metric_label=metric_label,
+                        y_axis_col=y_axis_col,
+                        args=args,
+                    )
+
+                summary_html = build_valid_max_concurrency_summary_html(
+                    tput_group_df=tput_group_df,
+                    ttft_group_df=ttft_group_df,
+                    tpot_group_df=tpot_group_df,
+                    conc_col=conc_col,
+                    args=args,
                )
-
-                main_fh.write(group_header)
-
-                do_excel = xw is not None
-                sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
-                sheet_base = sheet
-                if do_excel:
-                    dedup_i = 1
-                    while sheet in used_sheets:
-                        dedup_i += 1
-                        suffix = f"_{dedup_i}"
-                        # Ensure uniqueness even when sheet names are truncated.
-                        base = str(sheet_base)
-                        keep = max(1, 31 - len(suffix))
-                        sheet = _sanitize_sheet_name(base[:keep] + suffix)
-                    used_sheets.add(sheet)
-
-                excel_blocks: list[tuple[str, pd.DataFrame]] = []
-
-                with open(sub_path, "w", encoding="utf-8") as sub_fh:
-                    sub_fh.write('<meta charset="utf-8">\n')
-                    sub_fh.write(group_header)
-                    tput_group_df = None
-                    ttft_group_df = None
-                    tpot_group_df = None
-                    conc_col = args.xaxis
-
-                    for metric_label in plan.data_cols:
-                        gb = metric_groupbys[metric_label]
-                        df_sorted, raw_data_cols = metric_cache[metric_label]
-
-                        try:
-                            group_df = gb.get_group(gkey)
-                        except KeyError:
-                            missing = (
-                                '<div style="font-size: 1.1em; font-weight: 600; '
-                                'margin: 10px 0;">'
-                                f"{_html.escape(metric_label)} — missing for this group"
-                                "</div>\n"
-                            )
-                            main_fh.write(missing)
-                            sub_fh.write(missing)
-                            continue
-
-                        if conc_col not in group_df.columns:
-                            conc_col = _find_concurrency_col(group_df)
-
-                        mn = metric_label.lower().strip()
-                        if "tok/s" in mn:
-                            tput_group_df = group_df
-                        elif "ttft" in mn:
-                            ttft_group_df = group_df
-                        elif mn in ("p99", "median") or "tpot" in mn:
-                            tpot_group_df = group_df
-
-                        display_group = group_df.drop(
-                            columns=group_cols_canonical, errors="ignore"
-                        )
-
-                        html = render_metric_table_html(
-                            display_group, metric_label, suffix, args
-                        )
-                        main_fh.write(html)
-                        sub_fh.write(html)
-
-                        maybe_write_plot(
-                            main_fh,
-                            sub_fh,
-                            group_df=group_df,
-                            raw_data_cols=raw_data_cols,
-                            metric_label=metric_label,
-                            y_axis_col=y_axis_col,
-                            args=args,
-                        )
-
-                        excel_blocks.append(
-                            (metric_label, group_df.reset_index(drop=True))
-                        )
-                        if csv_dir:
-                            fn = _safe_filename(
-                                f"{sheet}__{metric_label}".replace(" ", "_").replace(
-                                    "/", "_"
-                                )
-                            )
-                            group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
-
-                    summary_html = build_valid_max_concurrency_summary_html(
-                        tput_group_df=tput_group_df,
-                        ttft_group_df=ttft_group_df,
-                        tpot_group_df=tpot_group_df,
-                        conc_col=conc_col,
-                        args=args,
-                    )
-                    if summary_html:
-                        main_fh.write(summary_html)
-                        sub_fh.write(summary_html)
-
-                    summary_df = build_valid_max_concurrency_summary_df(
-                        tput_group_df=tput_group_df,
-                        ttft_group_df=ttft_group_df,
-                        tpot_group_df=tpot_group_df,
-                        conc_col=conc_col,
-                        args=args,
-                    )
-                    if summary_df is not None:
-                        excel_blocks.append(
-                            ("Valid Max Concurrency Summary", summary_df)
-                        )
-                        if csv_dir:
-                            fn = _safe_filename(
-                                f"{sheet}__Valid_Max_Concurrency_Summary"
-                            )
-                            summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
-
-                if do_excel:
-                    _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
-
-    if disable_excel:
-        print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
-    else:
-        print(f"Wrote Excel: {excel_path}")
-    if csv_dir:
-        print(f"Wrote CSVs under: {csv_dir}")
+                if summary_html:
+                    main_fh.write(summary_html)
+                    sub_fh.write(summary_html)


 def main():
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -393,7 +393,7 @@ if __name__ == "__main__":
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/performance-benchmarks/"
-            "performance-benchmarks-descriptions.md"
+            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+
+# This script should be run inside the CI process
 # This script assumes that we are already inside the vllm/ directory
 # Benchmarking results will be available inside vllm/benchmarks/results/

@@ -7,26 +9,14 @@
 set -x
 set -o pipefail

-# Environment-driven debug controls (like ON_CPU=1)
-DRY_RUN="${DRY_RUN:-0}"
-MODEL_FILTER="${MODEL_FILTER:-}"
-DTYPE_FILTER="${DTYPE_FILTER:-}"
-
-# Adaptive search controls
-ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
-SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
-SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
-ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
-ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
-
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
+    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  elif command -v hl-smi; then
-    declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
@@ -35,9 +25,9 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-
+  
  declare -g arch_suffix=''
-
+  
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
@@ -54,7 +44,7 @@ check_cpus() {
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
-    echo "$numa_count"
+    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
@@ -122,12 +112,13 @@ json2envs() {
 }

 wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
  local timeout_val="1200"
  timeout "$timeout_val" bash -c '
-    until curl -sf http://localhost:8000/v1/models >/dev/null; do
+    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
-    done
-  '
+    done' && return 0 || return 1
 }

 kill_processes_launched_by_current_bash() {
@@ -190,318 +181,19 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

-# -------------------------------
-# Adaptive concurrency helpers
-# -------------------------------
-result_json_path_for_serving() {
-  local test_name=$1
-  local qps=$2
-  local max_concurrency=$3
-  echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
-}
+run_latency_tests() {
+  # run latency tests using `vllm bench latency` command
+  # $1: a json file specifying latency test cases

-extract_metric_ms() {
-  local metric_name=$1
-  local json_file=$2
+  local latency_test_file
+  latency_test_file=$1

-  [[ -f "$json_file" ]] || return 0
-
-  if [[ "$metric_name" == "ttft" ]]; then
-    jq -r '
-      [
-        .ttft_ms.p99?,
-        .metrics.ttft_ms.p99?,
-        .ttft.p99?,
-        .metrics.ttft.p99?,
-        .p99_ttft_ms?,
-        .ttft_ms.mean?,
-        .metrics.ttft_ms.mean?,
-        .ttft.mean?,
-        .metrics.ttft.mean?,
-        .mean_ttft_ms?
-      ] | map(select(. != null)) | .[0] // empty
-    ' "$json_file"
-  else
-    jq -r '
-      [
-        .tpot_ms.p99?,
-        .metrics.tpot_ms.p99?,
-        .tpot.p99?,
-        .metrics.tpot.p99?,
-        .p99_tpot_ms?,
-        .itl_ms.p99?,
-        .metrics.itl_ms.p99?,
-        .inter_token_latency_ms.p99?,
-        .tpot_ms.mean?,
-        .metrics.tpot_ms.mean?,
-        .tpot.mean?,
-        .metrics.tpot.mean?,
-        .itl_ms.mean?,
-        .metrics.itl_ms.mean?,
-        .mean_tpot_ms?,
-        .mean_itl_ms?
-      ] | map(select(. != null)) | .[0] // empty
-    ' "$json_file"
-  fi
-}
-
-evaluate_sla_from_json() {
-  local json_file=$1
-  local ttft
-  local tpot
-  local pass
-
-  [[ -f "$json_file" ]] || return 2
-
-  ttft=$(extract_metric_ms ttft "$json_file")
-  tpot=$(extract_metric_ms tpot "$json_file")
-
-  [[ -n "$ttft" && -n "$tpot" ]] || return 2
-
-  pass=$(jq -n \
-    --argjson ttft "$ttft" \
-    --argjson tpot "$tpot" \
-    --argjson sla_ttft "$SLA_TTFT_MS" \
-    --argjson sla_tpot "$SLA_TPOT_MS" \
-    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
-
-  [[ "$pass" == "true" ]]
-}
-
-write_adaptive_summary_json() {
-  local summary_file=$1
-  local test_name=$2
-  local qps=$3
-  local static_last_pass=$4
-  local static_first_fail=$5
-  local final_last_pass=$6
-  local final_first_fail=$7
-
-  jq -n \
-    --arg test_name "$test_name" \
-    --arg qps "$qps" \
-    --argjson sla_ttft "$SLA_TTFT_MS" \
-    --argjson sla_tpot "$SLA_TPOT_MS" \
-    --arg static_last_pass "${static_last_pass:-}" \
-    --arg static_first_fail "${static_first_fail:-}" \
-    --arg final_last_pass "${final_last_pass:-}" \
-    --arg final_first_fail "${final_first_fail:-}" \
-    '{
-      test_name: $test_name,
-      qps: $qps,
-      sla_ttft_ms: $sla_ttft,
-      sla_tpot_ms: $sla_tpot,
-      static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
-      static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
-      final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
-      final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
-    }' > "$summary_file"
-}
-
-run_single_serving_probe() {
-  local test_name=$1
-  local qps=$2
-  local max_concurrency=$3
-  local tp=$4
-  local compilation_config_mode=$5
-  local optimization_level=$6
-  local client_args_effective=$7
-  local client_remote_args=$8
-  local server_command=$9
-
-  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
-  local result_json
-  local num_prompts_arg=""
-  local client_command
-
-  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
-
-  if [[ -f "$result_json" ]]; then
-    evaluate_sla_from_json "$result_json"
-    return $?
-  fi
-
-  if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
-    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
-    if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
-    if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
-    num_prompts_arg="--num-prompts $num_prompts"
-  fi
-
-  client_command="vllm bench serve \
-    --save-result \
-    --result-dir $RESULTS_FOLDER \
-    --result-filename ${new_test_name}.json \
-    --request-rate $qps \
-    --max-concurrency $max_concurrency \
-    $num_prompts_arg \
-    --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
-    $client_args_effective $client_remote_args "
-
-  echo "Adaptive probe: $client_command"
-
-  if [[ "${DRY_RUN:-0}" != "1" ]]; then
-    bash -c "$client_command"
-  fi
-
-  jq_output=$(jq -n \
-    --arg server "$server_command" \
-    --arg client "$client_command" \
-    --arg gpu "$gpu_type" \
-    '{
-      server_command: $server,
-      client_command: $client,
-      gpu_type: $gpu,
-      adaptive_search: true
-    }')
-  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
-
-  evaluate_sla_from_json "$result_json"
-}
-
-adaptive_refine_from_static_results() {
-  local test_name=$1
-  local qps=$2
-  local max_concurrency_list_raw=$3
-  local tp=$4
-  local compilation_config_mode=$5
-  local optimization_level=$6
-  local client_args_effective=$7
-  local client_remote_args=$8
-  local server_command=$9
-
-  local sorted_points
-  local point
-  local rc
-  local static_last_pass=""
-  local static_first_fail=""
-  local largest_static=""
-  local step_hint=1
-  local previous_point=""
-  local low
-  local high
-  local mid
-  local probes=0
-  local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
-
-  [[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
-  [[ "${DRY_RUN:-0}" != "1" ]] || return 0
-
-  sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
-  [[ -n "$sorted_points" ]] || return 0
-
-  while read -r point; do
-    [[ -z "$point" ]] && continue
-    largest_static="$point"
-    evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
-    rc=$?
-    if (( rc == 0 )); then
-      static_last_pass="$point"
-    elif (( rc == 1 )); then
-      if [[ -n "$static_last_pass" ]]; then
-        static_first_fail="$point"
-        break
-      fi
-    fi
-
-    if [[ -n "$previous_point" ]]; then
-      step_hint=$(( point - previous_point ))
-      if (( step_hint < 1 )); then step_hint=1; fi
-    fi
-    previous_point="$point"
-  done <<< "$sorted_points"
-
-  if [[ -z "$static_last_pass" ]]; then
-    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
-    return 0
-  fi
-
-  if [[ -n "$static_first_fail" ]]; then
-    low=$static_last_pass
-    high=$static_first_fail
-    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
-      mid=$(( (low + high) / 2 ))
-      probes=$(( probes + 1 ))
-      run_single_serving_probe \
-        "$test_name" "$qps" "$mid" "$tp" \
-        "$compilation_config_mode" "$optimization_level" \
-        "$client_args_effective" "$client_remote_args" "$server_command"
-      rc=$?
-      if (( rc == 0 )); then
-        low=$mid
-      elif (( rc == 1 )); then
-        high=$mid
-      else
-        break
-      fi
-    done
-    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
-    return 0
-  fi
-
-  low=$largest_static
-  high=""
-  while (( probes < ADAPTIVE_MAX_PROBES )); do
-    point=$(( low + step_hint ))
-    if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
-      point=$ADAPTIVE_MAX_CONCURRENCY
-    fi
-    (( point > low )) || break
-    probes=$(( probes + 1 ))
-    run_single_serving_probe \
-      "$test_name" "$qps" "$point" "$tp" \
-      "$compilation_config_mode" "$optimization_level" \
-      "$client_args_effective" "$client_remote_args" "$server_command"
-    rc=$?
-    if (( rc == 0 )); then
-      low=$point
-      (( point == ADAPTIVE_MAX_CONCURRENCY )) && break
-      step_hint=$(( step_hint * 2 ))
-      if (( step_hint < 1 )); then step_hint=1; fi
-    elif (( rc == 1 )); then
-      high=$point
-      break
-    else
-      break
-    fi
-  done
-
-  if [[ -n "$high" ]]; then
-    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
-      mid=$(( (low + high) / 2 ))
-      probes=$(( probes + 1 ))
-      run_single_serving_probe \
-        "$test_name" "$qps" "$mid" "$tp" \
-        "$compilation_config_mode" "$optimization_level" \
-        "$client_args_effective" "$client_remote_args" "$server_command"
-      rc=$?
-      if (( rc == 0 )); then
-        low=$mid
-      elif (( rc == 1 )); then
-        high=$mid
-      else
-        break
-      fi
-    done
-  fi
-
-  write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
-}
-
-run_benchmark_tests() {
-  # run benchmark tests using `vllm bench <test_type>` command
-  # $1: test type (latency or throughput)
-  # $2: a json file specifying test cases
-
-  local test_type=$1
-  local test_file=$2
-
-  # Iterate over tests
-  jq -c '.[]' "$test_file" | while read -r params; do
+  # Iterate over latency tests
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
-      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

@@ -512,15 +204,15 @@ run_benchmark_tests() {
    fi

    # get arguments
-    bench_params=$(echo "$params" | jq -r '.parameters')
-    bench_args=$(json2args "$bench_params")
-    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    bench_envs=$(json2envs "$bench_environment_variables")
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
-    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -533,42 +225,118 @@ run_benchmark_tests() {
      fi
    fi

-    bench_command=" $bench_envs vllm bench $test_type \
+    latency_command=" $latency_envs vllm bench latency \
      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $bench_args"
+      $latency_args"

    echo "Running test case $test_name"
-    echo "${test_type^} command: $bench_command"
+    echo "Latency command: $latency_command"

-    # recording benchmarking command and GPU command
+    # recoding benchmarking command ang GPU command
    jq_output=$(jq -n \
-      --arg command "$bench_command" \
+      --arg latency "$latency_command" \
      --arg gpu "$gpu_type" \
-      --arg test_type "$test_type" \
      '{
-        ($test_type + "_command"): $command,
+        latency_command: $latency,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
-    eval "$bench_command"
+    eval "$latency_command"

    kill_gpu_processes

  done
 }

-run_latency_tests() { run_benchmark_tests "latency" "$1"; }
-run_startup_tests() { run_benchmark_tests "startup" "$1"; }
-run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
+run_throughput_tests() {
+  # run throughput tests using `vllm bench throughput`
+  # $1: a json file specifying throughput test cases

-merge_serving_tests_stream() {
-  # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
-  # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
-  local serving_test_file="$1"
-  # shellcheck disable=SC2016
-  local merged='
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In throughput-test.json, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    throughput_envs=$(json2envs "$throughput_environment_variables")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
+      world_size=$(($tp*$pp))
+      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        continue
+      fi
+    else
+      if [[ $gpu_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+        continue
+      fi
+    fi
+
+    throughput_command=" $throughput_envs vllm bench throughput \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+    # recoding benchmarking command ang GPU command
+    jq_output=$(jq -n \
+      --arg command "$throughput_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        throughput_command: $command,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `vllm bench serve` command
+  # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '
    if type == "array" then
      # Plain format: test cases array
      .[]
@@ -590,50 +358,7 @@ merge_serving_tests_stream() {
    else
      error("Unsupported serving test file format: must be array or object with .tests")
    end
-  '
-
-  jq -c "$merged" "$serving_test_file" | \
-  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
-    jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
-      select((($model|length)==0)
-             or ((.server_parameters.model // "") == $model)
-             or ((.client_parameters.model // "") == $model))
-      | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
-    '
-  else
-    cat
-  fi
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-  #
-  # Supported JSON formats:
-  # 1) Plain format: top-level array
-  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #
-  # 2) Default parameters field + plain format tests
-  #    {
-  #      "defaults": { ... },
-  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #    }
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # In dry-run mode, if filters are provided but no tests match, fail fast.
-  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
-    local count
-    count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
-    if [[ "$count" -eq 0 ]]; then
-      echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
-      return 0
-    fi
-  fi
-
-  # Iterate over serving tests (merged + optional filtered stream)
-  merge_serving_tests_stream "$serving_test_file" | while read -r params; do
+  ' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -652,48 +377,10 @@ run_serving_tests() {
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')

-    # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
-    server_model=$(echo "$server_params" | jq -r '.model // empty')
-    if [[ -z "$server_model" || "$server_model" == "null" ]]; then
-      echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
-      exit 1
-    fi
-    server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
-    server_args=$(json2args "$server_params_no_model")
-
+    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")

-    # ------------------------------------------------------------
-    # Option 1: Dynamic num-prompts scaling based on max_concurrency
-    #
-    # If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
-    #   num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
-    #
-    # If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
-    # unchanged (i.e., whatever is in serving-tests-*.json).
-    # ------------------------------------------------------------
-    PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}"  # no default on purpose
-    MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
-    MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
-
-    if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
-      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
-      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
-      # Handles: --num-prompts 123   and   --num-prompts=123
-      client_args_no_np="$(
-        printf ' %s ' "$client_args" \
-        | sed -E \
-          -e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-          -e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
-      )"
-      # normalize whitespace
-      client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
-      client_args_no_np="$(echo "$client_args_no_np" | xargs)"
-      client_args_effective="$client_args_no_np"
-    else
-      client_args_effective="$client_args"
-    fi
    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -725,13 +412,14 @@ run_serving_tests() {
    fi

    # check if server model and client model is aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

-    server_command="$server_envs vllm serve $server_model \
+    server_command="$server_envs vllm serve \
      $server_args"

    # run the server
@@ -739,7 +427,7 @@ run_serving_tests() {
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
-    if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
+    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
@@ -750,9 +438,6 @@ run_serving_tests() {
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
-    elif [[ "${DRY_RUN:-0}" == "1" ]]; then
-        # dry-run: don't start server
-        echo "Dry Run."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
@@ -762,48 +447,34 @@ run_serving_tests() {
      fi
    fi

-    # save the compilation mode and optimization level on the serving results
-    # whenever they are set
-    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
-    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
-
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
        qps="inf"
+        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
-        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
+        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
-        # If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
-        num_prompts_arg=""
-        if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
-          num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
-          if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
-          if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
-          num_prompts_arg="--num-prompts $num_prompts"
-        fi
-        # pass the tensor parallel size, the compilation mode, and the optimization
-        # level to the client so that they can be used on the benchmark dashboard
+        # pass the tensor parallel size to the client so that it can be displayed
+        # on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
-          $num_prompts_arg \
-          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
-          $client_args_effective $client_remote_args "
+          --metadata "tensor_parallel_size=$tp" \
+          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

-        if [[ "${DRY_RUN:-0}" != "1" ]]; then
-          bash -c "$client_command"
-        fi
+        bash -c "$client_command"

        # record the benchmarking commands
        jq_output=$(jq -n \
@@ -818,23 +489,15 @@ run_serving_tests() {
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

      done
-
-      adaptive_refine_from_static_results \
-        "$test_name" "$qps" "$max_concurrency_list" "$tp" \
-        "$compilation_config_mode" "$optimization_level" \
-        "$client_args_effective" "$client_remote_args" "$server_command"
    done

    # clean up
-    if [[ "${DRY_RUN:-0}" != "1" ]]; then
-      kill -9 "$server_pid"
-      kill_gpu_processes
-    fi
+    kill -9 $server_pid
+    kill_gpu_processes
  done
 }

 main() {
-
  local ARCH
  ARCH=''
  if [[ "$ON_CPU" == "1" ]]; then
@@ -844,13 +507,7 @@ main() {
     check_gpus
     ARCH="$arch_suffix"
  fi
-
-  # DRY_RUN does not execute vLLM; do not require HF_TOKEN.
-  if [[ "${DRY_RUN:-0}" != "1" ]]; then
-    check_hf_token
-  else
-    echo "DRY_RUN=1 -> skip HF_TOKEN validation"
-  fi
+  check_hf_token

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -871,24 +528,17 @@ main() {

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
+
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
-
-  if [[ "${DRY_RUN:-0}" == "1" ]]; then
-    echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
-    exit 0
-  fi
-
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
-  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
-  python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json

  upload_to_buildkite
 }
--- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -51,56 +51,5 @@
            "max-model-len": 256,
            "async-scheduling": ""
        }
-    },
-    {
-        "test_name": "latency_deepseek_r1",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "tensor_parallel_size": 8,
-            "load_format": "dummy",
-            "max-model-len": 2048,
-            "dtype": "bfloat16"
-        }
-    },
-    {
-        "test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "tensor_parallel_size": 8,
-            "max-model-len": 512,
-            "max-num-seqs": 128,
-            "async-scheduling": "",
-            "gpu-memory-utilization": 0.95,
-            "enable_expert_parallel": ""
-        }
-    },
-    {
-        "test_name": "latency_qwen3_8b",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "Qwen/Qwen3-8B",
-            "tensor_parallel_size": 1,
-            "max-model-len": 2048,
-            "max-num-seqs": 128,
-            "dtype": "bfloat16",
-            "async-scheduling": ""
-        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
@@ -1,37 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
-    },
-    "server_parameters": {
-      "dtype": "bfloat16",
-      "model": "openai/whisper-large-v3-turbo"
-    },
-    "client_parameters": {
-      "model": "openai/whisper-large-v3-turbo",
-      "backend": "openai-audio",
-      "endpoint": "/v1/audio/transcriptions",
-      "dataset_name": "hf",
-      "dataset_path": "openslr/librispeech_asr",
-      "hf_subset": "clean",
-      "hf_split": "test",
-      "no_stream": "",
-      "no_oversample": "",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {}
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
@@ -1,41 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [
-      32,
-      64,
-      128
-    ],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "dtype": "bfloat16",
-      "model": "jinaai/jina-embeddings-v3",
-      "trust_remote_code": ""
-    },
-    "client_parameters": {
-      "model": "jinaai/jina-embeddings-v3",
-      "backend": "openai-embeddings",
-      "endpoint": "/v1/embeddings",
-      "dataset_name": "sharegpt",
-      "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_jina_embed_v3_tp1_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {}
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -1,355 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "distributed_executor_backend": "mp",
-      "block_size": 128,
-      "trust_remote_code": "",
-      "disable_log_stats": "",
-      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256
-    },
-    "client_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "backend": "vllm",
-      "ignore-eos": "",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama8B_tp1_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_2048_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_2048_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_2048_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp1_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp2_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp4_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int8_tp1_random_128_128",
-      "server_parameters": {
-        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int8_tp2_random_128_128",
-      "server_parameters": {
-        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int8_tp4_random_128_128",
-      "server_parameters": {
-        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama3B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_granite2B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen1.7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen4B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen8B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_glm9B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_gemma7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "google/gemma-7b",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "google/gemma-7b",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -72,6 +72,17 @@
        "random-output-len": 128
      }
    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {
@@ -94,6 +105,17 @@
        "random-output-len": 2048
      }
    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {
@@ -117,25 +139,144 @@
      }
    },
    {
-      "test_name": "serving_llama8B_tp1_random_2048_2048",
+      "test_name": "serving_llama8B_tp4_random_2048_128",
      "server_parameters": {
-        "tensor_parallel_size": 1
+        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
-        "random-output-len": 2048
+        "random-output-len": 128
      }
    },
    {
-      "test_name": "serving_llama8B_tp2_random_2048_2048",
+      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp2_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 2
      },
      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 2048
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp4_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama3B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_granite2B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen1.7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen4B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen8B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_glm9B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_gemma7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "google/gemma-7b",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "google/gemma-7b",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
      }
    }
  ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -10,6 +10,7 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
+            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -36,6 +37,7 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
+            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -62,6 +64,7 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
+            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -75,83 +78,5 @@
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
-    },
-    {
-        "test_name": "serving_deepseek_r1",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "tensor_parallel_size": 8,
-            "disable_log_stats": "",
-            "load_format": "dummy",
-            "max-model-len": 2048,
-            "max-num-seqs": 200,
-            "async-scheduling": "",
-            "dtype": "bfloat16"
-        },
-        "client_parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "tensor_parallel_size": 8,
-            "disable_log_stats": "",
-            "max-model-len": 2048,
-            "max-num-seqs": 128,
-            "async-scheduling": "",
-            "enable_expert_parallel": "",
-            "max-num-batched-tokens": 4096
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_qwen3_8b",
-        "qps_list": [1, 4, 10, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "Qwen/Qwen-3-8B",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "disable_log_stats": "",
-            "async-scheduling": ""
-        },
-        "client_parameters": {
-            "model": "Qwen/Qwen-3-8B",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -5,6 +5,7 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
+            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -22,6 +23,7 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
+            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -39,6 +41,7 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
+            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -56,6 +59,7 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
            "tensor_parallel_size": 4,
+            "swap_space": 16,
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
                "num_speculative_tokens": 4,
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -57,67 +57,5 @@
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
-    },
-    {
-        "test_name": "throughput_deepseek_r1",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "tensor_parallel_size": 8,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "dataset_name": "sharegpt",
-            "num_prompts": 1000,
-            "backend": "vllm",
-            "max-model-len": 2048,
-            "max-num-seqs": 384,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "tensor_parallel_size": 8,
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "dataset_name": "sharegpt",
-            "num_prompts": 1000,
-            "backend": "vllm",
-            "max-model-len": 2048,
-            "max-num-seqs": 512,
-            "async-scheduling": "",
-            "enable_expert_parallel": ""
-        }
-    },
-    {
-        "test_name": "throughput_qwen3_8b",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "Qwen/Qwen-3-8B",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "dataset_name": "sharegpt",
-            "num_prompts": 1000,
-            "max-num-seqs": 512,
-            "backend": "vllm",
-            "async-scheduling": ""
-        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,270 +1,277 @@
 steps:
+  # aarch64 + CUDA builds
+  - label: "Build wheel - aarch64 - CUDA 12.9"
+    depends_on: ~  # null: this step declares no dependencies
+    id: build-wheel-arm64-cuda-12-9
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # NOTE: torch_cuda_arch_list is derived from the upstream PyTorch build files here:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"  # copy dist/ out of the build image into ./artifacts
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh"  # no manylinux tag argument passed here, unlike the sibling wheel steps
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build wheel - aarch64 - CUDA 13.0"
+    depends_on: ~  # null: this step declares no dependencies
+    id: build-wheel-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # NOTE: torch_cuda_arch_list is derived from the upstream PyTorch build files here:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"  # copy dist/ out of the build image into ./artifacts
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"  # argument is passed through to the upload script
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # aarch64 build
+  - label: "Build wheel - aarch64 - CPU"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
+  - label: "Build wheel - x86_64 - CUDA 12.9"
+    depends_on: ~
+    id: build-wheel-x86-cuda-12-9
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build wheel - x86_64 - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-x86-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 CPU wheel build
+  - label: "Build wheel - x86_64 - CPU"
+    depends_on: ~
+    id: build-wheel-x86-cpu
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (CUDA 12.9)
+  - label: "Build release image - x86_64 - CUDA 12.9"
+    depends_on: ~
+    id: build-release-image-x86
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Build release image - aarch64 - CUDA 12.9"
+    depends_on: ~
+    id: build-release-image-arm64
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+  - label: "Create multi-arch manifest - CUDA 12.9"
+    depends_on:
+      - build-release-image-x86
+      - build-release-image-arm64
+    id: create-multi-arch-manifest
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Annotate release workflow - CUDA 12.9"
+    depends_on:
+      - create-multi-arch-manifest
+    id: annotate-release-workflow
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-release.sh"
+
+  - block: "Build CUDA 13.0 release images"
+    key: block-release-image-build-cuda-13-0
+    depends_on: ~
+
+  - label: "Build release image - x86_64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-x86-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+  - label: "Build release image - aarch64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+
+  - label: "Create multi-arch manifest - CUDA 13.0"
+    depends_on:
+      - build-release-image-x86-cuda-13-0
+      - build-release-image-arm64-cuda-13-0
+    id: create-multi-arch-manifest-cuda-13-0
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version

-  - group: "Build Python wheels"
-    key: "build-wheels"
-    steps:
-      - label: "Build wheel - aarch64 - CUDA 12.9"
-        depends_on: ~
-        id: build-wheel-arm64-cuda-12-9
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
-        env:
-          DOCKER_BUILDKIT: "1"
+  - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+    key: block-upload-release-wheels
+    depends_on:
+      - input-release-version
+      - build-wheel-x86-cuda-12-9
+      - build-wheel-x86-cuda-13-0
+      - build-wheel-x86-cpu
+      - build-wheel-arm64-cuda-12-9
+      - build-wheel-arm64-cuda-13-0
+      - build-wheel-arm64-cpu

-      - label: "Build wheel - aarch64 - CUDA 13.0"
-        depends_on: ~
-        id: build-wheel-arm64-cuda-13-0
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
+  - label: "Upload release wheels to PyPI and GitHub"
+    depends_on:
+      - block-upload-release-wheels
+    id: upload-release-wheels
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/upload-release-wheels.sh"

-      - label: "Build wheel - aarch64 - CPU"
-        depends_on: ~
-        id: build-wheel-arm64-cpu
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
+  - block: "Build CPU release image"
+    key: block-cpu-release-image-build
+    depends_on: ~

-      - label: "Build wheel - x86_64 - CUDA 12.9"
-        depends_on: ~
-        id: build-wheel-x86-cuda-12-9
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
-        env:
-          DOCKER_BUILDKIT: "1"
+  - label: "Build and publish CPU release image"
+    depends_on: [block-cpu-release-image-build, input-release-version]  # image tags below read the release-version meta-data, so wait for the input step too
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"  # versioned tag; empty if release-version was never submitted
+    env:
+      DOCKER_BUILDKIT: "1"

-      - label: "Build wheel - x86_64 - CUDA 13.0"
-        depends_on: ~
-        id: build-wheel-x86-cuda-13-0
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~

-      - label: "Build wheel - x86_64 - CPU"
-        depends_on: ~
-        id: build-wheel-x86-cpu
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: [block-arm64-cpu-release-image-build, input-release-version]  # image tags below read the release-version meta-data, so wait for the input step too
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"  # versioned tag; empty if release-version was never submitted
+    env:
+      DOCKER_BUILDKIT: "1"

-  - group: "Build release Docker images"
-    key: "build-release-images"
-    steps:
-      - label: "Build release image - x86_64 - CUDA 12.9"
-        depends_on: ~
-        id: build-release-image-x86
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-          # re-tag to default image tag and push, just in case arm64 build fails
-          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+  - block: "Build ROCm release image"
+    key: block-rocm-release-image-build
+    depends_on: ~

-      - label: "Build release image - aarch64 - CUDA 12.9"
-        depends_on: ~
-        id: build-release-image-arm64
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+  - label: "Build release image (ROCm)"
+    depends_on: block-rocm-release-image-build
+    id: build-release-image-rocm
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      # Build base image first
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
+      # Build vLLM ROCm image using the base
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"

-      - label: "Build release image - x86_64 - CUDA 13.0"
-        depends_on: ~
-        id: build-release-image-x86-cuda-13-0
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
-          # re-tag to default image tag and push, just in case arm64 build fails
-          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
-      - label: "Build release image - aarch64 - CUDA 13.0"
-        depends_on: ~
-        id: build-release-image-arm64-cuda-13-0
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
-
-      - block: "Build release image for x86_64 CPU"
-        key: block-cpu-release-image-build
-        depends_on: ~
-
-      - label: "Build release image - x86_64 - CPU"
-        depends_on:
-          - block-cpu-release-image-build
-          - input-release-version
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - block: "Build release image for arm64 CPU"
-        key: block-arm64-cpu-release-image-build
-        depends_on: ~
-
-      - label: "Build release image - arm64 - CPU"
-        depends_on: 
-          - block-arm64-cpu-release-image-build
-          - input-release-version
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-  - group: "Publish release images"
-    key: "publish-release-images"
-    steps:
-      - label: "Create multi-arch manifest - CUDA 12.9"
-        depends_on:
-          - build-release-image-x86
-          - build-release-image-arm64
-        id: create-multi-arch-manifest
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
-          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-      - label: "Annotate release workflow - CUDA 12.9"
-        depends_on:
-          - create-multi-arch-manifest
-        id: annotate-release-workflow
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/annotate-release.sh"
-
-      - label: "Create multi-arch manifest - CUDA 13.0"
-        depends_on:
-          - build-release-image-x86-cuda-13-0
-          - build-release-image-arm64-cuda-13-0
-        id: create-multi-arch-manifest-cuda-13-0
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
-          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
-      - label: "Publish nightly multi-arch image to DockerHub"
-        depends_on:
-          - create-multi-arch-manifest
-        if: build.env("NIGHTLY") == "1"
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/push-nightly-builds.sh"
-          # Clean up old nightly builds (keep only last 14)
-          - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
-        plugins:
-          - docker-login#v3.0.0:
-              username: vllmbot
-              password-env: DOCKERHUB_TOKEN
-        env:
-          DOCKER_BUILDKIT: "1"
-          DOCKERHUB_USERNAME: "vllmbot"
-
-      - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
-        depends_on:
-          - create-multi-arch-manifest-cuda-13-0
-        if: build.env("NIGHTLY") == "1"
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
-          # Clean up old nightly builds (keep only last 14)
-          - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
-        plugins:
-          - docker-login#v3.0.0:
-              username: vllmbot
-              password-env: DOCKERHUB_TOKEN
-        env:
-          DOCKER_BUILDKIT: "1"
-          DOCKERHUB_USERNAME: "vllmbot"
-
-  - group: "Publish wheels"
-    key: "publish-wheels"
-    steps:
-      - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
-        key: block-upload-release-wheels
-        depends_on:
-          - input-release-version
-          - build-wheels
-
-      - label: "Upload release wheels to PyPI"
-        depends_on:
-          - block-upload-release-wheels
-        id: upload-release-wheels
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
+  
+  - label: "Build and publish nightly multi-arch image to DockerHub"
+    depends_on:
+      - create-multi-arch-manifest
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"

  # =============================================================================
  # ROCm Release Pipeline (x86_64 only)
@@ -459,7 +466,7 @@ steps:
      S3_BUCKET: "vllm-wheels"

  # ROCm Job 2: Build vLLM ROCm Wheel
-  - label: ":python: Build vLLM ROCm Wheel - x86_64"
+  - label: ":python: Build vLLM ROCm Wheel"
    id: build-rocm-vllm-wheel
    depends_on:
      - step: build-rocm-base-wheels
@@ -621,93 +628,9 @@ steps:
    depends_on:
      - step: upload-rocm-wheels
        allow_failure: true
-      - step: input-release-version
-        allow_failure: true
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-rocm-release.sh"
    env:
      S3_BUCKET: "vllm-wheels"
-
-  # ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
-  # This is the job to create https://wheels.vllm.ai/rocm/ index allowing
-  # users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
-  - block: "Generate Root Index for ROCm Wheels for Release"
-    key: block-generate-root-index-rocm-wheels
-    depends_on: upload-rocm-wheels
-
-  - label: ":package: Generate Root Index for ROCm Wheels for Release"
-    depends_on: block-generate-root-index-rocm-wheels
-    id: generate-root-index-rocm-wheels
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
-    env:
-      S3_BUCKET: "vllm-wheels"
-      VARIANT: "rocm700"
-
-  # ROCm Job 5: Build ROCm Release Docker Image
-  - label: ":docker: Build release image - x86_64 - ROCm"
-    id: build-rocm-release-image
-    depends_on:
-      - step: build-rocm-base-wheels
-        allow_failure: false
-    agents:
-      queue: cpu_queue_postmerge
-    timeout_in_minutes: 60
-    commands:
-      - |
-        set -euo pipefail
-
-        # Login to ECR
-        aws ecr-public get-login-password --region us-east-1 | \
-          docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
-
-        # Download Docker image from S3 (set by build-rocm-base-wheels)
-        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
-        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
-          echo "ERROR: rocm-docker-image-s3-path metadata not found"
-          exit 1
-        fi
-
-        echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
-        mkdir -p artifacts/rocm-docker-image
-        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
-
-        # Load base Docker image
-        echo "Loading base Docker image..."
-        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
-        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
-        echo "Loaded base image: $${BASE_IMAGE_TAG}"
-
-        # Tag and push the base image to ECR
-        docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
-        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
-        echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
-
-        # Get GPU architectures from meta-data
-        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
-        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
-
-        # Build vLLM ROCm release image using cached base
-        DOCKER_BUILDKIT=1 docker build \
-          --build-arg max_jobs=16 \
-          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
-          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-          --build-arg USE_SCCACHE=1 \
-          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-          --build-arg SCCACHE_REGION_NAME=us-west-2 \
-          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-          --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
-          --target vllm-openai \
-          --progress plain \
-          -f docker/Dockerfile.rocm .
-
-        # Push to ECR
-        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
-        echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
-    env:
-      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -11,36 +11,28 @@ fi
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel (by commit):
 \`\`\`
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-(Optional) For CUDA 13.0:
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
-
-(Optional) For CPU:
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

+To download the wheel (by version):
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`

 To download and upload the image:

 \`\`\`
-# Download images:
-
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
-docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
-docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
-
-# Tag and push images:
-
-## CUDA

 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -48,70 +40,22 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
 docker push vllm/vllm-openai:latest-x86_64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64

-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
-docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
-docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
-docker push vllm/vllm-openai:latest-x86_64-cu130
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
-
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
-docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
-docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
-docker push vllm/vllm-openai:latest-aarch64-cu130
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
-
-## ROCm
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker push vllm/vllm-openai-rocm:latest-base
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-
-## CPU
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-docker push vllm/vllm-openai-cpu:latest-x86_64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker push vllm/vllm-openai-cpu:latest-arm64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-
-# Create multi-arch manifest:
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai:latest-rocm
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm

 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
-
-docker manifest rm vllm/vllm-openai:latest-cu130
-docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
-docker manifest push vllm/vllm-openai:latest-cu130
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
-
-docker manifest rm vllm/vllm-openai-cpu:latest || true
-docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
-docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker manifest push vllm/vllm-openai-cpu:latest
-docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
 \`\`\`
-EOF
+EOF
--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -3,32 +3,25 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Generate Buildkite annotation for ROCm wheel release
+
 set -ex

 # Get build configuration from meta-data
 # Extract ROCm version dynamically from Dockerfile.rocm_base
-# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
+# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
 ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
 PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
 PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

-# TODO: Enable the nightly build for ROCm
-# Get release version, default to 1.0.0.dev for nightly/per-commit builds
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
-if [ -z "${RELEASE_VERSION}" ]; then
-  RELEASE_VERSION="1.0.0.dev"
-fi
-
 # S3 URLs
 S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
 S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
-S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
+S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
+ROCM_PATH="rocm/${BUILDKITE_COMMIT}"

-# Format ROCm version for path (e.g., "7.1" -> "rocm710")
-ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
-ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
 buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
-## ROCm Wheel and Docker Image Releases
+## :rocm: ROCm Wheel Release
+
 ### Build Configuration
 | Setting | Value |
 |---------|-------|
@@ -41,72 +34,41 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
 ### :package: Installation

 **Install from this build (by commit):**
-
 \`\`\`bash
-pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/

-# Example for ROCm ${ROCM_VERSION}:
-pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
+# Example:
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
 \`\`\`

 **Install from nightly (if published):**
-
 \`\`\`bash
-pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
+uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
 \`\`\`

 ### :floppy_disk: Download Wheels Directly

 \`\`\`bash
 # List all ROCm wheels
-aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
+aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
+
 # Download specific wheels
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
 \`\`\`

 ### :gear: Included Packages
 - **vllm**: vLLM with ROCm support
 - **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton**: Triton
- **triton-kernels**: Triton kernels
+- **triton_rocm**: Triton built for ROCm
 - **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
 - **amdsmi**: AMD SMI Python bindings
- **amd_aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm

 ### :warning: Notes
 - These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
 - Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
 - Platform: Linux x86_64 only
-
-### :package: Docker Image Release
-
-To download and upload the image:
-
-\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker push vllm/vllm-openai-rocm:latest-base
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-\`\`\`
-
 EOF
--- a/.buildkite/scripts/cache-rocm-base-wheels.sh
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -83,7 +83,7 @@ case "${1:-}" in
            exit 1
        fi

-        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
            exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in

        echo ""
        echo "Downloaded wheels:"
-        find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
+        ls -lh artifacts/rocm-base-wheels/

-        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
        echo ""
        echo "Total: $WHEEL_COUNT wheels"
        echo "========================================"
--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
@@ -1,213 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-#
-# Check if Ray LLM can generate lock files that are compatible with this
-# version of vllm. Downloads Ray's requirement files and runs a full
-# dependency resolution with the installed vllm's constraints to see if
-# a valid lock file can be produced.
-#
-# See: https://github.com/vllm-project/vllm/issues/33599
-
-set -eo pipefail
-
-RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
-
-WORK_DIR=$(mktemp -d)
-trap 'rm -rf "$WORK_DIR"' EXIT
-
-# Fetch all Ray requirement files used in the LLM depset pipeline
-echo ">>> Fetching Ray requirement files"
-RAY_FILES=(
-    "requirements.txt"
-    "requirements/cloud-requirements.txt"
-    "requirements/base-test-requirements.txt"
-    "requirements/llm/llm-requirements.txt"
-    "requirements/llm/llm-test-requirements.txt"
-)
-for FILE in "${RAY_FILES[@]}"; do
-    LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
-    echo "    ${FILE}"
-    curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
-done
-
-# Extract installed vllm deps
-echo ">>> Extracting installed vllm dependency constraints"
-python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
-"""Write out the installed vllm's dependencies as pip constraint lines.
-
-Ray uses vllm[audio], so audio-extra deps are included with their extra
-markers stripped. The resolver cannot evaluate extra markers for a
-package that is not itself being resolved from an index, so we activate
-them manually here.
-"""
-import importlib.metadata
-import re
-import sys
-
-out_path = sys.argv[1]
-raw_reqs = importlib.metadata.requires("vllm") or []
-
-# Ray uses vllm[audio] – activate that extra.
-ACTIVE_EXTRAS = {"audio"}
-EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
-
-lines = []
-for r in raw_reqs:
-    if ";" not in r:
-        # Unconditional dep — always include.
-        lines.append(r.strip())
-        continue
-
-    req_part, _, marker_part = r.partition(";")
-    marker_part = marker_part.strip()
-
-    extra_matches = EXTRA_RE.findall(marker_part)
-    if not extra_matches:
-        # Non-extra marker (python_version, etc.) — keep as-is.
-        lines.append(r.strip())
-        continue
-
-    if not ACTIVE_EXTRAS.intersection(extra_matches):
-        continue  # Skip inactive extras (tensorizer, bench, …).
-
-    # Strip the extra== conditions but keep any remaining markers
-    # (e.g. python_version).
-    cleaned = EXTRA_RE.sub("", marker_part)
-    cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
-    cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
-
-    if cleaned:
-        lines.append(f"{req_part.strip()} ; {cleaned}")
-    else:
-        lines.append(req_part.strip())
-
-with open(out_path, "w") as f:
-    for line in lines:
-        f.write(line + "\n")
-
-print(f"Wrote {len(lines)} constraints to {out_path}")
-PYEOF
-
-echo ">>> Installed vllm deps (first 20 lines):"
-head -20 "${WORK_DIR}/vllm-constraints.txt"
-
-# Remove Ray's vllm pin — the installed vllm's transitive deps
-# (written above) replace it in the resolution. vllm itself cannot
-# be resolved from PyPI for in-development versions, so we test
-# whether Ray's requirements can coexist with vllm's dependency
-# constraints instead.
-sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
-
-# Install uv if needed
-if ! command -v uv &>/dev/null; then
-    echo ">>> Installing uv"
-    pip install uv -q
-fi
-
-# Resolve: given vllm's constraints, can Ray compile a lock file?
-#
-# vllm's dependency constraints are the fixed side — Ray is flexible and
-# can regenerate its lock files. We pass vllm's constraints via -c so
-# the resolver treats them as non-negotiable bounds, then check whether
-# Ray's own requirements can still be satisfied within those bounds.
-echo ""
-echo "============================================================"
-echo ">>> Resolving: Can Ray generate compatible lock files?"
-echo "============================================================"
-
-set +e
-uv pip compile \
-    "${WORK_DIR}/requirements.txt" \
-    "${WORK_DIR}/cloud-requirements.txt" \
-    "${WORK_DIR}/base-test-requirements.txt" \
-    "${WORK_DIR}/llm-requirements.txt" \
-    "${WORK_DIR}/llm-test-requirements.txt" \
-    -c "${WORK_DIR}/vllm-constraints.txt" \
-    --python-version 3.12 \
-    --python-platform x86_64-manylinux_2_31 \
-    --extra-index-url https://download.pytorch.org/whl/cu129 \
-    --index-strategy unsafe-best-match \
-    --unsafe-package setuptools \
-    --unsafe-package ray \
-    --no-header \
-    -o "${WORK_DIR}/resolved.txt" \
-    2>&1
-EXIT_CODE=$?
-set -e
-
-echo ""
-echo "=========================================="
-if [ $EXIT_CODE -eq 0 ]; then
-    echo "SUCCESS: Ray can generate lock files compatible with this vllm."
-    echo ""
-    echo "Key resolved versions:"
-    grep -E '^(protobuf|torch|numpy|transformers)==' \
-        "${WORK_DIR}/resolved.txt" | sort || true
-    echo "=========================================="
-    exit 0
-fi
-
-echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
-echo "This means a fundamental dependency conflict exists that Ray"
-echo "cannot resolve by regenerating its lock files."
-echo "See: https://github.com/vllm-project/vllm/issues/33599"
-echo "=========================================="
-
-# Buildkite annotation
-if [ -f /usr/bin/buildkite-agent ]; then
-    buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
-### :warning: Ray Dependency Compatibility Warning
-This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
-Ray would not be able to regenerate its lock files to accommodate this vllm version.
-
-Please check the **Ray Dependency Compatibility Check** step logs for details.
-See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
-EOF
-fi
-
-# Notify Slack if webhook is configured and PR/branch are valid.
-if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
-    PR="${BUILDKITE_PULL_REQUEST:-}"
-    BRANCH="${BUILDKITE_BRANCH:-}"
-
-    # Skip notification if PR is invalid or branch is empty
-    if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
-        echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
-    else
-        echo ">>> Sending Slack notification"
-        # Single quotes are intentional: the f-string expressions are Python, not shell.
-        # shellcheck disable=SC2016
-        PAYLOAD=$(python3 -c '
-import json, os, sys
-pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
-branch = os.getenv("BUILDKITE_BRANCH", "unknown")
-url = os.getenv("BUILDKITE_BUILD_URL", "#")
-data = {
-    "text": ":warning: Ray Dependency Compatibility Check Failed",
-    "blocks": [{
-        "type": "section",
-        "text": {
-            "type": "mrkdwn",
-            "text": (
-                "*:warning: Ray Dependency Compatibility Check Failed*\n"
-                f"PR #{pr} on branch `{branch}` introduces dependencies "
-                f"that cannot be resolved with Ray'\''s requirements.\n"
-                f"<{url}|View Build>"
-            ),
-        },
-    }],
-}
-print(json.dumps(data))
-')
-
-        HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-            -H 'Content-type: application/json' \
-            -d "$PAYLOAD")
-        echo "    Slack webhook response: $HTTP_CODE"
-    fi
-else
-    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
-fi
-
-exit 1
--- a/.buildkite/scripts/cherry-pick-from-milestone.sh
+++ b/.buildkite/scripts/cherry-pick-from-milestone.sh
@@ -1,242 +0,0 @@
-#!/bin/bash
-#
-# cherry-pick-from-milestone.sh
-# Find commits from a GitHub milestone that are missing from the current branch
-# and output them in chronological order for cherry-picking.
-#
-# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
-#
-
-set -euo pipefail
-
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-usage() {
-    cat <<EOF
-Usage: $(basename "$0") <milestone> [options]
-
-Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
-
-Arguments:
-    milestone       The GitHub milestone name (e.g., v0.14.0)
-
-Options:
-    --dry-run       Show the cherry-pick commands without executing (default)
-    --execute       Actually execute the cherry-picks
-    --main-branch   Specify the main branch name (default: main)
-    --help          Show this help message
-
-Examples:
-    $(basename "$0") v0.14.0
-    $(basename "$0") v0.14.0 --dry-run
-    $(basename "$0") v0.14.0 --execute
-    $(basename "$0") v0.14.0 --main-branch master
-EOF
-    exit 1
-}
-
-log_info() {
-    echo -e "${BLUE}[INFO]${NC} $1"
-}
-
-log_success() {
-    echo -e "${GREEN}[OK]${NC} $1"
-}
-
-log_warn() {
-    echo -e "${YELLOW}[WARN]${NC} $1"
-}
-
-log_error() {
-    echo -e "${RED}[ERROR]${NC} $1" >&2
-}
-
-# Default values
-MILESTONE=""
-DRY_RUN=true
-MAIN_BRANCH="main"
-
-# Parse arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --dry-run)
-            DRY_RUN=true
-            shift
-            ;;
-        --execute)
-            DRY_RUN=false
-            shift
-            ;;
-        --main-branch)
-            MAIN_BRANCH="$2"
-            shift 2
-            ;;
-        --help|-h)
-            usage
-            ;;
-        -*)
-            log_error "Unknown option: $1"
-            usage
-            ;;
-        *)
-            if [[ -z "$MILESTONE" ]]; then
-                MILESTONE="$1"
-            else
-                log_error "Unexpected argument: $1"
-                usage
-            fi
-            shift
-            ;;
-    esac
-done
-
-# Validate milestone argument
-if [[ -z "$MILESTONE" ]]; then
-    log_error "Milestone is required"
-    usage
-fi
-
-# Check if we're in a git repository
-if ! git rev-parse --is-inside-work-tree &>/dev/null; then
-    log_error "Not in a git repository"
-    exit 1
-fi
-
-# Check if gh CLI is available
-if ! command -v gh &>/dev/null; then
-    log_error "GitHub CLI (gh) is not installed"
-    exit 1
-fi
-
-# Check if authenticated with gh
-if ! gh auth status &>/dev/null; then
-    log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
-    exit 1
-fi
-
-CURRENT_BRANCH=$(git branch --show-current)
-log_info "Current branch: ${CURRENT_BRANCH}"
-log_info "Main branch: ${MAIN_BRANCH}"
-log_info "Milestone: ${MILESTONE}"
-echo ""
-
-# Fetch latest from remote
-log_info "Fetching latest from remote..."
-git fetch origin "$MAIN_BRANCH" --quiet
-
-# Get merged PRs from the milestone, sorted by merge date
-log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
-
-# Store PR data in a temp file
-PR_DATA=$(mktemp)
-trap 'rm -f "$PR_DATA"' EXIT
-
-if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
-    --limit 1000 \
-    --json number,title,mergeCommit,mergedAt \
-    --jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
-    log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
-    log_error "This could be due to:"
-    log_error "  - Milestone does not exist"
-    log_error "  - Network/authentication issues"
-    log_error "  - Invalid milestone name format"
-    exit 1
-fi
-
-if [[ ! -s "$PR_DATA" ]]; then
-    log_warn "No merged PRs found for milestone '${MILESTONE}'"
-    exit 0
-fi
-
-TOTAL_PRS=$(wc -l < "$PR_DATA")
-log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
-echo ""
-
-# Find commits that are missing from current branch
-MISSING_COMMITS=()
-MISSING_INFO=()
-
-while IFS=$'\t' read -r sha pr_number title; do
-    # Skip if SHA is empty or null
-    if [[ -z "$sha" || "$sha" == "null" ]]; then
-        log_warn "PR #${pr_number} has no merge commit SHA, skipping"
-        continue
-    fi
-    
-    # Check if this commit is already in the current branch
-    if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
-        log_success "PR #${pr_number} already in branch: ${title:0:60}"
-    else
-        log_warn "PR #${pr_number} MISSING: ${title:0:60}"
-        MISSING_COMMITS+=("$sha")
-        MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
-    fi
-done < "$PR_DATA"
-
-echo ""
-
-if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
-    log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
-    exit 0
-fi
-
-log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
-echo ""
-
-# Output the cherry-pick commands
-echo "=========================================="
-echo "Cherry-pick commands (in chronological order):"
-echo "=========================================="
-echo ""
-
-for info in "${MISSING_INFO[@]}"; do
-    echo "# $info"
-done
-echo ""
-
-echo "# Run these commands to cherry-pick all missing commits:"
-echo "git cherry-pick ${MISSING_COMMITS[*]}"
-echo ""
-
-# Or one by one
-echo "# Or cherry-pick one at a time:"
-for sha in "${MISSING_COMMITS[@]}"; do
-    echo "git cherry-pick $sha"
-done
-echo ""
-
-# Execute if requested
-if [[ "$DRY_RUN" == false ]]; then
-    echo "=========================================="
-    log_info "Executing cherry-picks..."
-    echo "=========================================="
-    
-    for i in "${!MISSING_COMMITS[@]}"; do
-        sha="${MISSING_COMMITS[$i]}"
-        info="${MISSING_INFO[$i]}"
-        
-        echo ""
-        log_info "Cherry-picking: $info"
-        
-        if git cherry-pick "$sha"; then
-            log_success "Successfully cherry-picked $sha"
-        else
-            log_error "Failed to cherry-pick $sha"
-            log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
-            exit 1
-        fi
-    done
-    
-    echo ""
-    log_success "All cherry-picks completed successfully!"
-else
-    echo "=========================================="
-    echo -e "${YELLOW}Dry run mode - no changes made${NC}"
-    echo "Run with --execute to perform the cherry-picks"
-    echo "=========================================="
-fi
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -3,14 +3,7 @@
 set -ex

 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with specified prefix
-# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
-# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
-
-# Get tag prefix from argument, default to "nightly-" if not provided
-TAG_PREFIX="${1:-nightly-}"
-
-echo "Cleaning up tags with prefix: $TAG_PREFIX"
+# This script uses DockerHub API to list and delete old tags with "nightly-" prefix

 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -52,7 +45,7 @@ get_all_tags() {
        set -x
        
        # Get both last_updated timestamp and tag name, separated by |
-        local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
+        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
        
        if [ -z "$tags" ]; then
            break
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:

 def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
-    Generate project list HTML content linking to each project & variant subdirectory.
+    Generate project list HTML content linking to each project & variant sub-directory.
    """
    href_tags = []
    for name in sorted(subdir_names):
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
        comment (str | None): Optional comment to include in the generated HTML files.

    First, parse all wheel files to extract metadata.
-    We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
+    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

-    If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
+    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
-            index.html  # project list, linking to "vllm/" and other packages, and all variant subdirectories
+            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json # machine-readable metadata for all wheels in this package
-            cpu/ # cpu variant subdirectory
+            cpu/ # cpu variant sub-directory
                index.html
                vllm/
                    index.html
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
                vllm/
                    index.html
                    metadata.json
-            cu130/ # cu130 variant subdirectory
+            cu130/ # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -1,57 +1,25 @@
 #!/bin/bash

-# This script runs tests inside the corresponding ROCm docker container.
-# It handles both single-node and multi-node test configurations.
-#
-# Multi-node detection: Instead of matching on fragile group names, we detect
-# multi-node jobs structurally by looking for the bracket command syntax
-# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
-#
-###############################################################################
-# QUOTING / COMMAND PASSING
-#
-# Passing commands as positional arguments ($*) is fragile when the command
-# string itself contains double quotes, e.g.:
-#
-#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
-#
-# The outer shell resolves the nested quotes *before* this script runs, so
-# the script receives mangled input it cannot fully recover.
-#
-# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
-#
-#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
-#   bash run-amd-test.sh
-#
-# Single-quoted assignment preserves all inner double quotes verbatim.
-# The $* path is kept for backward compatibility but callers should migrate.
-###############################################################################
+# This script runs test inside the corresponding ROCm docker container.
 set -o pipefail

 # Export Python path
 export PYTHONPATH=".."

-###############################################################################
-# Helper Functions
-###############################################################################
+# Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done

-wait_for_clean_gpus() {
-  local timeout=${1:-300}
-  local start=$SECONDS
-  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
-  while true; do
-    if grep -q clean /opt/amdgpu/etc/gpu_state; then
-      echo "GPUs state is \"clean\""
-      return
-    fi
-    if (( SECONDS - start >= timeout )); then
-      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
-      exit 1
-    fi
-    sleep 3
-  done
-}
+echo "--- ROCm info"
+rocminfo

+# cleanup older docker images
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -60,12 +28,15 @@ cleanup_docker() {
    exit 1
  fi
  echo "Docker root directory: $docker_root"
-
+  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
@@ -73,446 +44,201 @@ cleanup_docker() {
  fi
 }

-cleanup_network() {
-  local max_nodes=${NUM_NODES:-2}
-  for node in $(seq 0 $((max_nodes - 1))); do
-    if docker ps -a -q -f name="node${node}" | grep -q .; then
-      docker stop "node${node}" || true
-    fi
-  done
-  if docker network ls | grep -q docker-net; then
-    docker network rm docker-net || true
-  fi
-}
-
-is_multi_node() {
-  local cmds="$1"
-  # Primary signal: NUM_NODES environment variable set by the pipeline
-  if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
-    return 0
-  fi
-  # Fallback: detect the bracket syntax structurally
-  # Pattern: [...] && [...] (per-node command arrays)
-  if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
-    return 0
-  fi
-  return 1
-}
-
-handle_pytest_exit() {
-  local exit_code=$1
-  if [ "$exit_code" -eq 5 ]; then
-    echo "Pytest exit code 5 (no tests collected) - treating as success."
-    exit 0
-  fi
-  exit "$exit_code"
-}
-
-###############################################################################
-# Pytest marker/keyword re-quoting
-#
-# When commands are passed through Buildkite -> shell -> $* -> bash -c,
-# quotes around multi-word pytest -m/-k expressions get stripped:
-#   pytest -v -s -m 'not cpu_test' v1/core
-# becomes:
-#   pytest -v -s -m not cpu_test v1/core
-#
-# pytest then interprets "cpu_test" as a file path, not part of the marker.
-#
-# This function detects unquoted expressions after -m/-k and re-quotes them
-# by collecting tokens until a recognizable boundary is reached:
-#   - test path (contains '/')
-#   - test file (ends with '.py')
-#   - another pytest flag (--xxx or -x single-char flags)
-#   - command separator (&& || ; |)
-#   - environment variable assignment (FOO=bar)
-#
-# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
-# unquoted since they have no spaces and work fine.
-#
-# Already-quoted expressions (containing literal single quotes) are passed
-# through untouched to avoid double-quoting values injected by
-# apply_rocm_test_overrides.
-#
-# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
-# double-quotes stripped by the calling shell (see header comment).
-# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
-###############################################################################
-re_quote_pytest_markers() {
-  local input="$1"
-  local output=""
-  local collecting=false
-  local marker_buf=""
-
-  # Strip backslash-newline continuations, then flatten remaining newlines
-  local flat="${input//$'\\\n'/ }"
-  flat="${flat//$'\n'/ }"
-
-  # Disable globbing to prevent *.py etc. from expanding during read -ra
-  local restore_glob
-  restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
-  set -o noglob
-  local -a words
-  read -ra words <<< "$flat"
-  eval "$restore_glob"
-
-  for word in "${words[@]}"; do
-    if $collecting; then
-      # If the token we're about to collect already contains a literal
-      # single quote, the expression was already quoted upstream.
-      # Flush and stop collecting.
-      if [[ "$word" == *"'"* ]]; then
-        if [[ -n "$marker_buf" ]]; then
-          # Should not normally happen (partial buf + quote), flush raw
-          output+="${marker_buf} "
-          marker_buf=""
-        fi
-        output+="${word} "
-        collecting=false
-        continue
-      fi
-
-      local is_boundary=false
-      case "$word" in
-        # Line-continuation artifact
-        "\\")
-          is_boundary=true ;;
-        # Command separators
-        "&&"|"||"|";"|"|")
-          is_boundary=true ;;
-        # Long flags (--ignore, --shard-id, etc.)
-        --*)
-          is_boundary=true ;;
-        # Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
-        # like "not" which don't start with "-". Also skip -k/-m which
-        # would start a new marker (handled below).
-        -[a-zA-Z])
-          is_boundary=true ;;
-        # Test path (contains /)
-        */*)
-          is_boundary=true ;;
-        # Test file (ends with .py, possibly with ::method)
-        *.py|*.py::*)
-          is_boundary=true ;;
-        # Environment variable assignment preceding a command (FOO=bar)
-        *=*)
-          # Only treat as boundary if it looks like VAR=value, not
-          # pytest filter expressions like num_gpus=2 inside markers
-          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
-            is_boundary=true
-          fi
-          ;;
-      esac
-
-      if $is_boundary; then
-        # Strip surrounding double quotes if present (from upstream
-        # single-to-double conversion); without this, wrapping below
-        # would produce '"expr"' with literal double-quote characters.
-        if [[ "$marker_buf" == '"'*'"' ]]; then
-          marker_buf="${marker_buf#\"}"
-          marker_buf="${marker_buf%\"}"
-        fi
-        # Flush the collected marker expression
-        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
-          output+="'${marker_buf}' "
-        else
-          output+="${marker_buf} "
-        fi
-        collecting=false
-        marker_buf=""
-        # Check if this boundary word itself starts a new -m/-k
-        if [[ "$word" == "-m" || "$word" == "-k" ]]; then
-          output+="${word} "
-          collecting=true
-        # Drop stray backslash tokens silently
-        elif [[ "$word" == "\\" ]]; then
-          :
-        else
-          output+="${word} "
-        fi
-      else
-        # Accumulate into marker buffer
-        if [[ -n "$marker_buf" ]]; then
-          marker_buf+=" ${word}"
-        else
-          marker_buf="${word}"
-        fi
-      fi
-    elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
-      output+="${word} "
-      collecting=true
-      marker_buf=""
-    else
-      output+="${word} "
-    fi
-  done
-
-  # Flush any trailing marker expression (marker at end of command)
-  if $collecting && [[ -n "$marker_buf" ]]; then
-    # Strip surrounding double quotes (see mid-stream flush comment)
-    if [[ "$marker_buf" == '"'*'"' ]]; then
-      marker_buf="${marker_buf#\"}"
-      marker_buf="${marker_buf%\"}"
-    fi
-    if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
-      output+="'${marker_buf}'"
-    else
-      output+="${marker_buf}"
-    fi
-  fi
-
-  echo "${output% }"
-}
-
-###############################################################################
-# ROCm-specific pytest command rewrites
-#
-# These apply ignore flags and environment overrides for tests that are not
-# yet supported or behave differently on ROCm hardware. Kept as a single
-# function so new exclusions are easy to add in one place.
-###############################################################################
-
-apply_rocm_test_overrides() {
-  local cmds="$1"
-
-  # --- Model registry filter ---
-  if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
-    cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
-  fi
-
-  # --- LoRA: disable custom paged attention ---
-  if [[ $cmds == *"pytest -v -s lora"* ]]; then
-    cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-  fi
-
-  # --- Kernel ignores ---
-  if [[ $cmds == *" kernels/core"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/core/test_fused_quant_layernorm.py \
-    --ignore=kernels/core/test_permute_cols.py"
-  fi
-
-  if [[ $cmds == *" kernels/attention"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/attention/test_attention_selector.py \
-    --ignore=kernels/attention/test_encoder_decoder_attn.py \
-    --ignore=kernels/attention/test_flash_attn.py \
-    --ignore=kernels/attention/test_flashinfer.py \
-    --ignore=kernels/attention/test_prefix_prefill.py \
-    --ignore=kernels/attention/test_cascade_flash_attn.py \
-    --ignore=kernels/attention/test_mha_attn.py \
-    --ignore=kernels/attention/test_lightning_attn.py \
-    --ignore=kernels/attention/test_attention.py"
-  fi
-
-  if [[ $cmds == *" kernels/quantization"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/quantization/test_int8_quant.py \
-    --ignore=kernels/quantization/test_machete_mm.py \
-    --ignore=kernels/quantization/test_block_fp8.py \
-    --ignore=kernels/quantization/test_block_int8.py \
-    --ignore=kernels/quantization/test_marlin_gemm.py \
-    --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-    --ignore=kernels/quantization/test_int8_kernel.py"
-  fi
-
-  if [[ $cmds == *" kernels/mamba"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/mamba/test_mamba_mixer2.py \
-    --ignore=kernels/mamba/test_causal_conv1d.py \
-    --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
-  fi
-
-  if [[ $cmds == *" kernels/moe"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/moe/test_moe.py \
-    --ignore=kernels/moe/test_cutlass_moe.py \
-    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
-  fi
-
-  # --- Entrypoint ignores ---
-  if [[ $cmds == *" entrypoints/openai "* ]]; then
-    cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
-    --ignore=entrypoints/openai/chat_completion/test_audio.py \
-    --ignore=entrypoints/openai/completion/test_shutdown.py \
-    --ignore=entrypoints/openai/test_completion.py \
-    --ignore=entrypoints/openai/test_models.py \
-    --ignore=entrypoints/openai/test_lora_adapters.py \
-    --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-    --ignore=entrypoints/openai/chat_completion/test_root_path.py \
-    --ignore=entrypoints/openai/test_tokenization.py \
-    --ignore=entrypoints/openai/completion/test_prompt_validation.py "}
-  fi
-
-  if [[ $cmds == *" entrypoints/llm "* ]]; then
-    cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
-    --ignore=entrypoints/llm/test_chat.py \
-    --ignore=entrypoints/llm/test_accuracy.py \
-    --ignore=entrypoints/llm/test_init.py \
-    --ignore=entrypoints/llm/test_prompt_validation.py "}
-  fi
-
-  # Clean up escaped newlines from --ignore appends
-  cmds=$(echo "$cmds" | sed 's/ \\ / /g')
-
-  echo "$cmds"
-}
-
-###############################################################################
-# Main
-###############################################################################
-
-# --- GPU initialization ---
-echo "--- Confirming Clean Initial State"
-wait_for_clean_gpus
-
-echo "--- ROCm info"
-rocminfo
-
-# --- Docker housekeeping ---
+# Call the cleanup docker function
 cleanup_docker

 echo "--- Resetting GPUs"
-echo "reset" > /opt/amdgpu/etc/gpu_state
-wait_for_clean_gpus

-# --- Pull test image ---
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
 echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"

 remove_docker_container() {
-  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT

-# --- Prepare commands ---
 echo "--- Running container"

 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

-# ---- Command source selection ----
-# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
-# Fall back to $* for backward compatibility, but warn that inner
-# double-quotes will have been stripped by the calling shell.
-if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
-  commands="${VLLM_TEST_COMMANDS}"
-  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
-else
-  commands="$*"
-  if [[ -z "$commands" ]]; then
-    echo "Error: No test commands provided." >&2
-    echo "Usage:" >&2
-    echo "  Preferred:  VLLM_TEST_COMMANDS='...' bash $0" >&2
-    echo "  Legacy:     bash $0 \"commands here\"" >&2
-    exit 1
-  fi
-  echo "Commands sourced from positional args (legacy mode)"
-  echo "WARNING: Inner double-quotes in the command string may have been"
-  echo "  stripped by the calling shell. If you see syntax errors, switch to:"
-  echo "  export VLLM_TEST_COMMANDS='your commands here'"
-  echo "  bash $0"
+commands=$@
+echo "Commands:$commands"
+
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
+
+if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-echo "Raw commands: $commands"
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}

-# Fix quoting before ROCm overrides (so overrides see correct structure)
-commands=$(re_quote_pytest_markers "$commands")
-echo "After re-quoting: $commands"
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi

-commands=$(apply_rocm_test_overrides "$commands")
-echo "Final commands: $commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels/core"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/core/test_fused_quant_layernorm.py \
+  --ignore=kernels/core/test_permute_cols.py"
+fi

+if [[ $commands == *" kernels/attention"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/attention/test_attention_selector.py \
+  --ignore=kernels/attention/test_encoder_decoder_attn.py \
+  --ignore=kernels/attention/test_flash_attn.py \
+  --ignore=kernels/attention/test_flashinfer.py \
+  --ignore=kernels/attention/test_prefix_prefill.py \
+  --ignore=kernels/attention/test_cascade_flash_attn.py \
+  --ignore=kernels/attention/test_mha_attn.py \
+  --ignore=kernels/attention/test_lightning_attn.py \
+  --ignore=kernels/attention/test_attention.py"
+fi
+
+if [[ $commands == *" kernels/quantization"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/quantization/test_int8_quant.py \
+  --ignore=kernels/quantization/test_machete_mm.py \
+  --ignore=kernels/quantization/test_block_fp8.py \
+  --ignore=kernels/quantization/test_block_int8.py \
+  --ignore=kernels/quantization/test_marlin_gemm.py \
+  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
+  --ignore=kernels/quantization/test_int8_kernel.py"
+fi
+
+if [[ $commands == *" kernels/mamba"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/mamba/test_mamba_mixer2.py \
+  --ignore=kernels/mamba/test_causal_conv1d.py \
+  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
+fi
+
+if [[ $commands == *" kernels/moe"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/moe/test_moe.py \
+  --ignore=kernels/moe/test_cutlass_moe.py \
+  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+fi
+
+#ignore certain Entrypoints/openai tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_lora_adapters.py \
+  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
+  --ignore=entrypoints/openai/test_root_path.py \
+  --ignore=entrypoints/openai/test_tokenization.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
+fi
+
+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" entrypoints/llm "* ]]; then
+  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
+  --ignore=entrypoints/llm/test_chat.py \
+  --ignore=entrypoints/llm/test_accuracy.py \
+  --ignore=entrypoints/llm/test_init.py \
+  --ignore=entrypoints/llm/test_prompt_validation.py "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
+PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

-# Verify GPU access
+# Test that we're launching on the machine that has
+# proper access to GPUs
 render_gid=$(getent group render | cut -d: -f3)
 if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
 fi

-# --- RDMA device passthrough (conditional) ---
-# If the host has RDMA devices, pass them through so tests like
-# test_moriio_connector can access ibverbs. On hosts without RDMA
-# hardware the tests will gracefully skip via _rdma_available().
-RDMA_FLAGS=""
-if [ -d /dev/infiniband ]; then
-  echo "RDMA devices detected on host, enabling passthrough"
-  RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
-else
-  echo "No RDMA devices found on host, RDMA tests will be skipped"
-fi
-
-# --- Route: multi-node vs single-node ---
-if is_multi_node "$commands"; then
-  echo "--- Multi-node job detected"
-  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
-
-  # Parse the bracket syntax:  prefix ; [node0_cmds] && [node1_cmds]
-  #   BASH_REMATCH[1] = prefix (everything before first bracket)
-  #   BASH_REMATCH[2] = comma-separated node0 commands
-  #   BASH_REMATCH[3] = comma-separated node1 commands
-  if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
-    prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
-    echo "PREFIX: ${prefix}"
-
-    export composite_command="(command rocm-smi || true)"
-    saved_IFS=$IFS
-    IFS=','
-    read -ra node0 <<< "${BASH_REMATCH[2]}"
-    read -ra node1 <<< "${BASH_REMATCH[3]}"
-    IFS=$saved_IFS
-
-    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
-      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+if [[ $commands == *"--shard-id="* ]]; then
+  # assign job count as the number of shards used
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # assign shard-id for each shard
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
+    echo "Shard ${GPU} commands:$commands_gpu"
+    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+    docker run \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        --network=host \
+        --shm-size=16gb \
+        --group-add "$render_gid" \
+        --rm \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
+        -e HF_TOKEN \
+        -e AWS_ACCESS_KEY_ID \
+        -e AWS_SECRET_ACCESS_KEY \
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        -e "PYTHONPATH=${MYPYTHONPATH}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)
+  done
+  #wait for all processes to finish and collect exit codes
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
+    STATUS+=($?)
+  done
+  at_least_one_shard_with_tests=0
+  for st in "${STATUS[@]}"; do
+    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
+      echo "One of the processes failed with $st"
+      exit "${st}"
+    elif [[ ${st} -eq 5 ]]; then
+      echo "Shard exited with status 5 (no tests collected) - treating as success"
+    else # This means st is 0
+      at_least_one_shard_with_tests=1
    fi
-
-    for i in "${!node0[@]}"; do
-      command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
-      command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
-
-      step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
-      echo "COMMANDS: ${step_cmd}"
-      composite_command="${composite_command} && ${step_cmd}"
-    done
-
-    /bin/bash -c "${composite_command}"
-    exit_code=$?
-    cleanup_network
-    handle_pytest_exit "$exit_code"
-  else
-    echo "Multi-node job detected but failed to parse bracket command syntax."
-    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
-    echo "Got: $commands"
-    cleanup_network
-    exit 111
+  done
+  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
+    echo "All shards reported no tests collected. Failing the build."
+    exit 1
  fi
 else
-  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-    $RDMA_FLAGS \
-    --network=host \
-    --shm-size=16gb \
-    --group-add "$render_gid" \
-    --rm \
-    -e HF_TOKEN \
-    -e AWS_ACCESS_KEY_ID \
-    -e AWS_SECRET_ACCESS_KEY \
-    -e BUILDKITE_PARALLEL_JOB \
-    -e BUILDKITE_PARALLEL_JOB_COUNT \
-    -v "${HF_CACHE}:${HF_MOUNT}" \
-    -e "HF_HOME=${HF_MOUNT}" \
-    -e "PYTHONPATH=${MYPYTHONPATH}" \
-    --name "${container_name}" \
-    "${image_name}" \
-    /bin/bash -c "${commands}"
-
-  exit_code=$?
-  handle_pytest_exit "$exit_code"
+          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+          --network=host \
+          --shm-size=16gb \
+          --group-add "$render_gid" \
+          --rm \
+          -e HF_TOKEN \
+          -e AWS_ACCESS_KEY_ID \
+          -e AWS_SECRET_ACCESS_KEY \
+          -v "${HF_CACHE}:${HF_MOUNT}" \
+          -e "HF_HOME=${HF_MOUNT}" \
+          -e "PYTHONPATH=${MYPYTHONPATH}" \
+          --name "${container_name}" \
+          "${image_name}" \
+          /bin/bash -c "${commands}"
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
@@ -1,65 +0,0 @@
-#!/bin/bash
-set -euox pipefail
-
-export VLLM_CPU_KVCACHE_SPACE=1 
-export VLLM_CPU_CI_ENV=1
-# Reduce sub-processes for acceleration
-export TORCH_COMPILE_DISABLE=1 
-export VLLM_ENABLE_V1_MULTIPROCESSING=0
-
-SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
-SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
-wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
-echo "${SDE_CHECKSUM}  ${SDE_ARCHIVE}" | sha256sum --check
-mkdir -p sde
-tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
-
-wait_for_pid_and_check_log() {
-    local pid="$1"
-    local log_file="$2"
-    local exit_status
-
-    if [ -z "$pid" ] || [ -z "$log_file" ]; then
-        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
-        return 1
-    fi
-
-    echo "Waiting for process $pid to finish..."
-    
-    # Use the 'wait' command to pause the script until the specific PID exits.
-    # The 'wait' command's own exit status will be that of the waited-for process.
-    if wait "$pid"; then
-        exit_status=$?
-        echo "Process $pid finished with exit status $exit_status (Success)."
-    else
-        exit_status=$?
-        echo "Process $pid finished with exit status $exit_status (Failure)."
-    fi
-
-    if [ "$exit_status" -ne 0 ]; then
-        echo "Process exited with a non-zero status."
-        echo "--- Last few lines of log file: $log_file ---"
-        tail -n 50 "$log_file"
-        echo "---------------------------------------------"
-        return 1 # Indicate failure based on exit status
-    fi
-
-    echo "No errors detected in log file and process exited successfully."
-    return 0
-}
-
-# Test Sky Lake (AVX512F)
-./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
-PID_TEST_0=$!
-
-# Test Cascade Lake (AVX512F + VNNI)
-./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
-PID_TEST_1=$!
-
-# Test Cooper Lake (AVX512F + VNNI + BF16)
-./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
-PID_TEST_2=$!
-
-wait_for_pid_and_check_log $PID_TEST_0 test_0.log
-wait_for_pid_and_check_log $PID_TEST_1 test_1.log
-wait_for_pid_and_check_log $PID_TEST_2 test_2.log
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -1,43 +0,0 @@
-#!/bin/bash
-set -euox pipefail
-export VLLM_CPU_CI_ENV=0
-
-echo "--- PP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
-vllm bench serve \
-    --backend vllm \
-    --dataset-name random \
-    --model meta-llama/Llama-3.2-3B-Instruct \
-    --num-prompts 20 \
-    --result-dir ./test_results \
-    --result-filename tp_pp.json \
-    --save-result \
-    --endpoint /v1/completions
-kill -s SIGTERM $server_pid; wait $server_pid || true
-failed_req=$(jq '.failed' ./test_results/tp_pp.json)
-if [ "$failed_req" -ne 0 ]; then
-  echo "Some requests were failed!"
-  exit 1
-fi
-
-echo "--- DP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
-vllm bench serve \
-    --backend vllm \
-    --dataset-name random \
-    --model meta-llama/Llama-3.2-3B-Instruct \
-    --num-prompts 20 \
-    --result-dir ./test_results \
-    --result-filename dp_pp.json \
-    --save-result \
-    --endpoint /v1/completions
-kill -s SIGTERM $server_pid; wait $server_pid || true
-failed_req=$(jq '.failed' ./test_results/dp_pp.json)
-if [ "$failed_req" -ne 0 ]; then
-  echo "Some requests were failed!"
-  exit 1
-fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -34,7 +34,7 @@ function cpu_tests() {
  # offline inference
  docker exec cpu-test bash -c "
    set -e
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
-    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }

 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -2,19 +2,119 @@

 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -euox pipefail
+set -ex

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+# used for TP/PP E2E test
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
-IMAGE_NAME="cpu-test-$NUMA_NODE"
-TIMEOUT_VAL=$1
-TEST_COMMAND=$2

-# building the docker image
-echo "--- :docker: Building Docker image"
-docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
+# Setup cleanup
+remove_docker_container() {
+    set -e;
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
-        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2  # $1 (core range) is not read here; $2 selects the cpu-test-<node> containers
+
+  # list packages
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+    set -e
+    pip list"
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"
+
+  # offline inference
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+    pytest -x -v -s tests/kernels/test_onednn.py"
+
+  # Run basic model test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    # Note: disable until supports V1
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    pytest -x -v -s tests/models/language/generation -m cpu_model
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
+
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
+                --ignore=tests/models/multimodal/generation/test_pixtral.py \
+                -m cpu_model"
+
+  # Run compressed-tensor test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
+
+  # Run AWQ/GPTQ test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/quantization/test_cpu_wna16.py"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/lora/test_qwenvl.py"
+
+  # online serving: tp+pp (the server is stopped and waited on so port 8000 is free for the next step)
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid; wait $server_pid || true'
+
+  # online serving: tp+dp (same shutdown handling: SIGTERM, then wait; the exit status of the killed server is ignored)
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid; wait $server_pid || true'
+}
+
+# All CPU tests are expected to finish within the 2.5-hour timeout enforced below.
+export -f cpu_tests
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -25,5 +25,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
+    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -1,49 +1,21 @@
 #!/bin/bash

-# This script builds the HPU docker image and runs the offline inference inside the container.
+# This script builds the HPU docker image and runs the vllm-gaudi upstream CI test script inside the container.
 # It serves a sanity check for compilation and basic model usage.
-#
-# vllm-gaudi compatibility pinning:
-#   The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
-#   When upstream vllm changes its API, the plugin may break before it has been updated.
-#   To handle this, the vllm-gaudi repository maintains a file:
-#     vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
-#   The first line of that file controls what version of vllm is used inside the Docker image:
-#     - "latest"        : no checkout override; the current Buildkite CI commit is used as-is.
-#     - "<commit SHA>"  : vllm is checked out to that specific commit before building, pinning
-#                         the test to a known-compatible baseline.
-#   To unpin (resume testing against the live vllm tip), set the file content back to "latest".
 set -exuo pipefail

-# Fetch the vllm community commit reference from vllm-gaudi (first line only).
-VLLM_COMMUNITY_COMMIT=$(curl -s \
-  https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
-  | head -1 | tr -d '\n')
-
-echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
-
 # Try building the docker image
-image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
-container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
-cat <<EOF | docker build -t "${image_name}" -f - .
+cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
 FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm

-# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
-# to the version known to be compatible with vllm-gaudi. When the value is "latest",
-# the current checkout (the Buildkite CI commit) is used unchanged.
-RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
-      cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
-    fi
-
 WORKDIR /workspace/vllm

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
-RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
+RUN VLLM_TARGET_DEVICE=empty pip install .
 RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

 # install development dependencies (for testing)
@@ -64,20 +36,15 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f "${container_name}" || true; }
+remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers

 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name="${container_name}" --network=host \
+docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
-  -e VLLM_SKIP_WARMUP=true \
-  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-  -e PT_HPU_LAZY_MODE=1 \
-  "${image_name}" \
-  /bin/bash -c '
-  cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
-'
+  hpu-plugin-v1-test-env \
+  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"

 EXITCODE=$?
 if [ $EXITCODE -eq 0 ]; then
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -41,7 +41,6 @@ get_config() {
        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
        exit 1
    fi
-    # shellcheck source=/dev/null
    source "${TEST_RUN_CONFIG_FILE}"
    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
    return 0
@@ -49,8 +48,9 @@ get_config() {

 # get test running configuration.
 fetch_vllm_test_cfg
+get_config
 # Check if the function call was successful. If not, exit the script.
-if ! get_config; then
+if [ $? -ne 0 ]; then
  exit 1
 fi

@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
 echo "agent_idx: ${agent_idx}"
 builder_name="cachebuilder${agent_idx}"
 builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p "${builder_cache_dir}"
+mkdir -p ${builder_cache_dir}

 # Try building the docker image
 cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
-    --builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
-                           --cache-to type=local,dest="${builder_cache_dir}",mode=max \
-    --progress=plain --load -t "${image_name}" -f - .
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
+    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
+                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
+    --progress=plain --load -t ${image_name} -f - .
 FROM ${BASE_IMAGE_NAME}

 # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
 # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
 # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
 #   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns one argument per line: --device, /dev/davinciX, ...
+#   returns --device /dev/davinci0 --device /dev/davinci1
 parse_and_gen_devices() {
    local input="$1"
    local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
        return 1
    fi

+    local devices=""
    local i=0
    while (( i < cards_num )); do
        local dev_idx=$(((index - 1)*cards_num + i ))
-        printf '%s\n' "--device"
-        printf '%s\n' "/dev/davinci${dev_idx}"
+        devices="$devices --device /dev/davinci${dev_idx}"
        ((i++))
    done
+
+    # trim leading space
+    devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Output devices: assigned to the caller variable
+    printf '%s' "$devices"
 }

-mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1

 # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
 # This test checks whether the OOT platform interface is functioning properly in conjunction with
 # the hardware plugin vllm-ascend.
 model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p "${model_cache_dir}"
+mkdir -p ${model_cache_dir}
 docker run \
-    "${device_args[@]}" \
+    ${devices} \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v "${model_cache_dir}":/root/.cache/modelscope \
+    -v ${model_cache_dir}:/root/.cache/modelscope \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

 # Try building the docker image
-docker build -t "${image_name}" -f docker/Dockerfile.xpu .
+docker build -t ${image_name} -f docker/Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() {
@@ -34,22 +34,19 @@ docker run \
    set -e
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
-    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
-    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
    cd tests
-    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
+    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/push-nightly-builds.sh
+++ b/.buildkite/scripts/push-nightly-builds.sh
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
-# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
-# otherwise they will be cleaned up together with the main "nightly" tags.
-
-TAG_VARIANT="$1"
-if [ -n "$TAG_VARIANT" ]; then
-    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
-    TAG_NAME="$TAG_VARIANT-nightly"
-else
-    ORIG_TAG_SUFFIX=""
-    TAG_NAME="nightly"
-fi
-
-ORIG_TAG_NAME="$BUILDKITE_COMMIT"
-
-echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
-
-# pull original arch-dependent images from AWS ECR Public
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
-# tag arch-dependent images
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
-# push arch-dependent images to DockerHub
-docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
-docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
-# push arch-independent manifest to DockerHub
-docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
-docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
-docker manifest push vllm/vllm-openai:"$TAG_NAME"
-docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Setup-and-run script for Prime-RL integration tests
+# This script prepares the environment for Prime-RL tests with nightly vLLM, then runs them
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
+PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+    exit 0
+fi
+
+echo "Setting up Prime-RL integration test environment..."
+
+# Clean up any existing Prime-RL directory
+if [ -d "${PRIME_RL_DIR}" ]; then
+    echo "Removing existing Prime-RL directory..."
+    rm -rf "${PRIME_RL_DIR}"
+fi
+
+# Install UV if not available
+if ! command -v uv &> /dev/null; then
+    echo "Installing UV package manager..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    source "$HOME/.local/bin/env"
+fi
+
+# Clone Prime-RL repository at specific branch for reproducible tests
+PRIME_RL_BRANCH="integ-vllm-main"
+echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
+git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
+cd "${PRIME_RL_DIR}"
+
+echo "Setting up UV project environment..."
+export UV_PROJECT_ENVIRONMENT=/usr/local  # point uv's project environment at /usr/local
+[ -e /usr/local/bin/python ] || ln -sf /usr/bin/python3 /usr/local/bin/python  # idempotent: keep an existing python, else (re)create the link
+
+# Remove vllm pin from pyproject.toml
+echo "Removing vllm pin from pyproject.toml..."
+sed -i '/vllm==/d' pyproject.toml
+
+# Sync Prime-RL dependencies
+echo "Installing Prime-RL dependencies..."
+uv sync --inexact && uv sync --inexact --all-extras
+
+# Verify installation
+echo "Verifying installations..."
+uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
+
+echo "Prime-RL integration test environment setup complete!"
+
+echo "Running Prime-RL integration tests..."
+export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
+uv run pytest -vs tests/integration/test_rl.py -m gpu
+
+echo "Prime-RL integration tests completed!"
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -43,6 +43,7 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
@@ -51,14 +52,13 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
-    --all2all-backend "$BACK" \
-    --port "$PORT" &
+    --port $PORT &
  SERVER_PID=$!
-  wait_for_server "$PORT"
+  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
@@ -1,57 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# Nightly e2e test for prefetch offloading with a MoE model.
-# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
-# and validates GSM8K accuracy matches baseline (no offloading).
-#
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.25}
-NUM_Q=${2:-1319}
-PORT=${3:-8030}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
-  local port=$1
-  timeout 600 bash -c '
-    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
-      sleep 1
-    done'
-}
-
-MODEL="deepseek-ai/DeepSeek-V2-Lite"
-
-cleanup() {
-  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
-    kill "${SERVER_PID}" 2>/dev/null || true
-    for _ in {1..20}; do
-      kill -0 "${SERVER_PID}" 2>/dev/null || break
-      sleep 0.5
-    done
-    kill -9 "${SERVER_PID}" 2>/dev/null || true
-  fi
-}
-trap cleanup EXIT
-
-vllm serve "$MODEL" \
-  --max-model-len 2048 \
-  --offload-group-size 8 \
-  --offload-num-in-group 2 \
-  --offload-prefetch-step 1 \
-  --offload-params w13_weight w2_weight \
-  --port "$PORT" &
-SERVER_PID=$!
-wait_for_server "$PORT"
-
-TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
-OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
-python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
-python3 - <<PY
-import json; acc=json.load(open('${OUT}'))['accuracy']
-print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
-assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
-PY
-
-cleanup
-SERVER_PID=
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
-    --all2all-backend "$BACK" \
+    --all2all-backend $BACK \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
-    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
-    --data-parallel-size "${DATA_PARALLEL_SIZE}" \
+    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
-    --port "$PORT" &
+    --port $PORT &
  SERVER_PID=$!
-  wait_for_server "$PORT"
+  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -18,18 +18,15 @@ wait_for_server() {

 MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

-# Set BACKENDS and platform-specific args based on platform
+# Set BACKENDS based on platform
 if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
-  PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
-  echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
 else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-  PLATFORM_ARGS=()
 fi

 cleanup() {
@@ -51,20 +48,19 @@ for BACK in "${BACKENDS[@]}"; do
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
-    --all2all-backend "$BACK" \
+    --all2all-backend $BACK \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
-    "${PLATFORM_ARGS[@]}" \
-    --port "$PORT" &
+    --port $PORT &
  SERVER_PID=$!
-  wait_for_server "$PORT"
+  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/tool_call/run-bfcl-eval.sh
+++ b/.buildkite/scripts/tool_call/run-bfcl-eval.sh
@@ -1,248 +0,0 @@
-#!/bin/bash
-# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
-# evaluation against a local vLLM server.
-#
-# Usage:
-#   # Run with defaults (gpt-oss-20b, multi_turn)
-#   bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
-#
-#   # Run with gpt-oss-120b and multiple test categories
-#   BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
-#     BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
-#     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
-#
-#   # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
-#   BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
-#     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
-#   BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
-#     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
-#
-# Environment variables (all optional, with defaults):
-#   BFCL_MODEL          - HF model name (default: openai/gpt-oss-20b)
-#   BFCL_API_TYPE       - API type: "chat_completions" or "responses" (default: chat_completions)
-#   BFCL_OUTPUT_DIR     - Directory for BFCL results (default: current working directory)
-#   BFCL_TEST_CATEGORY  - BFCL test categories (default: multi_turn)
-#   BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
-#   BFCL_NUM_THREADS    - Threads for BFCL generate (default: 8)
-#   BFCL_TP_SIZE        - Tensor parallel size (default: 1)
-#   BFCL_MAX_MODEL_LEN  - Max model length (default: 4096)
-#   BFCL_PORT           - Server port (default: 8000)
-#   BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
-#   BFCL_EXTRA_ARGS     - Additional vLLM server args
-
-set -euo pipefail
-
-# ---- Configuration ----
-MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
-API_TYPE="${BFCL_API_TYPE:-chat_completions}"
-OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
-TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
-TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
-NUM_THREADS="${BFCL_NUM_THREADS:-8}"
-TP_SIZE="${BFCL_TP_SIZE:-1}"
-MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
-PORT="${BFCL_PORT:-8000}"
-REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
-EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
-
-# Set up output directory
-if [ -n "$OUTPUT_DIR" ]; then
-    mkdir -p "$OUTPUT_DIR"
-    OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
-fi
-
-echo "============================================"
-echo "BFCL Tool Call Correctness Evaluation"
-echo "============================================"
-echo "Model:          $MODEL"
-echo "Tool parser:    $TOOL_CALL_PARSER"
-echo "API type:       $API_TYPE"
-echo "Output dir:     ${OUTPUT_DIR:-<cwd>}"
-echo "Test category:  $TEST_CATEGORY"
-echo "TP size:        $TP_SIZE"
-echo "Max model len:  $MAX_MODEL_LEN"
-echo "Port:           $PORT"
-echo "Num threads:    $NUM_THREADS"
-echo "============================================"
-
-# ---- Install bfcl-eval if missing ----
-if ! python3 -c "import bfcl_eval" 2>/dev/null; then
-    echo "Installing bfcl-eval..."
-    pip install "bfcl-eval>=2025.10.20.1,<2026"
-fi
-
-# ---- Cleanup handler ----
-SERVER_PID=""
-cleanup() {
-    if [ -n "$SERVER_PID" ]; then
-        echo "Stopping vLLM server (pid=$SERVER_PID)..."
-        kill "$SERVER_PID" 2>/dev/null || true
-        wait "$SERVER_PID" 2>/dev/null || true
-    fi
-    # Remove BFCL lock files (created by filelock for thread-safe writes)
-    rm -rf .file_locks/
-    if [ -n "${OUTPUT_DIR:-}" ]; then
-        rm -rf "$OUTPUT_DIR/.file_locks/"
-    fi
-}
-trap cleanup EXIT
-
-# ---- Start vLLM server ----
-echo "Starting vLLM server..."
-
-SERVE_ARGS=(
-    "$MODEL"
-    --port "$PORT"
-    --enable-auto-tool-choice
-    --tool-call-parser "$TOOL_CALL_PARSER"
-    --tensor-parallel-size "$TP_SIZE"
-    --max-model-len "$MAX_MODEL_LEN"
-    --enforce-eager
-    --no-enable-prefix-caching
-)
-
-# Append reasoning parser if specified
-if [ -n "$REASONING_PARSER" ]; then
-    SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
-fi
-
-# Append any extra args
-if [ -n "$EXTRA_ARGS" ]; then
-    read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
-    SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
-fi
-
-echo "Command: vllm serve ${SERVE_ARGS[*]}"
-vllm serve "${SERVE_ARGS[@]}" &
-SERVER_PID=$!
-
-# ---- Wait for server to be ready ----
-echo "Waiting for vLLM server to start (timeout: 600s)..."
-SECONDS_WAITED=0
-until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
-    if [ $SECONDS_WAITED -ge 600 ]; then
-        echo ""
-        echo "ERROR: vLLM server failed to start within 600s"
-        exit 1
-    fi
-    if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
-        echo "  Still waiting... (${SECONDS_WAITED}s elapsed)"
-    fi
-    sleep 2
-    SECONDS_WAITED=$((SECONDS_WAITED + 2))
-done
-echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
-
-# ---- Run BFCL evaluation ----
-# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
-# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
-# be patched in-process so BFCL knows to use the OpenAI-compatible handler
-# against our local vLLM server.
-bfcl_exit_code=0
-python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
-import os
-import sys
-
-model = sys.argv[1]
-test_category = sys.argv[2]
-num_threads = int(sys.argv[3])
-port = sys.argv[4]
-api_type = sys.argv[5]
-output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
-
-os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
-os.environ["OPENAI_API_KEY"] = "dummy"
-os.environ["BFCL_PROJECT_ROOT"] = output_dir
-
-import bfcl_eval.constants.model_config as bfcl_model_config
-from bfcl_eval.constants.model_config import ModelConfig
-from bfcl_eval.model_handler.api_inference.openai_completion import (
-    OpenAICompletionsHandler,
-)
-from bfcl_eval.model_handler.api_inference.openai_response import (
-    OpenAIResponsesHandler,
-)
-
-if api_type == "responses":
-    handler = OpenAIResponsesHandler
-else:
-    handler = OpenAICompletionsHandler
-
-bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
-    model_name=model,
-    display_name=f"{model} (FC) (vLLM)",
-    url=f"https://huggingface.co/{model}",
-    org="",
-    license="apache-2.0",
-    model_handler=handler,
-    input_price=None,
-    output_price=None,
-    is_fc_model=True,
-    underscore_to_dot=True,
-)
-
-from bfcl_eval.__main__ import evaluate, generate
-import inspect
-import typer
-
-
-def _get_default_kwargs(function):
-    kwargs = {}
-    for k, v in inspect.signature(function).parameters.items():
-        if v.default is not inspect.Parameter.empty:
-            default = v.default
-            if isinstance(default, typer.models.OptionInfo):
-                default = default.default
-            kwargs[k] = default
-    return kwargs
-
-
-# ---- generate ----
-print(f"=== BFCL generate: model={model} test_category={test_category} ===")
-gen_kwargs = _get_default_kwargs(generate)
-gen_kwargs["model"] = [model]
-gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
-gen_kwargs["skip_server_setup"] = True
-gen_kwargs["num_threads"] = num_threads
-generate(**gen_kwargs)
-
-# ---- evaluate ----
-print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
-eval_kwargs = _get_default_kwargs(evaluate)
-eval_kwargs["model"] = [model]
-eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
-evaluate(**eval_kwargs)
-
-print("=== BFCL evaluation completed successfully ===")
-PYEOF
-
-# ---- Upload results to buildkite ----
-if command -v buildkite-agent &>/dev/null; then
-    if [ $bfcl_exit_code -eq 0 ]; then
-        STYLE="success"
-        STATUS="PASSED"
-    else
-        STYLE="error"
-        STATUS="FAILED"
-    fi
-
-    buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
-### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
-EOF
-
-    # BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
-    # $BFCL_PROJECT_ROOT/score/
-    RESULTS_ROOT="${OUTPUT_DIR:-.}"
-    if [ -d "$RESULTS_ROOT/result" ]; then
-        buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
-    fi
-    if [ -d "$RESULTS_ROOT/score" ]; then
-        buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
-    fi
-fi
-
-exit $bfcl_exit_code
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -9,11 +9,10 @@ ENV_FILE=$1

 # For testing on local vm, use `set -a` to export all variables
 source /etc/environment
-# shellcheck source=/dev/null
-source "$ENV_FILE"
+source $ENV_FILE

 remove_docker_container() { 
-    docker rm -f "$CONTAINER_NAME" || true;
+    docker rm -f $CONTAINER_NAME || true;
 }

 trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
 echo "starting docker...$CONTAINER_NAME"
 echo    
 docker run \
- -v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
- --env-file "$ENV_FILE" \
+ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
+ --env-file $ENV_FILE \
 -e HF_TOKEN="$HF_TOKEN" \
- -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
- -e MODEL="$MODEL" \
+ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
+ -e MODEL=$MODEL \
 -e WORKSPACE=/workspace \
- --name "$CONTAINER_NAME" \
+ --name $CONTAINER_NAME \
 -d \
 --privileged \
 --network host \
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,21 +42,21 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-vllm serve "$MODEL" \
+vllm serve $MODEL \
 --seed 42 \
- --max-num-seqs "$MAX_NUM_SEQS" \
- --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
- --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
+ --max-num-seqs $MAX_NUM_SEQS \
+ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
 --no-enable-prefix-caching \
- --download_dir "$DOWNLOAD_DIR" \
- --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
+ --download_dir $DOWNLOAD_DIR \
+ --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &


 echo "wait for 20 minutes.."
 echo
 # sleep 1200
 # wait for 10 minutes...
-for _ in {1..120}; do
+for i in {1..120}; do
    # TODO: detect other type of errors.
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
        echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
 echo
 vllm bench serve \
    --backend vllm \
-    --model "$MODEL"  \
+    --model $MODEL  \
    --dataset-name sonnet \
    --dataset-path benchmarks/sonnet_4x.txt \
-    --sonnet-input-len "$INPUT_LEN" \
-    --sonnet-output-len "$OUTPUT_LEN" \
+    --sonnet-input-len $INPUT_LEN \
+    --sonnet-output-len $OUTPUT_LEN \
    --ignore-eos > "$BM_LOG"

 echo "completed..."
--- a/.buildkite/scripts/trigger-ci-build.sh
+++ b/.buildkite/scripts/trigger-ci-build.sh
@@ -1,227 +0,0 @@
-#!/bin/bash
-#
-# trigger-ci-build.sh
-# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
-# with RUN_ALL=1 and NIGHTLY=1 environment variables.
-#
-# Usage: ./trigger-ci-build.sh [options]
-#
-# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
-#
-# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
-#
-
-set -euo pipefail
-
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-# Default configuration
-PIPELINE="ci"
-DRY_RUN=true
-
-usage() {
-    cat <<EOF
-Usage: $(basename "$0") [options]
-
-Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
-Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
-
-SAFETY: Dry-run by default. Use --execute to actually trigger a build.
-
-Options:
-    --execute       Actually trigger the build (default: dry-run)
-    --pipeline      Buildkite pipeline slug (default: ${PIPELINE})
-    --commit        Override commit SHA (default: current HEAD)
-    --branch        Override branch name (default: current branch)
-    --message       Custom build message (default: auto-generated)
-    --help          Show this help message
-
-Prerequisites:
-    - bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
-    - bk configured: bk configure
-
-Examples:
-    $(basename "$0")                        # Dry-run, show what would happen
-    $(basename "$0") --execute              # Actually trigger the build
-    $(basename "$0") --pipeline ci-shadow   # Dry-run with different pipeline
-EOF
-    exit 1
-}
-
-log_info() {
-    echo -e "${BLUE}[INFO]${NC} $1"
-}
-
-log_success() {
-    echo -e "${GREEN}[OK]${NC} $1"
-}
-
-log_warn() {
-    echo -e "${YELLOW}[WARN]${NC} $1"
-}
-
-log_error() {
-    echo -e "${RED}[ERROR]${NC} $1" >&2
-}
-
-# Parse arguments
-COMMIT=""
-BRANCH=""
-MESSAGE=""
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --execute)
-            DRY_RUN=false
-            shift
-            ;;
-        --pipeline)
-            PIPELINE="$2"
-            shift 2
-            ;;
-        --commit)
-            COMMIT="$2"
-            shift 2
-            ;;
-        --branch)
-            BRANCH="$2"
-            shift 2
-            ;;
-        --message)
-            MESSAGE="$2"
-            shift 2
-            ;;
-        --help|-h)
-            usage
-            ;;
-        -*)
-            log_error "Unknown option: $1"
-            usage
-            ;;
-        *)
-            log_error "Unexpected argument: $1"
-            usage
-            ;;
-    esac
-done
-
-# Check if bk CLI is installed
-if ! command -v bk &>/dev/null; then
-    log_error "Buildkite CLI (bk) is not installed"
-    echo ""
-    echo "Install with:"
-    echo "  brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
-    echo ""
-    echo "Then configure:"
-    echo "  bk configure"
-    exit 1
-fi
-
-# Check if we're in a git repository
-if ! git rev-parse --is-inside-work-tree &>/dev/null; then
-    log_error "Not in a git repository"
-    exit 1
-fi
-
-# Get current commit and branch if not overridden
-if [[ -z "$COMMIT" ]]; then
-    COMMIT=$(git rev-parse HEAD)
-fi
-
-if [[ -z "$BRANCH" ]]; then
-    BRANCH=$(git branch --show-current)
-    if [[ -z "$BRANCH" ]]; then
-        # Detached HEAD state - try to get branch from ref
-        BRANCH=$(git rev-parse --abbrev-ref HEAD)
-    fi
-fi
-
-# Generate default message if not provided
-if [[ -z "$MESSAGE" ]]; then
-    COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
-    MESSAGE="[Manual] ${COMMIT_MSG}"
-fi
-
-# Safety check: Verify the commit exists on the remote
-log_info "Verifying commit exists on remote..."
-git fetch origin --quiet 2>/dev/null || true
-
-# Check if commit is reachable from any remote branch
-REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
-if [[ -z "$REMOTE_BRANCHES" ]]; then
-    log_error "Commit ${COMMIT} does not exist on any remote branch!"
-    echo ""
-    echo "The CI system will fail to checkout this commit."
-    echo "Please push your changes first:"
-    echo ""
-    echo "  git push origin ${BRANCH}"
-    echo ""
-    exit 1
-fi
-
-log_success "Commit found on remote branches:"
-echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/  /'
-if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
-    echo "  ... and more"
-fi
-echo ""
-
-log_info "Pipeline: ${PIPELINE}"
-log_info "Branch: ${BRANCH}"
-log_info "Commit: ${COMMIT}"
-log_info "Message: ${MESSAGE}"
-log_info "Environment: RUN_ALL=1, NIGHTLY=1"
-echo ""
-
-# Build the command
-CMD=(bk build create
-    -y
-    -w
-    -i
-    --pipeline "${PIPELINE}"
-    --commit "${COMMIT}"
-    --branch "${BRANCH}"
-    --message "${MESSAGE}"
-    --env "RUN_ALL=1"
-    --env "NIGHTLY=1"
-)
-
-if [[ "$DRY_RUN" == true ]]; then
-    echo "=========================================="
-    log_warn "DRY-RUN MODE - No build will be triggered"
-    echo "=========================================="
-    echo ""
-    echo "Command that would be executed:"
-    echo ""
-    # Escape single quotes in values for safe shell display
-    escape_for_shell() {
-        printf '%s' "$1" | sed "s/'/'\\\\''/g"
-    }
-    echo "  bk build create \\"
-    echo "    -y \\"
-    echo "    -w \\"
-    echo "    -i \\"
-    echo "    --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
-    echo "    --commit '$(escape_for_shell "${COMMIT}")' \\"
-    echo "    --branch '$(escape_for_shell "${BRANCH}")' \\"
-    echo "    --message '$(escape_for_shell "${MESSAGE}")' \\"
-    echo "    --env 'RUN_ALL=1' \\"
-    echo "    --env 'NIGHTLY=1'"
-    echo ""
-    echo "=========================================="
-    echo -e "${YELLOW}To actually trigger this build, run:${NC}"
-    echo ""
-    echo "  $0 --execute"
-    echo "=========================================="
-    exit 0
-fi
-
-log_info "Triggering build..."
-
-# Execute the command - bk will print the URL and open browser
-"${CMD[@]}"
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -72,19 +72,20 @@ obj_json="objects.json"
 aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
 mkdir -p "$INDICES_OUTPUT_DIR"

-# call script to generate indices for all existing wheels
+# call script to generate indices for all existing wheels
 # this indices have relative paths that could work as long as it is next to the wheel directory in s3
 # i.e., the wheels are always in s3://vllm-wheels/<commit>/
 # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
-alias_args=()
-if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
-    alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+else
+    alias_arg=""
 fi

 # HACK: we do not need regex module here, but it is required by pre-commit hook
 # To avoid any external dependency, we simply replace it back to the stdlib re module
 sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg

 # copy indices to /<commit>/ unconditionally
 echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,9 @@ fi
 # re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
 if [[ "$version" != *"dev"* ]]; then
    echo "Re-generating indices for /$pure_version/"
-    rm -rf "${INDICES_OUTPUT_DIR:?}/*"
+    rm -rf "${INDICES_OUTPUT_DIR:?}"/*  # glob kept outside the quotes so it expands; :? aborts if the variable is empty
    mkdir -p "$INDICES_OUTPUT_DIR"
    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
-    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
 fi
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -1,73 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-BUCKET="vllm-wheels"
-SUBPATH=$BUILDKITE_COMMIT
-S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
-
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
-GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
-
-echo "Release version from Buildkite: $RELEASE_VERSION"
-
-if [[ -z "$GIT_VERSION" ]]; then
-    echo "[FATAL] Not on a git tag, cannot create release."
-    exit 1
-else
-    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
-fi
-# sanity check for version mismatch
-if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
-  if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
-    echo "[WARNING] Force release and ignore version mismatch"
-  else
-    echo "[FATAL] Release version from Buildkite does not match Git version."
-    exit 1
-  fi
-fi
-PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
-
-# check pypi token
-if [[ -z "$PYPI_TOKEN" ]]; then
-  echo "[FATAL] PYPI_TOKEN is not set."
-  exit 1
-else
-  export TWINE_USERNAME="__token__"
-  export TWINE_PASSWORD="$PYPI_TOKEN"
-fi
-
-set -x # avoid printing secrets above
-
-# install twine from pypi
-python3 -m venv /tmp/vllm-release-env
-source /tmp/vllm-release-env/bin/activate
-pip install twine
-python3 -m twine --version
-
-# copy release wheels to local directory
-DIST_DIR=/tmp/vllm-release-dist
-echo "Existing wheels on S3:"
-aws s3 ls "$S3_COMMIT_PREFIX"
-echo "Copying wheels to local directory"
-mkdir -p $DIST_DIR
-# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
-aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
-echo "Wheels copied to local directory"
-# generate source distribution using setup.py
-python setup.py sdist --dist-dir=$DIST_DIR
-ls -la $DIST_DIR
-
-SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
-echo "Found sdist: $SDIST_FILE"
-
-# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
-PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
-if [[ -z "$PYPI_WHEEL_FILES" ]]; then
-  echo "No default variant wheels found, quitting..."
-  exit 1
-fi
-
-python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-echo "Wheels and source distribution uploaded to PyPI"
--- a/.buildkite/scripts/upload-release-wheels.sh
+++ b/.buildkite/scripts/upload-release-wheels.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+#
+# Publish the release wheels built for $BUILDKITE_COMMIT:
+#   1. check that the commit is on a git tag that matches the Buildkite
+#      "release-version" meta-data,
+#   2. copy the matching wheels from s3://vllm-wheels/<commit>/,
+#   3. upload the default-variant wheels to PyPI with twine,
+#   4. create a draft GitHub release with all copied wheels attached.
+#
+# Environment read by this script:
+#   BUILDKITE_COMMIT  - commit whose wheels are released
+#   PYPI_TOKEN        - PyPI API token, passed to twine
+#   GITHUB_TOKEN      - GitHub token, passed to the gh CLI
+#   FORCE_RELEASE_IGNORE_VERSION_MISMATCH - "true" downgrades a tag mismatch to a warning
+
+set -e
+
+BUCKET="vllm-wheels"
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
+echo "Release version from Buildkite: $RELEASE_VERSION"
+# `git describe` exits non-zero when the commit is not tagged; `|| true` keeps
+# `set -e` from aborting here so the explicit error below is actually printed.
+GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null || true)
+if [ -z "$GIT_VERSION" ]; then
+    echo "[FATAL] Not on a git tag, cannot create release."
+    exit 1
+else
+    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
+fi
+# sanity check for version mismatch (the git tag carries a leading 'v')
+if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
+  if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+    echo "[WARNING] Force release and ignore version mismatch"
+  else
+    echo "[FATAL] Release version from Buildkite does not match Git version."
+    exit 1
+  fi
+fi
+
+# check pypi token
+if [ -z "$PYPI_TOKEN" ]; then
+  echo "[FATAL] PYPI_TOKEN is not set."
+  exit 1
+else
+  export TWINE_USERNAME="__token__"
+  export TWINE_PASSWORD="$PYPI_TOKEN"
+fi
+
+# check github token
+if [ -z "$GITHUB_TOKEN" ]; then
+  echo "[FATAL] GITHUB_TOKEN is not set."
+  exit 1
+else
+  export GH_TOKEN="$GITHUB_TOKEN"
+fi
+
+set -x # tracing starts only here so the token handling above is not echoed
+
+# download gh CLI from github
+# Get latest gh CLI version from GitHub API
+GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
+if [ -z "$GH_VERSION" ]; then
+  echo "[FATAL] Failed to get latest gh CLI version from GitHub"
+  exit 1
+fi
+echo "Downloading gh CLI version: $GH_VERSION"
+GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
+GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
+GH_INSTALL_DIR="/tmp/gh-install"
+mkdir -p "$GH_INSTALL_DIR"
+pushd "$GH_INSTALL_DIR"
+curl -L -o "$GH_TARBALL" "$GH_URL"
+tar -xzf "$GH_TARBALL"
+# Look the binary up first and resolve it afterwards: calling realpath on an
+# empty find result would fail under `set -e` before the check below runs.
+GH_BIN=$(find . -name "gh" -type f -executable | head -n 1)
+if [ -z "$GH_BIN" ]; then
+  echo "[FATAL] Failed to find gh CLI executable"
+  exit 1
+fi
+GH_BIN=$(realpath "$GH_BIN")
+echo "gh CLI downloaded successfully, version: $("$GH_BIN" --version)"
+echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
+command "$GH_BIN" release list --limit 5
+popd
+
+# install twine from pypi
+python3 -m venv /tmp/vllm-release-env
+source /tmp/vllm-release-env/bin/activate
+pip install twine
+python3 -m twine --version
+
+# copy release wheels to local directory
+DIST_DIR=/tmp/vllm-release-dist
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+echo "Copying wheels to local directory"
+mkdir -p "$DIST_DIR"
+# include only wheels for the release version, ignore all files with "dev" or "rc<N>" in the name
+# (a bare "*rc*" pattern would also drop the aarch64 wheels, whose name contains "rc")
+aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" "$DIST_DIR"
+echo "Wheels copied to local directory"
+# generate source tarball
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
+ls -la "$DIST_DIR"
+
+
+# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
+PYPI_WHEEL_FILES=$(find "$DIST_DIR" -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
+if [ -z "$PYPI_WHEEL_FILES" ]; then
+  echo "No default variant wheels found, quitting..."
+  exit 1
+fi
+# $PYPI_WHEEL_FILES is left unquoted on purpose: it holds one path per line and
+# must be split into separate arguments.
+python3 -m twine check $PYPI_WHEEL_FILES
+# --non-interactive/--verbose are options of the `upload` subcommand and must follow it
+python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
+echo "Wheels uploaded to PyPI"
+
+# create release on GitHub with the release version and all wheels
+command "$GH_BIN" release create "$GIT_VERSION" -d --latest --notes-from-tag --verify-tag "$DIST_DIR"/*.whl
--- a/.buildkite/scripts/upload-rocm-wheels.sh
+++ b/.buildkite/scripts/upload-rocm-wheels.sh
@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
 cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
 cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true

-WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
+WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
 echo "Total wheels to upload: $WHEEL_COUNT"

 if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
 fi

 # Extract version from vLLM wheel and update version-specific index
-VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
+VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
 if [ -n "$VLLM_WHEEL" ]; then
    VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
    echo "Version in wheel: $VERSION"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -4,10 +4,8 @@ depends_on:
 steps:
 - label: V1 attention (H100)
  timeout_in_minutes: 30
-  device: h100
+  gpu: h100
  source_file_dependencies:
-    - vllm/config/attention.py
-    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
@@ -15,11 +13,9 @@ steps:

 - label: V1 attention (B200)
  timeout_in_minutes: 30
-  device: b200
+  gpu: b200
  source_file_dependencies:
-    - vllm/config/attention.py
-    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
-    - pytest -v -s v1/attention
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
--- a/.buildkite/test_areas/benchmarks.yaml
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -17,15 +17,3 @@ steps:
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
-
- label: Attention Benchmarks Smoke Test (B200)
-  device: b200
-  num_gpus: 2
-  optional: true
-  working_dir: "/vllm-workspace/"
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - benchmarks/attention_benchmarks/
-  - vllm/v1/attention/
-  commands:
-  - python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -2,210 +2,56 @@ group: Compile
 depends_on: 
  - image-build
 steps:
- label: Sequence Parallel Correctness Tests (2 GPUs)
-  timeout_in_minutes: 50
+- label: Fusion and Compile Tests (B200)
+  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  num_devices: 2
-  source_file_dependencies:
-  - vllm/model_executor/layers/
-  - vllm/compilation/
-  - vllm/v1/worker/
-  - vllm/v1/cudagraph_dispatcher.py
-  - tests/compile/correctness_e2e/test_sequence_parallel.py
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
-
- label: Sequence Parallel Correctness Tests (2xH100)
-  timeout_in_minutes: 50
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
-
- label: AsyncTP Correctness Tests (2xH100)
-  timeout_in_minutes: 50
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
-
- label: AsyncTP Correctness Tests (B200)
-  timeout_in_minutes: 50
-  working_dir: "/vllm-workspace/"
-  device: b200
-  optional: true
-  num_devices: 2
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
-
- label: Distributed Compile Unit Tests (2xH100)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
-  source_file_dependencies:
-  - vllm/compilation/
-  - vllm/model_executor/layers
-  - tests/compile/passes/distributed/
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -s -v tests/compile/passes/distributed
-
- label: Fusion and Compile Unit Tests (B200)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: b200
+  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/attention/attention.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
-  - tests/compile/passes/test_fusion_attn.py
-  - tests/compile/passes/test_silu_mul_quant_fusion.py
-  - tests/compile/passes/distributed/test_fusion_all_reduce.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
-    # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
    - nvidia-smi
-    - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
-    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_devices=2 is not set
-    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap the whole command in quotes so YAML parses it as a single string
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    # TODO(luka) move to H100 once pass tests run on H100
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Fusion E2E Quick (H100)
-  timeout_in_minutes: 15
+- label: Fusion E2E (2 GPUs)(B200)
+  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 1
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
-
- label: Fusion E2E Config Sweep (H100)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 1
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/compilation/
-    # can affect pattern matching
-    - vllm/model_executor/layers/layernorm.py
-    - vllm/model_executor/layers/activation.py
-    - vllm/model_executor/layers/attention/attention.py
-    - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run just llama3 (fp8) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
-
- label: Fusion E2E Config Sweep (B200)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/"
-  device: b200
-  num_devices: 1
+  gpu: b200
  optional: true
-  commands:
-    - nvidia-smi
-    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
-
- label: Fusion E2E TP2 Quick (H100)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
  commands:
    - nvidia-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/compilation/
-    # can affect pattern matching
-    - vllm/model_executor/layers/layernorm.py
-    - vllm/model_executor/layers/activation.py
-    - vllm/model_executor/layers/attention/attention.py
-    - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run just llama3 (fp8 & bf16) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
-
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/compilation/
-    # can affect pattern matching
-    - vllm/model_executor/layers/layernorm.py
-    - vllm/model_executor/layers/activation.py
-    - vllm/model_executor/layers/attention/attention.py
-    - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run just llama3 (fp8 & bf16) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
-
- label: Fusion E2E TP2 (B200)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: b200
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    # for ar-rms-quant-fp4, also sweep llama3
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@@ -9,7 +9,6 @@ steps:
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py
-    - pytest -v -s cuda/test_platform_no_cuda_init.py

 - label: Cudagraph
  timeout_in_minutes: 20
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
@@ -16,9 +16,9 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

 - label: Distributed (2 GPUs)
-  timeout_in_minutes: 60
+  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
@@ -47,21 +47,26 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Distributed Torchrun + Examples (4 GPUs)
-  timeout_in_minutes: 30
+- label: Distributed Tests (4 GPUs)
+  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
-  - tests/distributed/test_torchrun_example.py
-  - tests/distributed/test_torchrun_example_moe.py
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
-  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
@@ -79,27 +84,6 @@ steps:
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  # OLD rlhf examples
-  - cd ../examples/offline_inference
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-  # NEW rlhf examples
-  - cd new_weight_syncing
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
-
- label: Distributed DP Tests (4 GPUs)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-  - vllm/distributed/
-  - tests/v1/distributed
-  - tests/v1/engine/test_engine_core_client.py
-  - tests/distributed/test_utils
-  commands:
-  # https://github.com/NVIDIA/nccl/issues/1838
-  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -107,32 +91,20 @@ steps:
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
-
- label: Distributed Compile + Comm (4 GPUs)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-  - vllm/distributed/
-  - tests/distributed/test_pynccl
-  - tests/distributed/test_events
-  - tests/compile/fullgraph/test_basic_correctness.py
-  - tests/distributed/test_symm_mem_allreduce.py
-  - tests/distributed/test_multiproc_executor.py
-  commands:
-  # https://github.com/NVIDIA/nccl/issues/1838
-  - export NCCL_CUMEM_HOST_ENABLE=0
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
-  # test multi-node TP with multiproc executor (simulated on single node)
-  - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - cd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py

 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
-  device: h100
-  num_devices: 8
+  gpu: h100
+  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
@@ -148,9 +120,9 @@ steps:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: Distributed Tests (4 GPUs)(A100)
-  device: a100
+  gpu: a100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
@@ -161,23 +133,26 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Distributed Tests (2 GPUs)(H100)
-  timeout_in_minutes: 15
-  device: h100
+- label: Distributed Tests (2 GPUs)(H200)
+  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_devices: 2
+  num_gpus: 2
  commands:
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
-    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py

 - label: Distributed Tests (2 GPUs)(B200)
-  device: b200
+  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_devices: 2
+  num_gpus: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -186,10 +161,8 @@ steps:
 - label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  num_nodes: 2
-  no_plugin: true
-  optional: true # TODO: revert once infra issue solved
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -198,12 +171,12 @@ steps:
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
-    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"

 - label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
@@ -211,45 +184,10 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-    - tests/v1/kv_connector/nixl_integration/
-  commands:
-    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-    - tests/v1/kv_connector/nixl_integration/
-  commands:
-    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
-  timeout_in_minutes: 30
-  device: a100
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
-  source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-    - vllm/v1/worker/kv_connector_model_runner_mixin.py
-    - tests/v1/kv_connector/nixl_integration/
-  commands:
-    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
-
- label: Pipeline + Context Parallelism (4 GPUs)
+- label: Pipeline + Context Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -258,4 +196,4 @@ steps:
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -4,36 +4,39 @@ depends_on:
 steps:
 - label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
-  device: h100
+  gpu: h100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

 - label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
-  device: h100
+  gpu: h100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
-  device: b200
+  gpu: b200
  optional: true
-  num_devices: 2
+  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
-  timeout_in_minutes: 60
-  device: h100
+- label: Prime-RL Integration (2 GPUs)
+  timeout_in_minutes: 30
  optional: true
-  num_devices: 1
+  soft_fail: true
+  num_gpus: 2
  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
+    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -1,5 +1,5 @@
 group: Engine
-depends_on:
+depends_on: 
  - image-build
 steps:
 - label: Engine
@@ -14,59 +14,13 @@ steps:
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

- label: Engine (1 GPU)
-  timeout_in_minutes: 30
-  source_file_dependencies:
-    - vllm/v1/engine/
-    - tests/v1/engine/
-  commands:
-    - pytest -v -s v1/engine/test_preprocess_error_handling.py
-    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
-
- label: e2e Scheduling (1 GPU)
-  timeout_in_minutes: 30
-  source_file_dependencies:
-    - vllm/v1/
-    - tests/v1/e2e/general/
-  commands:
-    - pytest -v -s v1/e2e/general/test_async_scheduling.py
-
- label: e2e Core (1 GPU)
-  timeout_in_minutes: 30
-  source_file_dependencies:
-    - vllm/v1/
-    - tests/v1/e2e/general/
-  commands:
-    - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
-
- label: V1 e2e (2 GPUs)
-  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
-  optional: true
-  num_devices: 2
+- label: V1 e2e + engine
+  timeout_in_minutes: 45
  source_file_dependencies:
    - vllm/
-    - tests/v1/e2e
+    - tests/v1
  commands:
-    # Only run tests that need exactly 2 GPUs
-    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
-  mirror:
-    amd:
-      device: mi325_2
-      depends_on:
-      - image-build-amd
-
- label: V1 e2e (4 GPUs)
-  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
-  optional: true
-  num_devices: 4
-  source_file_dependencies:
-    - vllm/
-    - tests/v1/e2e
-  commands:
-    # Only run tests that need 4 GPUs
-    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
-  mirror:
-    amd:
-      device: mi325_4
-      depends_on:
-      - image-build-amd
+    # TODO: accuracy does not match on H100, regardless of whether
+    # VLLM_USE_FLASHINFER_SAMPLER is set.
+    - pytest -v -s v1/e2e
+    - pytest -v -s v1/engine
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -34,26 +34,23 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
  - pytest -v -s entrypoints/test_chat_utils.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: Entrypoints Integration (API Server 2)
  timeout_in_minutes: 130
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
-  - tests/entrypoints/rpc
-  - tests/entrypoints/instrumentator
  - tests/tool_use
+  - tests/entrypoints/sleep
+  - tests/entrypoints/instrumentator
+  - tests/entrypoints/rpc
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/instrumentator
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/sleep
  - pytest -v -s tool_use

 - label: Entrypoints Integration (Pooling)
@@ -82,11 +79,6 @@ steps:
    - tests/v1
  commands:
    - pytest -v -s v1/entrypoints
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: OpenAI API Correctness
  timeout_in_minutes: 30
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -14,25 +14,10 @@ steps:
 - label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
-  - pytest -v -s distributed/test_eplb_spec_decode.py
-
- label: Elastic EP Scaling Test
-  timeout_in_minutes: 20
-  device: b200
-  optional: true
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/compilation/
-  - tests/distributed/
-  commands:
-  - pytest -v -s distributed/test_elastic_ep.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -8,17 +8,15 @@ steps:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
-  - tests/kernels/test_concat_mla_q.py
  commands:
-    - pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py

 - label: Kernels Attention Test %N
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
+  - vllm/attention
  - vllm/v1/attention
-    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
-  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -45,8 +43,7 @@ steps:
  - vllm/envs.py
  - vllm/config
  commands:
-    - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

 - label: Kernels Mamba Test
@@ -60,8 +57,8 @@ steps:

 - label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
-  device: h100
-  num_devices: 1
+  gpu: h100
+  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
@@ -72,7 +69,7 @@ steps:
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
-    - pytest -v -s kernels/quantization/test_block_fp8.py
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
    - pytest -v -s kernels/moe/test_deepgemm.py
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py
@@ -80,7 +77,7 @@ steps:
 - label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
-  device: b200
+  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
@@ -88,7 +85,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -97,7 +94,7 @@ steps:
  - vllm/platforms/cuda.py
  commands:
    - nvidia-smi
-    - python3 examples/basic/offline_inference/chat.py
+    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
@@ -117,54 +114,4 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
-    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
-    # e2e
-    - pytest -v -s tests/models/quantization/test_nvfp4.py
-
- label: Kernels Helion Test
-  timeout_in_minutes: 30
-  device: h100
-  source_file_dependencies:
-  - vllm/utils/import_utils.py
-  - tests/kernels/helion/
-  commands:
-    - pip install helion
-    - pytest -v -s kernels/helion/
-
- 
- label: Kernels FP8 MoE Test (1 H100)
-  timeout_in_minutes: 90
-  device: h100
-  num_devices: 1
-  optional: true
-  commands:
-    - pytest -v -s kernels/moe/test_cutlass_moe.py
-    - pytest -v -s kernels/moe/test_flashinfer.py
-    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
-    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
-    - pytest -v -s kernels/moe/test_moe.py
-    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
-    - pytest -v -s kernels/moe/test_block_int8.py
-    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
-    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
-
- label: Kernels FP8 MoE Test (2 H100s)
-  timeout_in_minutes: 90
-  device: h100
-  num_devices: 2
-  optional: true
-  commands:
-    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
-    - pytest -v -s kernels/moe/test_deepep_moe.py
-
- label: Kernels Fp4 MoE Test (B200)
-  timeout_in_minutes: 60
-  device: b200
-  num_devices: 1
-  optional: true
-  commands:
-    - pytest -v -s kernels/moe/test_cutedsl_moe.py
-    - pytest -v -s kernels/moe/test_flashinfer_moe.py
-    - pytest -v -s kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -11,22 +11,22 @@ steps:
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

-# - label: LM Eval Large Models (4 GPUs)(A100)
-#   device: a100
-#   optional: true
-#   num_devices: 4
-#   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-#   source_file_dependencies:
-#   - csrc/
-#   - vllm/model_executor/layers/quantization
-#   commands:
-#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+- label: LM Eval Large Models (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (4 GPUs)(H100)
-  device: h100
+  gpu: h100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -37,65 +37,10 @@ steps:

 - label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
-  device: b200
+  gpu: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
-
- label: LM Eval Large Models (H200)
-  timeout_in_minutes: 60
-  device: h200
-  optional: true
-  num_devices: 8
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
-
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
-  
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
-  device: b200
-  optional: true
-  num_devices: 2
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
-
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
-  device: b200
-  optional: true
-  num_devices: 2
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
-
- label: GPQA Eval (GPT-OSS) (H100)
-  timeout_in_minutes: 120
-  device: h100
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  - tests/evals/gpt_oss/
-  commands:
-    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
-
- label: GPQA Eval (GPT-OSS) (B200)
-  timeout_in_minutes: 120
-  device: b200
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  - tests/evals/gpt_oss/
-  commands:
-    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -14,7 +14,7 @@ steps:

 - label: LoRA TP (Distributed)
  timeout_in_minutes: 30
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -9,7 +9,6 @@ steps:
    - tests/v1
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
@@ -17,8 +16,7 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    # TODO: create another `optional` test group for slow tests
-    - pytest -v -s -m 'not slow_test' v1/spec_decode
+    - pytest -v -s v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
@@ -27,19 +25,13 @@ steps:
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: V1 Others (CPU)
-  depends_on:
-    - image-build-cpu
+  depends_on: ~
  source_file_dependencies:
    - vllm/
    - tests/v1
-  device: cpu
+  no_gpu: true
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
@@ -67,20 +59,19 @@ steps:
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
-     # for basic
-    - python3 basic/offline_inference/chat.py
-    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
-    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 basic/offline_inference/classify.py
-    - python3 basic/offline_inference/embed.py
-    - python3 basic/offline_inference/score.py
+    - python3 offline_inference/basic/chat.py # for basic
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     # for pooling models
-    - python3 pooling/embed/vision_embedding_offline.py --seed 0
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -91,7 +82,7 @@ steps:

 - label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
@@ -116,48 +107,53 @@ steps:
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
-  - tests/detokenizer
  - tests/multimodal
  - tests/utils_
  commands:
-  - pytest -v -s detokenizer
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

 - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
-  depends_on: 
-  - image-build-cpu
+  depends_on: ~
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
-  - tests/test_pooling_params.py
-  - tests/test_ray_env.py
  - tests/multimodal
-  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
-  device: cpu
+  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
-  - pytest -v -s test_pooling_params.py
-  - pytest -v -s test_ray_env.py
  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s renderers
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
  - pytest -v -s config

+- label: GPT-OSS Eval (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
-  device: h100
+  gpu: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers
@@ -166,18 +162,4 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pip install pytest-timeout pytest-forked
    - pytest -v -s v1/determinism/test_batch_invariance.py
-    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
-  
- label: Acceptance Length Test (Large Models) # optional
-  timeout_in_minutes: 25
-  gpu: h100
-  optional: true
-  num_gpus: 1
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/v1/spec_decode/
-  - vllm/model_executor/models/mlp_speculator.py
-  - tests/v1/spec_decode/test_acceptance_length.py
-  commands:
-    - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
-    - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -9,9 +9,9 @@ steps:
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
-  - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
-    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
--- a/.buildkite/test_areas/model_runner_v2.yaml
+++ b/.buildkite/test_areas/model_runner_v2.yaml
@@ -1,110 +0,0 @@
-group: Model Runner V2
-depends_on:
-  - image-build
-steps:
- label: Model Runner V2 Core Tests
-  timeout_in_minutes: 45
-  source_file_dependencies:
-  - vllm/v1/worker/gpu/
-  - vllm/v1/worker/gpu_worker.py
-  - vllm/v1/core/sched/
-  - vllm/v1/attention/
-  - tests/v1/engine/test_llm_engine.py
-  - tests/v1/e2e/
-  - tests/v1/entrypoints/llm/test_struct_output_generate.py
-  commands:
-  - set -x
-  - export VLLM_USE_V2_MODEL_RUNNER=1
-  - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
-  # This requires eager until we sort out CG correctness issues.
-  # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
-  - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
-  - pytest -v -s v1/e2e/general/test_context_length.py
-  - pytest -v -s v1/e2e/general/test_min_tokens.py
-  # Temporary hack filter to exclude ngram spec decoding based tests.
-  - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
-
- label: Model Runner V2 Examples
-  timeout_in_minutes: 45
-  working_dir: "/vllm-workspace/examples"
-  source_file_dependencies:
-    - vllm/v1/worker/gpu/
-    - vllm/v1/core/sched/
-    - vllm/v1/worker/gpu_worker.py
-    - examples/offline_inference/
-    - examples/basic/offline_inference/
-    - examples/pooling/embed/vision_embedding_offline.py
-    - examples/others/tensorize_vllm_model.py
-  commands:
-    - set -x
-    - export VLLM_USE_V2_MODEL_RUNNER=1
-    - pip install tensorizer # for tensorizer test
-    - python3 basic/offline_inference/chat.py # for basic
-    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
-    #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10  # TODO
-    #- python3 basic/offline_inference/embed.py   # TODO
-    # for multi-modal models
-    - python3 offline_inference/audio_language.py --seed 0
-    - python3 offline_inference/vision_language.py --seed 0
-    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-    # for pooling models
-    - python3 pooling/embed/vision_embedding_offline.py --seed 0
-    # for features demo
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
-
- label: Model Runner V2 Distributed (2 GPUs)
-  timeout_in_minutes: 45
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
-  source_file_dependencies:
-    - vllm/v1/worker/gpu/
-    - vllm/v1/worker/gpu_worker.py
-    - tests/basic_correctness/test_basic_correctness.py
-    - tests/v1/distributed/test_async_llm_dp.py
-    - tests/v1/distributed/test_eagle_dp.py
-  commands:
-    - set -x
-    - export VLLM_USE_V2_MODEL_RUNNER=1
-    # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
-    - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
-    # https://github.com/NVIDIA/nccl/issues/1838
-    - export NCCL_CUMEM_HOST_ENABLE=0
-    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
-    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-
-# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-    - vllm/v1/worker/gpu/
-    - vllm/v1/worker/gpu_worker.py
-    - tests/distributed/test_pipeline_parallel.py
-    #- tests/distributed/test_pp_cudagraph.py
-  commands:
-    - set -x
-    - export VLLM_USE_V2_MODEL_RUNNER=1
-    - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
-    # TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
-    #- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
-
- label: Model Runner V2 Spec Decode
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/v1/worker/gpu/
-  - vllm/v1/worker/gpu_worker.py
-  - tests/v1/spec_decode/test_max_len.py
-  - tests/v1/e2e/spec_decode/test_spec_decode.py
-  commands:
-  - set -x
-  - export VLLM_USE_V2_MODEL_RUNNER=1
-  - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
-  - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -4,6 +4,7 @@ depends_on:
 steps:
 - label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -15,6 +16,7 @@ steps:

 - label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
@@ -31,27 +33,18 @@ steps:
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
-  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-    
+    - pytest -v -s models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
-  depends_on: 
-  - image-build-cpu
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
-  device: cpu
+  no_gpu: true
  commands:
    - pytest -v -s models/test_utils.py models/test_vision.py

@@ -65,7 +58,7 @@ steps:
    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
-    - python3 examples/basic/offline_inference/chat.py
+    - python3 examples/offline_inference/basic/chat.py
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -4,6 +4,7 @@ depends_on:
 steps:
 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -15,6 +16,7 @@ steps:

 - label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:

 - label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -37,7 +40,7 @@ steps:
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    # Shard hybrid language model tests
    - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:

 - label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
@@ -52,21 +56,13 @@ steps:
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-      commands:
-      - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
-      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-      - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (PPL)
  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
@@ -76,20 +72,17 @@ steps:

 - label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: Language Models Test (MTEB)
  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -2,75 +2,23 @@ group: Models - Multimodal
 depends_on: 
  - image-build
 steps:
- label: "Multi-Modal Models (Standard) 1: qwen2"
-  timeout_in_minutes: 45
+- label: Multi-Modal Models (Standard) # 60min
+  timeout_in_minutes: 80
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
-    - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
-  timeout_in_minutes: 45
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
-  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
-    - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
-  timeout_in_minutes: 45
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
-  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
-    - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-
- label: "Multi-Modal Models (Standard) 4: other + whisper"
-  timeout_in_minutes: 45
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
-  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: Multi-Modal Processor Test (CPU)
-  depends_on: 
-  - image-build-cpu
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
-  - tests/models/registry.py
-  device: cpu
+  no_gpu: true
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
@@ -80,7 +28,6 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
-  - tests/models/registry.py
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
@@ -103,11 +50,6 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: Multi-Modal Models (Extended) 2
  optional: true
@@ -126,3 +68,12 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
@@ -15,17 +15,10 @@ steps:
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
-  # begin io_processor plugins test
-  # test generic io_processor plugins functions
-  - pytest -v -s ./plugins_tests/test_io_processor_plugins.py
-  # test Terratorch io_processor plugins
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
-  # test bge_m3_sparse io_processor plugin
-  - pip install -e ./plugins/bge_m3_sparse_plugin
-  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
-  - pip uninstall bge_m3_sparse_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -36,6 +29,6 @@ steps:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
-  - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -3,7 +3,7 @@ depends_on:
  - image-build
 steps:
 - label: PyTorch Compilation Unit Tests
-  timeout_in_minutes: 10
+  timeout_in_minutes: 30
  source_file_dependencies:
    - vllm/
    - tests/compile
@@ -17,16 +17,8 @@ steps:
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

- label: PyTorch Compilation Passes Unit Tests
-  timeout_in_minutes: 20
-  source_file_dependencies:
-    - vllm/
-    - tests/compile/passes
-  commands:
-  - pytest -s -v compile/passes --ignore compile/passes/distributed
-
 - label: PyTorch Fullgraph Smoke Test
-  timeout_in_minutes: 35
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -38,13 +30,16 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

 - label: PyTorch Fullgraph
-  timeout_in_minutes: 30
+  timeout_in_minutes: 40
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -16,14 +16,14 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  device: b200
+  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
--- a/.buildkite/test_areas/ray_compat.yaml
+++ b/.buildkite/test_areas/ray_compat.yaml
@@ -1,16 +0,0 @@
-group: Ray Compatibility
-depends_on:
-  - image-build
-steps:
- label: Ray Dependency Compatibility Check
-  # Informational only — does not block the pipeline.
-  # If this fails, it means the PR introduces a dependency that
-  # conflicts with Ray's dependency constraints.
-  # See https://github.com/vllm-project/vllm/issues/33599
-  soft_fail: true
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - requirements/
-  - setup.py
-  commands:
-  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@@ -12,10 +12,3 @@ steps:
  commands:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-      commands:
-      - pytest -v -s samplers
--- a/.buildkite/test_areas/spec_decode.yaml
+++ b/.buildkite/test_areas/spec_decode.yaml
@@ -1,40 +0,0 @@
-group: Spec Decode
-depends_on:
-  - image-build
-steps:
- label: Spec Decode Eagle
-  timeout_in_minutes: 30
-  source_file_dependencies:
-    - vllm/v1/spec_decode/
-    - vllm/v1/worker/gpu/spec_decode/
-    - tests/v1/e2e/spec_decode/
-  commands:
-    - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
-
- label: Spec Decode Speculators + MTP
-  timeout_in_minutes: 30
-  source_file_dependencies:
-    - vllm/v1/spec_decode/
-    - vllm/v1/worker/gpu/spec_decode/
-    - vllm/transformers_utils/configs/speculators/
-    - tests/v1/e2e/spec_decode/
-  commands:
-    - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
-
- label: Spec Decode Ngram + Suffix
-  timeout_in_minutes: 30
-  source_file_dependencies:
-    - vllm/v1/spec_decode/
-    - vllm/v1/worker/gpu/spec_decode/
-    - tests/v1/e2e/spec_decode/
-  commands:
-    - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
-
- label: Spec Decode Draft Model
-  timeout_in_minutes: 30
-  source_file_dependencies:
-    - vllm/v1/spec_decode/
-    - vllm/v1/worker/gpu/spec_decode/
-    - tests/v1/e2e/spec_decode/
-  commands:
-    - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Weight Loading Multiple GPU  # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
@@ -13,13 +13,13 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

-# - label: Weight Loading Multiple GPU - Large Models # optional
-#   working_dir: "/vllm-workspace/tests"
-#   num_devices: 2
-#   device: a100
-#   optional: true
-#   source_file_dependencies:
-#   - vllm/
-#   - tests/weight_loading
-#   commands:
-#     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+- label: Weight Loading Multiple GPU - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
--- a/.github/.bc-linter.yml
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+#   - "vllm/v1/attention/*.py"
+#   - "vllm/v1/core/*.py"
+exclude:
+  - "**/*.py"
+
+scan:
+  functions: true        # check free functions and methods
+  classes: true          # check classes/dataclasses
+  public_only: true      # ignore names starting with "_" at any level
+
+annotations:
+  include:               # decorators that force‑include a symbol
+    - name: "bc_linter_include"  # matched by simple name or dotted suffix
+      propagate_to_members: false # for classes, include methods/inner classes
+  exclude:               # decorators that force‑exclude a symbol
+    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
+      propagate_to_members: true  # for classes, exclude methods/inner classes
+
+excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,66 +2,43 @@
 # for more info about CODEOWNERS file

 # This list covers the "core" components of vLLM that require careful review
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
-/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
-/vllm/lora @jeejeelee
-/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
+/vllm/attention @LucasWilkinson
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/model_executor/layers/batch_invariant.py @yewentao256 
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
-/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
+/vllm/vllm_flash_attn @LucasWilkinson
+/vllm/lora @jeejeelee
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/tool_parsers @aarnphm @chaunceyjiang
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @heheda12345
-
-# Entrypoints
-/vllm/entrypoints/anthropic @mgoin @DarkLight1337
-/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
-/vllm/entrypoints/mcp @heheda12345
-/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
-/vllm/entrypoints/openai/realtime @njhill
-/vllm/entrypoints/openai/speech_to_text @NickLucche
-/vllm/entrypoints/pooling @noooop
-/vllm/entrypoints/sagemaker @DarkLight1337
-/vllm/entrypoints/serve @njhill
-/vllm/entrypoints/*.py @njhill
-/vllm/entrypoints/chat_utils.py @DarkLight1337
-/vllm/entrypoints/llm.py @DarkLight1337
-
-# Input/Output Processing
-/vllm/sampling_params.py @njhill @NickLucche
-/vllm/pooling_params.py @noooop @DarkLight1337
-/vllm/tokenizers @DarkLight1337 @njhill
-/vllm/renderers @DarkLight1337 @njhill
-/vllm/reasoning @aarnphm @chaunceyjiang
-/vllm/tool_parsers @aarnphm @chaunceyjiang
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
-/vllm/v1/attention @LucasWilkinson @MatthewBonanni
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
-/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
+/vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
-/vllm/v1/kv_offload @ApostaC @orozery
-/vllm/v1/engine @njhill
-/vllm/v1/executor @njhill
-/vllm/v1/worker @njhill
-/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
+/vllm/v1/offloading @ApostaC

 # Model runner V2
-/vllm/v1/worker/gpu @WoosukKwon @njhill
-/vllm/v1/worker/gpu/kv_connector.py @orozery
+/vllm/v1/worker/gpu @WoosukKwon

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 
@@ -77,13 +54,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
-/tests/v1/kv_connector @ApostaC @orozery
-/tests/v1/kv_offload @ApostaC @orozery
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
 /tests/v1/determinism @yewentao256 

 # Transformers modeling backend
@@ -136,8 +113,8 @@ mkdocs.yaml @hmellor
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
 /vllm/model_executor/models/voxtral*.py @patrickvonplaten
 /vllm/model_executor/models/pixtral*.py @patrickvonplaten
-/vllm/tokenizers/mistral.py @patrickvonplaten
 /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
+/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

 # Kernels
 /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -173,7 +150,9 @@ mkdocs.yaml @hmellor
 /examples/pooling @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler @noooop

 # Security guide and policies
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -3,7 +3,6 @@ pull_request_rules:
  description: Automatically apply documentation label
  conditions:
    - label != stale
-    - -closed
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@@ -27,7 +26,7 @@ pull_request_rules:
        Hi @{{author}}, the pre-commit checks have failed. Please run:

        ```bash 
-        uv pip install pre-commit>=4.5.1
+        uv pip install pre-commit
        pre-commit install
        pre-commit run --all-files
        ```
@@ -38,13 +37,15 @@ pull_request_rules:

        > [!TIP]
        > <details>
-        > <summary>Is <code>mypy</code> failing?</summary>
+        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
        > <br/>
-        > <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
+        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
        >
        > ```bash
        > # For mypy (substitute "3.10" with the failing version if needed)
        > pre-commit run --hook-stage manual mypy-3.10
+        > # For markdownlint
+        > pre-commit run --hook-stage manual markdownlint
        > ```
        > </details>

@@ -258,7 +259,8 @@ pull_request_rules:
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/structured_outputs/structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
      - files~=^vllm/v1/structured_output/
@@ -334,7 +336,7 @@ pull_request_rules:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
-      - files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
@@ -381,7 +383,7 @@ pull_request_rules:
    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
-      - files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
+      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
      - files~=^tests/model_executor/model_loader/tensorizer_loader/
  actions:
    assign:
@@ -412,18 +414,6 @@ pull_request_rules:
      remove:
        - needs-rebase

- name: label-bug
-  description: Automatically apply bug label
-  conditions:
-    - label != stale
-    - or:
-      - title~=(?i)\bbug\b
-      - title~=(?i)\bbugfix\b
-  actions:
-    label:
-      add:
-        - bug
-
 - name: label-kv-connector
  description: Automatically apply kv-connector label
  conditions:
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  bc_lint:
+    if: github.repository_owner == 'vllm-project'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Run BC Lint Action
+        uses: pytorch/test-infra/.github/actions/bc-lint@main
+        with:
+          repo: ${{ github.event.pull_request.head.repo.full_name }}
+          base_sha: ${{ github.event.pull_request.base.sha }}
+          head_sha: ${{ github.event.pull_request.head.sha }}
+          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+          config_dir: .github
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Shengqi Chen	b17039bccc	[CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032 ) (cherry picked from commit `8e61425ee6`)	2026-01-16 21:04:48 -08:00
Cyrus Leung	48b67ba75f	[Frontend] Standardize use of `create_error_response` (#32319 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2026-01-16 11:35:10 +00:00
TJian	09f4264a55	[Bugfix] Fix ROCm dockerfiles (#32447 ) Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>	2026-01-16 10:50:00 +08:00
Matthew Bonanni	7f42dc20bb	[CI] Fix LM Eval Large Models (H100) (#32423 ) Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> (cherry picked from commit `bcf2333cd6`)	2026-01-15 18:00:21 -08:00
TJian	c2a37a3cf8	Cherry pick [ROCm] [CI] [Release] Rocm wheel pipeline with sccache #32264 Signed-off-by: Kevin H. Luu <khluu000@gmail.com>	2026-01-15 17:59:58 -08:00
Michael Goin	0e31fc7996	[UX] Use kv_offloading_backend=native by default (#32421 ) Signed-off-by: mgoin <mgoin64@gmail.com> (cherry picked from commit `1be5a73571`)	2026-01-15 17:55:20 -08:00
Pleaplusone	6ac0fcf416	[ROCm][Bugfix] Disable hip sampler to fix deepseek's accuracy issue on ROCm (#32413 ) Signed-off-by: ganyi <ygan@amd.com> (cherry picked from commit `77c16df31d`)	2026-01-15 17:55:06 -08:00
Douglas Lehr	b62249725c	[ROCM] Add ROCm image build to release pipeline (#31995 ) Signed-off-by: Doug Lehr <douglehr@amd.com> Co-authored-by: Doug Lehr <douglehr@amd.com> (cherry picked from commit `c5891b5430`)	2026-01-15 17:54:47 -08:00
vllmellm	1b57275207	[Bugfix][ROCm][performance] Resolve the performance regression issue of the Qwen3-Next-80B-A3B-Thinking under rocm_atten (#32336 ) Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> (cherry picked from commit `e27078ea80`)	2026-01-15 17:54:01 -08:00
Martin Hickey	2c24bc6996	[BugFix] [KVConnector] Fix KV events for LMCache connector (#32169 ) Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>	2026-01-13 10:56:23 -08:00
Cyrus Leung	0aa8c40552	[Bugfix] Replace `PoolingParams.normalize` with `use_activation` (#32243 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2026-01-13 10:56:23 -08:00