[ROCm] [Bugfix] Fix torch sdpa hallucination (#30789)

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com> (cherry picked from commit 2410132bb1)
[CI] Skip ci failure test (#30804)
2025-12-16 17:16:25 -08:00 · 2025-12-16 17:16:08 -08:00 · 2025-12-16 17:15:49 -08:00 · 2025-12-16 17:15:26 -08:00 · 2025-12-16 17:13:54 -08:00 · 2025-12-16 17:13:23 -08:00
2573 changed files with 96003 additions and 290573 deletions
--- a/.buildkite/.pipeline_gen_v2
+++ b/.buildkite/.pipeline_gen_v2
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -1,8 +1,7 @@
 name: vllm_ci
 job_dirs:
-  - ".buildkite/image_build"
  - ".buildkite/test_areas"
-  - ".buildkite/hardware_tests"
+  - ".buildkite/image_build"
 run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -1,30 +0,0 @@
-group: Hardware - AMD Build 
-steps:
-  - label: "AMD: :docker: build image"
-    key: image-build-amd
-    depends_on: []
-    device: amd_cpu
-    no_plugin: true
-    commands:
-    - >
-      docker build
-      --build-arg max_jobs=16
-      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
-      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
-      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-      -f docker/Dockerfile.rocm
-      --target test
-      --no-cache
-      --progress plain .
-    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 1
-        - exit_status: -10  # Agent was lost
-          limit: 1
-        - exit_status: 1  # Machine occasionally fail
-          limit: 1
--- a/.buildkite/hardware_tests/ascend_npu.yaml
+++ b/.buildkite/hardware_tests/ascend_npu.yaml
@@ -1,10 +0,0 @@
-group: Hardware
-depends_on: ~
-steps:
-  - label: "Ascend NPU Test"
-    soft_fail: true
-    timeout_in_minutes: 20
-    no_plugin: true
-    device: ascend_npu
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -1,100 +0,0 @@
-group: CPU
-depends_on: []
-steps:
- label: CPU-Kernel Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - cmake/cpu_extension.cmake
-  - CMakeLists.txt
-  - vllm/_custom_ops.py
-  - tests/kernels/attention/test_cpu_attn.py
-  - tests/kernels/moe/test_cpu_fused_moe.py
-  - tests/kernels/test_onednn.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-      pytest -x -v -s tests/kernels/test_onednn.py"
-
- label: CPU-Language Generation and Pooling Model Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - vllm/
-  - tests/models/language/generation/
-  - tests/models/language/pooling/
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
-      pytest -x -v -s tests/models/language/generation -m cpu_model
-      pytest -x -v -s tests/models/language/pooling -m cpu_model"
-
- label: CPU-Quantization Model Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - vllm/model_executor/layers/quantization/cpu_wna16.py
-  - vllm/model_executor/layers/quantization/gptq_marlin.py
-  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
-  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
-  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
-  - tests/quantization/test_compressed_tensors.py
-  - tests/quantization/test_cpu_wna16.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
-      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
-      
- label: CPU-Distributed Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/shm.cpp
-  - vllm/v1/worker/cpu_worker.py
-  - vllm/v1/worker/gpu_worker.py
-  - vllm/v1/worker/cpu_model_runner.py
-  - vllm/v1/worker/gpu_model_runner.py
-  - vllm/platforms/cpu.py
-  - vllm/distributed/parallel_state.py
-  - vllm/distributed/device_communicators/cpu_communicator.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
-
- label: CPU-Multi-Modal Model Tests %N
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  # - vllm/
-  - vllm/model_executor/layers/rotary_embedding
-  - tests/models/multimodal/generation/
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
-      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
-  parallelism: 2
-
- label: "Arm CPU Test"
-  depends_on: []
-  soft_fail: true
-  device: arm_cpu
-  no_plugin: true
-  commands: 
-  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/gh200.yaml
+++ b/.buildkite/hardware_tests/gh200.yaml
@@ -1,10 +0,0 @@
-group: Hardware
-steps:
-  - label: "GH200 Test"
-    soft_fail: true
-    device: gh200
-    no_plugin: true
-    optional: true
-    commands: 
-    - nvidia-smi 
-    - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -1,17 +0,0 @@
-group: Hardware
-depends_on: ~
-steps:
-  - label: "Intel HPU Test"
-    soft_fail: true
-    device: intel_hpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
-
-  - label: "Intel GPU Test"
-    depends_on: []
-    soft_fail: true
-    device: intel_gpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -1,255 +1,56 @@
 #!/bin/bash
-set -euo pipefail
+set -e

-# replace invalid characters in Docker image tags and truncate to 128 chars
-clean_docker_tag() {
-    local input="$1"
-    echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
-}
-
-print_usage_and_exit() {
-    echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
-    exit 1
-}
-
-print_instance_info() {
-    echo ""
-    echo "=== Debug: Instance Information ==="
-    # Get IMDSv2 token
-    if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
-        AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
-        INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
-        INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
-        AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
-        echo "AMI ID:        ${AMI_ID}"
-        echo "Instance Type: ${INSTANCE_TYPE}"
-        echo "Instance ID:   ${INSTANCE_ID}"
-        echo "AZ:            ${AZ}"
-    else
-        echo "Not running on EC2 or IMDS not available"
-    fi
-    # Check for warm cache AMI (marker file baked into custom AMI)
-    if [[ -f /etc/vllm-ami-info ]]; then
-        echo "Cache:         warm (custom vLLM AMI)"
-        cat /etc/vllm-ami-info
-    else
-        echo "Cache:         cold (standard AMI)"
-    fi
-    echo "==================================="
-    echo ""
-}
-
-setup_buildx_builder() {
-    echo "--- :buildkite: Setting up buildx builder"
-    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
-        # Custom AMI with standalone buildkitd - use remote driver for warm cache
-        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
-        echo "Using remote driver to connect to buildkitd (warm cache available)"
-        if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
-            echo "Using existing baked-vllm-builder"
-            docker buildx use baked-vllm-builder
-        else
-            echo "Creating baked-vllm-builder with remote driver"
-            docker buildx create \
-                --name baked-vllm-builder \
-                --driver remote \
-                --use \
-                "unix://${BUILDKIT_SOCKET}"
-        fi
-        docker buildx inspect --bootstrap
-    elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
-        # Existing builder available
-        echo "Using existing builder: ${BUILDER_NAME}"
-        docker buildx use "${BUILDER_NAME}"
-        docker buildx inspect --bootstrap
-    else
-        # No local buildkitd, no existing builder - create new docker-container builder
-        echo "No local buildkitd found, using docker-container driver"
-        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
-        docker buildx inspect --bootstrap
-    fi
-
-    # builder info
-    echo "Active builder:"
-    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
-}
-
-check_and_skip_if_image_exists() {
-    if [[ -n "${IMAGE_TAG:-}" ]]; then
-        echo "--- :mag: Checking if image exists"
-        if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
-            echo "Image already exists: ${IMAGE_TAG}"
-            echo "Skipping build"
-            exit 0
-        fi
-        echo "Image not found, proceeding with build"
-    fi
-}
-
-ecr_login() {
-    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
-}
-
-prepare_cache_tags() {
-    # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
-    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
-    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
-
-    if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-            cache="${MAIN_CACHE_ECR}:latest"
-        else
-            clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
-            cache="${TEST_CACHE_ECR}:${clean_branch}"
-        fi
-        CACHE_TO="$cache"
-        CACHE_FROM="$cache"
-        CACHE_FROM_BASE_BRANCH="$cache"
-    else
-        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
-        CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
-        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
-            CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
-        else
-            clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
-            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
-        fi
-    fi
-
-    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
-    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
-}
-
-resolve_parent_commit() {
-    if [[ -z "${PARENT_COMMIT:-}" ]]; then
-        PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
-        if [[ -n "${PARENT_COMMIT}" ]]; then
-            echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
-            export PARENT_COMMIT
-        else
-            echo "Could not determine parent commit (may be first commit in repo)"
-        fi
-    else
-        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
-    fi
-}
-
-print_bake_config() {
-    echo "--- :page_facing_up: Resolved bake configuration"
-    # Write to a temp directory to avoid polluting the repo root (which is the
-    # Docker build context). Files left in the repo root get COPY'd into the
-    # image and can cause duplicate artifact uploads from downstream steps.
-    local bake_tmp
-    bake_tmp="$(mktemp -d)"
-    BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
-    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
-    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
-    echo "--- :arrow_down: Uploading bake config to Buildkite"
-    (cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
-}
-
-#################################
-#         Main Script           #
-#################################
-print_instance_info
-
-if [[ $# -lt 5 ]]; then
-    print_usage_and_exit
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+  exit 1
 fi

-# input args
 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
-IMAGE_TAG=$5
-IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8

-# build config
-TARGET="test-ci"
-VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
-BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
-CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
-CI_HCL_PATH="/tmp/ci.hcl"
-BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

-prepare_cache_tags
-ecr_login
+# docker buildx 
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls

-# Environment info (for docs and human readers)
-#   VLLM_CI_BRANCH      - ci-infra branch to use (default: main)
-#   VLLM_BAKE_FILE_PATH      - Path to vLLM's bake file (default: docker/docker-bake.hcl)
-#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
-#
-# Build configuration (exported as environment variables for bake):
-export BUILDKITE_COMMIT
-export PARENT_COMMIT
-export IMAGE_TAG
-export IMAGE_TAG_LATEST
-export CACHE_FROM
-export CACHE_FROM_BASE_BRANCH
-export CACHE_FROM_MAIN
-export CACHE_TO
-
-# print args
-echo "--- :mag: Arguments"
-echo "REGISTRY: ${REGISTRY}"
-echo "REPO: ${REPO}"
-echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
-echo "BRANCH: ${BRANCH}"
-echo "IMAGE_TAG: ${IMAGE_TAG}"
-echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
-
-# print build configuration
-echo "--- :mag: Build configuration"
-echo "TARGET: ${TARGET}"
-echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
-echo "BUILDER_NAME: ${BUILDER_NAME}"
-echo "CI_HCL_URL: ${CI_HCL_URL}"
-echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
-
-echo "--- :mag: Cache tags"
-echo "CACHE_TO: ${CACHE_TO}"
-echo "CACHE_FROM: ${CACHE_FROM}"
-echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
-echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
-
-check_and_skip_if_image_exists
-
-echo "--- :docker: Setting up Docker buildx bake"
-echo "Target: ${TARGET}"
-echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
-echo "CI HCL path: ${CI_HCL_PATH}"
-
-if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
-    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
-    echo "Make sure you're running from the vLLM repository root"
-    exit 1
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
 fi

-echo "--- :arrow_down: Downloading ci.hcl"
-curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
-echo "Downloaded to ${CI_HCL_PATH}"
-
-if [[ ! -f "${CI_HCL_PATH}" ]]; then
-    echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
-    exit 1
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+  merge_base_commit_build_args=""
 fi

-setup_buildx_builder
-
-resolve_parent_commit
-export PARENT_COMMIT
-
-print_bake_config
-
-echo "--- :docker: Building ${TARGET}"
-docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
-
-echo "--- :white_check_mark: Build complete"
+# build
+docker buildx build --file docker/Dockerfile \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  ${merge_base_commit_build_args} \
+  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+  --cache-to type=registry,ref=${CACHE_TO},mode=max \
+  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+  --push \
+  --target test \
+  --progress plain .
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -3,9 +3,8 @@ steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
-    timeout_in_minutes: 600
    commands:
-    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
+    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
@@ -41,7 +40,7 @@ steps:
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
-
+  
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,13 +24,13 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
  --build-arg VLLM_CPU_AVX512BF16=true \
  --build-arg VLLM_CPU_AVX512VNNI=true \
  --build-arg VLLM_CPU_AMXBF16=true \
-  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
  --target vllm-test \
  --progress plain .

 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,10 +24,10 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
  --target vllm-test \
  --progress plain .

 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -25,10 +25,10 @@ fi
 docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git

 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -1,15 +0,0 @@
-model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.695
-  - name: "exact_match,flexible-extract"
-    value: 0.447
-limit: 1319
-num_fewshot: 5
-max_model_len: 262144
-enforce_eager: false
-apply_chat_template: true
-fewshot_as_multiturn: true
-trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
@@ -1,19 +0,0 @@
-model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.7142
-  - name: "exact_match,flexible-extract"
-    value: 0.4579
-env_vars:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
-limit: 1319
-num_fewshot: 5
-max_model_len: 262144
-kv_cache_dtype: fp8
-enforce_eager: false
-apply_chat_template: true
-fewshot_as_multiturn: true
-trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -1,2 +1 @@
 Qwen3-235B-A22B-Instruct-2507-FP8.yaml
-NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
@@ -1,5 +0,0 @@
-Qwen2.5-1.5B-Instruct.yaml
-Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install lm-eval==0.4.9

 usage() {
    echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
-  --limit "$LIMIT"
+  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.11"
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

 usage() {
    echo``
@@ -20,11 +20,14 @@ usage() {
    echo
 }

-while getopts "m:l:f:t:" OPT; do
+while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
    l )
        LIMIT="$OPTARG"
        ;;
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -60,7 +60,6 @@ def launch_lm_eval(eval_config, tp_size):
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"
-        "allow_deprecated_quantization=True,"
    )

    env_vars = eval_config.get("env_vars", None)
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.

 **Benchmarking Duration**: about 1hr.

@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

 Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
@@ -34,9 +34,8 @@ Runtime environment variables:

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
-> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
-
+For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+>
 ### Latency test

 Here is an example of one test inside `latency-tests.json`:
@@ -176,6 +175,19 @@ If you do not see the table, please wait till the benchmark finish running.
 The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.

-#### Performance Results Comparison  
+The `compare-json-results.py` script compares benchmark-result JSON files produced by `convert-results-json-to-markdown.py`.
+When run, the benchmark script writes its results to the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
+`compare-json-results.py` compares two `benchmark_results.json` files and reports performance ratios, e.g. for Output Tput, Median TTFT and Median TPOT.  
+If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations within that file.

-Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
+Here is an example that uses the script to compare results_a and results_b by model, dataset name, input/output length, max concurrency and QPS.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
+
+|   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
+|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
+| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 |
+| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 |
+
+A comparison diagram will be generated below the table.
+Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 with 128c/results_gnr_128c_091_tp2pp3:
+<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -393,7 +393,7 @@ if __name__ == "__main__":
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/performance-benchmarks/"
-            "performance-benchmarks-descriptions.md"
+            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+
+# This script should be run inside the CI process
 # This script assumes that we are already inside the vllm/ directory
 # Benchmarking results will be available inside vllm/benchmarks/results/

@@ -7,19 +9,14 @@
 set -x
 set -o pipefail

-# Environment-driven debug controls (like ON_CPU=1)
-DRY_RUN="${DRY_RUN:-0}"
-MODEL_FILTER="${MODEL_FILTER:-}"
-DTYPE_FILTER="${DTYPE_FILTER:-}"
-
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
+    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  elif command -v hl-smi; then
-    declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
@@ -28,9 +25,9 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-
+  
  declare -g arch_suffix=''
-
+  
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
@@ -47,16 +44,12 @@ check_cpus() {
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
-    echo "$numa_count"
+    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
-  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
-    declare -g gpu_type="arm64-cpu"
-  else
-    declare -g gpu_type="cpu"
-  fi
+  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
 }

@@ -115,12 +108,13 @@ json2envs() {
 }

 wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
  local timeout_val="1200"
  timeout "$timeout_val" bash -c '
-    until curl -sf http://localhost:8000/v1/models >/dev/null; do
+    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
-    done
-  '
+    done' && return 0 || return 1
 }

 kill_processes_launched_by_current_bash() {
@@ -183,20 +177,19 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

-run_benchmark_tests() {
-  # run benchmark tests using `vllm bench <test_type>` command
-  # $1: test type (latency or throughput)
-  # $2: a json file specifying test cases
+run_latency_tests() {
+  # run latency tests using `vllm bench latency` command
+  # $1: a json file specifying latency test cases

-  local test_type=$1
-  local test_file=$2
+  local latency_test_file
+  latency_test_file=$1

-  # Iterate over tests
-  jq -c '.[]' "$test_file" | while read -r params; do
+  # Iterate over latency tests
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
-      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

@@ -207,15 +200,15 @@ run_benchmark_tests() {
    fi

    # get arguments
-    bench_params=$(echo "$params" | jq -r '.parameters')
-    bench_args=$(json2args "$bench_params")
-    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    bench_envs=$(json2envs "$bench_environment_variables")
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
-    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
-    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
+    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    if [ "$ON_CPU" == "1" ]; then
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -228,42 +221,118 @@ run_benchmark_tests() {
      fi
    fi

-    bench_command=" $bench_envs vllm bench $test_type \
+    latency_command=" $latency_envs vllm bench latency \
      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $bench_args"
+      $latency_args"

    echo "Running test case $test_name"
-    echo "${test_type^} command: $bench_command"
+    echo "Latency command: $latency_command"

-    # recording benchmarking command and GPU command
+    # recording benchmarking command and GPU command
    jq_output=$(jq -n \
-      --arg command "$bench_command" \
+      --arg latency "$latency_command" \
      --arg gpu "$gpu_type" \
-      --arg test_type "$test_type" \
      '{
-        ($test_type + "_command"): $command,
+        latency_command: $latency,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
-    eval "$bench_command"
+    eval "$latency_command"

    kill_gpu_processes

  done
 }

-run_latency_tests() { run_benchmark_tests "latency" "$1"; }
-run_startup_tests() { run_benchmark_tests "startup" "$1"; }
-run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
+run_throughput_tests() {
+  # run throughput tests using `vllm bench throughput`
+  # $1: a json file specifying throughput test cases

-merge_serving_tests_stream() {
-  # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
-  # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
-  local serving_test_file="$1"
-  # shellcheck disable=SC2016
-  local merged='
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In throughput-test.json, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    throughput_envs=$(json2envs "$throughput_environment_variables")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+    if [ "$ON_CPU" == "1" ]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+      world_size=$(($tp*$pp))
+      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        continue
+      fi
+    else
+      if [[ $gpu_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+        continue
+      fi
+    fi
+
+    throughput_command=" $throughput_envs vllm bench throughput \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+    # recording benchmarking command and GPU command
+    jq_output=$(jq -n \
+      --arg command "$throughput_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        throughput_command: $command,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `vllm bench serve` command
+  # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '
    if type == "array" then
      # Plain format: test cases array
      .[]
@@ -285,50 +354,7 @@ merge_serving_tests_stream() {
    else
      error("Unsupported serving test file format: must be array or object with .tests")
    end
-  '
-
-  jq -c "$merged" "$serving_test_file" | \
-  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
-    jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
-      select((($model|length)==0)
-             or ((.server_parameters.model // "") == $model)
-             or ((.client_parameters.model // "") == $model))
-      | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
-    '
-  else
-    cat
-  fi
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-  #
-  # Supported JSON formats:
-  # 1) Plain format: top-level array
-  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #
-  # 2) Default parameters field + plain format tests
-  #    {
-  #      "defaults": { ... },
-  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #    }
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # In dry-run mode, if filters are provided but no tests match, fail fast.
-  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
-    local count
-    count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
-    if [[ "$count" -eq 0 ]]; then
-      echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
-      return 0
-    fi
-  fi
-
-  # Iterate over serving tests (merged + optional filtered stream)
-  merge_serving_tests_stream "$serving_test_file" | while read -r params; do
+  ' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -367,8 +393,8 @@ run_serving_tests() {

    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
+    if [ "$ON_CPU" == "1" ]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -397,7 +423,7 @@ run_serving_tests() {
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
-    if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
+    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
@@ -408,9 +434,6 @@ run_serving_tests() {
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
-    elif [[ "${DRY_RUN:-0}" == "1" ]]; then
-        # dry-run: don't start server
-        echo "Dry Run."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
@@ -420,39 +443,34 @@ run_serving_tests() {
      fi
    fi

-    # save the compilation mode and optimization level on the serving results
-    # whenever they are set
-    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
-    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
-
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
        qps="inf"
+        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
-        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
+        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
-        # pass the tensor parallel size, the compilation mode, and the optimization
-        # level to the client so that they can be used on the benchmark dashboard
+        # pass the tensor parallel size to the client so that it can be displayed
+        # on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
-          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
+          --metadata "tensor_parallel_size=$tp" \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

-        if [[ "${DRY_RUN:-0}" != "1" ]]; then
-          bash -c "$client_command"
-        fi
+        bash -c "$client_command"

        # record the benchmarking commands
        jq_output=$(jq -n \
@@ -470,31 +488,22 @@ run_serving_tests() {
    done

    # clean up
-    if [[ "${DRY_RUN:-0}" != "1" ]]; then
-      kill -9 "$server_pid"
-      kill_gpu_processes
-    fi
+    kill -9 $server_pid
+    kill_gpu_processes
  done
 }

 main() {
-
  local ARCH
  ARCH=''
-  if [[ "$ON_CPU" == "1" ]]; then
-    check_cpus
-    ARCH="-$gpu_type"
+  if [ "$ON_CPU" == "1" ];then
+     check_cpus
+     ARCH='-cpu'
  else
     check_gpus
     ARCH="$arch_suffix"
  fi
-
-  # DRY_RUN does not execute vLLM; do not require HF_TOKEN.
-  if [[ "${DRY_RUN:-0}" != "1" ]]; then
-    check_hf_token
-  else
-    echo "DRY_RUN=1 -> skip HF_TOKEN validation"
-  fi
+  check_hf_token

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -515,18 +524,12 @@ main() {

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
+
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
-
-  if [[ "${DRY_RUN:-0}" == "1" ]]; then
-    echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
-    exit 0
-  fi
-
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
-  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
--- a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
@@ -1,26 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -51,56 +51,5 @@
            "max-model-len": 256,
            "async-scheduling": ""
        }
-    },
-    {
-        "test_name": "latency_deepseek_r1",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "tensor_parallel_size": 8,
-            "load_format": "dummy",
-            "max-model-len": 2048,
-            "dtype": "bfloat16"
-        }
-    },
-    {
-        "test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "tensor_parallel_size": 8,
-            "max-model-len": 512,
-            "max-num-seqs": 128,
-            "async-scheduling": "",
-            "gpu-memory-utilization": 0.95,
-            "enable_expert_parallel": ""
-        }
-    },
-    {
-        "test_name": "latency_qwen3_8b",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "Qwen/Qwen3-8B",
-            "tensor_parallel_size": 1,
-            "max-model-len": 2048,
-            "max-num-seqs": 128,
-            "dtype": "bfloat16",
-            "async-scheduling": ""
-        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
@@ -1,130 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [
-      12,
-      16,
-      24,
-      32,
-      64,
-      128,
-      200
-    ],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "distributed_executor_backend": "mp",
-      "block_size": 128,
-      "trust_remote_code": "",
-      "disable_log_stats": "",
-      "enforce_eager": "",
-      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256,
-      "load_format": "dummy"
-    },
-    "client_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "backend": "vllm",
-      "ignore-eos": "",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama8B_tp1_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
@@ -1,41 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [
-      32,
-      64,
-      128
-    ],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "dtype": "bfloat16",
-      "model": "jinaai/jina-embeddings-v3",
-      "trust_remote_code": ""
-    },
-    "client_parameters": {
-      "model": "jinaai/jina-embeddings-v3",
-      "backend": "openai-embeddings",
-      "endpoint": "/v1/embeddings",
-      "dataset_name": "sharegpt",
-      "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_jina_embed_v3_tp1_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {}
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -1,283 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "distributed_executor_backend": "mp",
-      "block_size": 128,
-      "trust_remote_code": "",
-      "disable_log_stats": "",
-      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256
-    },
-    "client_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "backend": "vllm",
-      "ignore-eos": "",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama8B_tp1_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp1_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp2_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp4_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama3B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_granite2B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen1.7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen4B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen8B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_glm9B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_gemma7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "google/gemma-7b",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "google/gemma-7b",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -19,8 +19,10 @@
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
+      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256
+      "max_num_seqs": 256,
+      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
@@ -148,6 +150,97 @@
        "random-input-len": 2048,
        "random-output-len": 128
      }
+    },
+    {
+      "test_name": "serving_llama3B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_granite2B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen1.7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen4B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen8B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_glm9B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_gemma7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "google/gemma-7b",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "google/gemma-7b",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
    }
  ]
 }
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -78,84 +78,5 @@
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
-    },
-    {
-        "test_name": "serving_deepseek_r1",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "tensor_parallel_size": 8,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy",
-            "max-model-len": 2048,
-            "max-num-seqs": 200,
-            "async-scheduling": "",
-            "dtype": "bfloat16"
-        },
-        "client_parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "tensor_parallel_size": 8,
-            "disable_log_stats": "",
-            "max-model-len": 2048,
-            "max-num-seqs": 128,
-            "async-scheduling": "",
-            "enable_expert_parallel": "",
-            "max-num-batched-tokens": 4096
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_qwen3_8b",
-        "qps_list": [1, 4, 10, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "Qwen/Qwen-3-8B",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "disable_log_stats": "",
-            "async-scheduling": ""
-        },
-        "client_parameters": {
-            "model": "Qwen/Qwen-3-8B",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
@@ -1,27 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -57,67 +57,5 @@
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
-    },
-    {
-        "test_name": "throughput_deepseek_r1",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "deepseek-ai/DeepSeek-R1",
-            "tensor_parallel_size": 8,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "dataset_name": "sharegpt",
-            "num_prompts": 1000,
-            "backend": "vllm",
-            "max-model-len": 2048,
-            "max-num-seqs": 384,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-            "tensor_parallel_size": 8,
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "dataset_name": "sharegpt",
-            "num_prompts": 1000,
-            "backend": "vllm",
-            "max-model-len": 2048,
-            "max-num-seqs": 512,
-            "async-scheduling": "",
-            "enable_expert_parallel": ""
-        }
-    },
-    {
-        "test_name": "throughput_qwen3_8b",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "Qwen/Qwen-3-8B",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "dataset_name": "sharegpt",
-            "num_prompts": 1000,
-            "max-num-seqs": 512,
-            "backend": "vllm",
-            "async-scheduling": ""
-        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,713 +1,198 @@
 steps:
+  # aarch64 + CUDA builds
+  - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
+    id: build-wheel-arm64-cuda-12-9
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # NOTE: torch_cuda_arch_list is derived from the upstream PyTorch aarch64 CI build script:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build arm64 wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # NOTE: torch_cuda_arch_list is derived from the upstream PyTorch aarch64 CI build script:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # aarch64 CPU wheel build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
+  - label: "Build wheel - CUDA 12.9"
+    depends_on: ~
+    id: build-wheel-cuda-12-9
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 CPU wheel build
+  - label: "Build x86 CPU wheel"
+    depends_on: ~
+    id: build-wheel-x86-cpu
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (CUDA 12.9)
+  - label: "Build release image (x86)"
+    depends_on: ~
+    id: build-release-image-x86
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+      # Also push this image under the default (arch-less) commit tag, in case the arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Build release image (arm64)"
+    depends_on: ~
+    id: build-release-image-arm64
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+  # Combine the x86_64 and aarch64 release images into a single multi-arch manifest
+  - label: "Create multi-arch manifest"
+    depends_on:
+      - build-release-image-x86
+      - build-release-image-arm64
+    id: create-multi-arch-manifest
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Annotate release workflow"
+    depends_on:
+      - create-multi-arch-manifest
+    id: annotate-release-workflow
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-release.sh"
+
  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version

-  - group: "Build Python wheels"
-    key: "build-wheels"
-    steps:
-      - label: "Build wheel - aarch64 - CUDA 12.9"
-        depends_on: ~
-        id: build-wheel-arm64-cuda-12-9
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - aarch64 - CUDA 13.0"
-        depends_on: ~
-        id: build-wheel-arm64-cuda-13-0
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - aarch64 - CPU"
-        depends_on: ~
-        id: build-wheel-arm64-cpu
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - x86_64 - CUDA 12.9"
-        depends_on: ~
-        id: build-wheel-x86-cuda-12-9
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - x86_64 - CUDA 13.0"
-        depends_on: ~
-        id: build-wheel-x86-cuda-13-0
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - x86_64 - CPU"
-        depends_on: ~
-        id: build-wheel-x86-cpu
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-  - group: "Build release Docker images"
-    key: "build-release-images"
-    steps:
-      - label: "Build release image - x86_64 - CUDA 12.9"
-        depends_on: ~
-        id: build-release-image-x86
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-          # re-tag to default image tag and push, just in case arm64 build fails
-          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-      - label: "Build release image - aarch64 - CUDA 12.9"
-        depends_on: ~
-        id: build-release-image-arm64
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-
-      - label: "Build release image - x86_64 - CUDA 13.0"
-        depends_on: ~
-        id: build-release-image-x86-cuda-13-0
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
-          # re-tag to default image tag and push, just in case arm64 build fails
-          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
-      - label: "Build release image - aarch64 - CUDA 13.0"
-        depends_on: ~
-        id: build-release-image-arm64-cuda-13-0
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
-
-      - block: "Build release image for x86_64 CPU"
-        key: block-cpu-release-image-build
-        depends_on: ~
-
-      - label: "Build release image - x86_64 - CPU"
-        depends_on:
-          - block-cpu-release-image-build
-          - input-release-version
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - block: "Build release image for arm64 CPU"
-        key: block-arm64-cpu-release-image-build
-        depends_on: ~
-
-      - label: "Build release image - arm64 - CPU"
-        depends_on: 
-          - block-arm64-cpu-release-image-build
-          - input-release-version
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-  - group: "Publish release images"
-    key: "publish-release-images"
-    steps:
-      - label: "Create multi-arch manifest - CUDA 12.9"
-        depends_on:
-          - build-release-image-x86
-          - build-release-image-arm64
-        id: create-multi-arch-manifest
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
-          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-      - label: "Annotate release workflow - CUDA 12.9"
-        depends_on:
-          - create-multi-arch-manifest
-        id: annotate-release-workflow
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/annotate-release.sh"
-
-      - label: "Create multi-arch manifest - CUDA 13.0"
-        depends_on:
-          - build-release-image-x86-cuda-13-0
-          - build-release-image-arm64-cuda-13-0
-        id: create-multi-arch-manifest-cuda-13-0
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
-          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
-      - label: "Publish nightly multi-arch image to DockerHub"
-        depends_on:
-          - create-multi-arch-manifest
-        if: build.env("NIGHTLY") == "1"
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/push-nightly-builds.sh"
-          # Clean up old nightly builds (keep only last 14)
-          - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
-        plugins:
-          - docker-login#v3.0.0:
-              username: vllmbot
-              password-env: DOCKERHUB_TOKEN
-        env:
-          DOCKER_BUILDKIT: "1"
-          DOCKERHUB_USERNAME: "vllmbot"
-
-      - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
-        depends_on:
-          - create-multi-arch-manifest-cuda-13-0
-        if: build.env("NIGHTLY") == "1"
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
-          # Clean up old nightly builds (keep only last 14)
-          - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
-        plugins:
-          - docker-login#v3.0.0:
-              username: vllmbot
-              password-env: DOCKERHUB_TOKEN
-        env:
-          DOCKER_BUILDKIT: "1"
-          DOCKERHUB_USERNAME: "vllmbot"
-
-  - group: "Publish wheels"
-    key: "publish-wheels"
-    steps:
-      - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
-        key: block-upload-release-wheels
-        depends_on:
-          - input-release-version
-          - build-wheels
-
-      - label: "Upload release wheels to PyPI"
-        depends_on:
-          - block-upload-release-wheels
-        id: upload-release-wheels
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
-
-  # =============================================================================
-  # ROCm Release Pipeline (x86_64 only)
-  # =============================================================================
-  #
-  # vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
-  # To build a specific version, trigger the build from that branch/tag.
-  #
-  # Environment variables for ROCm builds (set via Buildkite UI or schedule):
-  #   ROCM_PYTHON_VERSION: Python version (default: 3.12)
-  #   PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
-  #   ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
-  #   ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
-  #
-  # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
-  #       (currently rocm/dev-ubuntu-22.04:7.1-complete)
-  #
-  # =============================================================================
-
-  # ROCm Input Step - Collect build configuration (manual trigger only)
-  - input: "ROCm Wheel Release Build Configuration"
-    key: input-rocm-config
+  - block: "Build CPU release image"
+    key: block-cpu-release-image-build
    depends_on: ~
-    if: build.source == "ui"
-    fields:
-      - text: "Python Version"
-        key: "rocm-python-version"
-        default: "3.12"
-        hint: "Python version (e.g., 3.12)"
-      - text: "GPU Architectures"
-        key: "rocm-pytorch-rocm-arch"
-        default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
-        hint: "Semicolon-separated GPU architectures"
-      - select: "Upload Wheels to S3"
-        key: "rocm-upload-wheels"
-        default: "true"
-        options:
-          - label: "No - Build only (nightly/dev)"
-            value: "false"
-          - label: "Yes - Upload to S3 (release)"
-            value: "true"
-      - select: "Force Rebuild Base Wheels"
-        key: "rocm-force-rebuild"
-        default: "false"
-        hint: "Ignore S3 cache and rebuild base wheels from scratch"
-        options:
-          - label: "No - Use cached wheels if available"
-            value: "false"
-          - label: "Yes - Rebuild even if cache exists"
-            value: "true"

-  # ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
-  - label: ":rocm: Build ROCm Base Wheels"
-    id: build-rocm-base-wheels
-    depends_on:
-      - step: input-rocm-config
-        allow_failure: true  # Allow failure so non-UI builds can proceed (input step is skipped)
+  - label: "Build and publish CPU release image"
+    depends_on: block-cpu-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
-      # Set configuration and check cache
-      - |
-        set -euo pipefail
-
-        # Get values from meta-data (set by input step) or use defaults
-        PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
-        export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
-
-        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
-        export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
-
-        # Check for force rebuild flag
-        ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
-        if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
-          ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
-        fi
-
-        echo "========================================"
-        echo "ROCm Base Wheels Build Configuration"
-        echo "========================================"
-        echo "  PYTHON_VERSION: $${PYTHON_VERSION}"
-        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
-        echo "  ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
-        echo "========================================"
-
-        # Save resolved config for later jobs
-        buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
-        buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
-
-        # Check S3 cache for pre-built wheels
-        CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
-        CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
-        echo ""
-        echo "Cache key: $${CACHE_KEY}"
-        echo "Cache path: $${CACHE_PATH}"
-
-        # Save cache key for downstream jobs
-        buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
-
-        CACHE_STATUS="miss"
-        if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
-          CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
-        else
-          echo "Force rebuild requested, skipping cache check"
-        fi
-
-        if [ "$${CACHE_STATUS}" = "hit" ]; then
-          echo ""
-          echo "CACHE HIT! Downloading pre-built wheels..."
-          echo ""
-          .buildkite/scripts/cache-rocm-base-wheels.sh download
-
-          # Set the S3 path for the cached Docker image (for Job 2 to download)
-          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
-          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-
-          # Mark that we used cache (for Docker image handling)
-          buildkite-agent meta-data set "rocm-used-cache" "true"
-
-          echo ""
-          echo "Cache download complete. Skipping Docker build."
-          echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-        else
-          echo ""
-          echo "CACHE MISS. Building from scratch..."
-          echo ""
-
-          # Build full base image (for later vLLM build)
-          DOCKER_BUILDKIT=1 docker buildx build \
-            --file docker/Dockerfile.rocm_base \
-            --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
-            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
-            --build-arg USE_SCCACHE=1 \
-            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-            --build-arg SCCACHE_REGION_NAME=us-west-2 \
-            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-            --load \
-            .
-
-          # Build debs_wheel_release stage for wheel extraction
-          DOCKER_BUILDKIT=1 docker buildx build \
-            --file docker/Dockerfile.rocm_base \
-            --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
-            --target debs_wheel_release \
-            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
-            --build-arg USE_SCCACHE=1 \
-            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-            --build-arg SCCACHE_REGION_NAME=us-west-2 \
-            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-            --load \
-            .
-
-          # Extract wheels from Docker image
-          mkdir -p artifacts/rocm-base-wheels
-          container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
-          docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
-          docker rm $${container_id}
-          echo "Extracted base wheels:"
-          ls -lh artifacts/rocm-base-wheels/
-
-          # Upload wheels to S3 cache for future builds
-          echo ""
-          echo "Uploading wheels to S3 cache..."
-          .buildkite/scripts/cache-rocm-base-wheels.sh upload
-
-          # Export base Docker image for reuse in vLLM build
-          mkdir -p artifacts/rocm-docker-image
-          docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
-          echo "Docker image size:"
-          ls -lh artifacts/rocm-docker-image/
-
-          # Upload large Docker image to S3 (also cached by cache key)
-          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
-          echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
-          aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-
-          # Save the S3 path for downstream jobs
-          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-
-          # Mark that we did NOT use cache
-          buildkite-agent meta-data set "rocm-used-cache" "false"
-
-          echo ""
-          echo "Build complete. Wheels cached for future builds."
-        fi
-    artifact_paths:
-      - "artifacts/rocm-base-wheels/*.whl"
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"

-  # ROCm Job 2: Build vLLM ROCm Wheel
-  - label: ":python: Build vLLM ROCm Wheel - x86_64"
-    id: build-rocm-vllm-wheel
-    depends_on:
-      - step: build-rocm-base-wheels
-        allow_failure: false
+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
    agents:
-      queue: cpu_queue_postmerge
-    timeout_in_minutes: 180
+      queue: arm64_cpu_queue_postmerge
    commands:
-      # Download artifacts and prepare Docker image
-      - |
-        set -euo pipefail
-
-        # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
-        # This fixes version detection when tags are moved/force-pushed
-        echo "Fetching latest tags from origin..."
-        git fetch --tags --force origin
-        
-        # Log tag information for debugging version detection
-        echo "========================================"
-        echo "Git Tag Verification"
-        echo "========================================"
-        echo "Current HEAD: $(git rev-parse HEAD)"
-        echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
-        echo ""
-        echo "Recent tags (pointing to commits near HEAD):"
-        git tag -l --sort=-creatordate | head -5
-        echo "setuptools_scm version detection:"
-        pip install -q setuptools_scm 2>/dev/null || true
-        python3 -c "import setuptools_scm; print('  Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo "  (setuptools_scm not available in this environment)"
-        echo "========================================"
-
-        # Download wheel artifacts from current build
-        echo "Downloading wheel artifacts from current build"
-        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
-
-        # Download Docker image from S3 (too large for Buildkite artifacts)
-        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
-        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
-          echo "ERROR: rocm-docker-image-s3-path metadata not found"
-          echo "This should have been set by the build-rocm-base-wheels job"
-          exit 1
-        fi
-        echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
-        mkdir -p artifacts/rocm-docker-image
-        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
-
-        # Load base Docker image and capture the tag
-        echo "Loading base Docker image..."
-        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
-        echo "$${LOAD_OUTPUT}"
-        # Extract the actual loaded image tag from "Loaded image: <tag>" output
-        # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
-        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
-        if [ -z "$${BASE_IMAGE_TAG}" ]; then
-          echo "ERROR: Failed to extract image tag from docker load output"
-          echo "Load output was: $${LOAD_OUTPUT}"
-          exit 1
-        fi
-        echo "Loaded base image: $${BASE_IMAGE_TAG}"
-
-        # Prepare base wheels for Docker build context
-        mkdir -p docker/context/base-wheels
-        touch docker/context/base-wheels/.keep
-        cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
-        echo "Base wheels for vLLM build:"
-        ls -lh docker/context/base-wheels/
-
-        # Get GPU architectures from meta-data
-        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
-        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
-
-        echo "========================================"
-        echo "Building vLLM wheel with:"
-        echo "  BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
-        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
-        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
-        echo "  BASE_IMAGE: $${BASE_IMAGE_TAG}"
-        echo "========================================"
-
-        # Build vLLM wheel using local checkout (REMOTE_VLLM=0)
-        DOCKER_BUILDKIT=1 docker build \
-          --file docker/Dockerfile.rocm \
-          --target export_vllm_wheel_release \
-          --output type=local,dest=rocm-dist \
-          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
-          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-          --build-arg REMOTE_VLLM=0 \
-          --build-arg GIT_REPO_CHECK=1 \
-          --build-arg USE_SCCACHE=1 \
-          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-          --build-arg SCCACHE_REGION_NAME=us-west-2 \
-          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-          .
-
-        echo "Built vLLM wheel:"
-        ls -lh rocm-dist/*.whl
-
-        # Copy wheel to artifacts directory
-        mkdir -p artifacts/rocm-vllm-wheel
-        cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
-        echo "Final vLLM wheel:"
-        ls -lh artifacts/rocm-vllm-wheel/
-    artifact_paths:
-      - "artifacts/rocm-vllm-wheel/*.whl"
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"

-  # ROCm Job 3: Upload Wheels to S3
-  - label: ":s3: Upload ROCm Wheels to S3"
-    id: upload-rocm-wheels
+  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
-      - step: build-rocm-vllm-wheel
-        allow_failure: false
+      - create-multi-arch-manifest
+    if: build.env("NIGHTLY") == "1"
    agents:
      queue: cpu_queue_postmerge
-    timeout_in_minutes: 60
    commands:
-      # Download all wheel artifacts and run upload
-      - |
-        set -euo pipefail
-
-        # Check if upload is enabled (from env var, meta-data, or release branch)
-        ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
-        if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
-          # Try to get from meta-data (input form)
-          ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
-        fi
-
-        echo "========================================"
-        echo "Upload check:"
-        echo "  ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
-        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
-        echo "========================================"
-
-        # Skip upload if not enabled
-        if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
-          echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
-          echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
-          exit 0
-        fi
-
-        echo "Upload enabled, proceeding..."
-
-        # Download artifacts from current build
-        echo "Downloading artifacts from current build"
-        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
-        buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
-
-        # Run upload script
-        bash .buildkite/scripts/upload-rocm-wheels.sh
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"
-
-  # ROCm Job 4: Annotate ROCm Wheel Release
-  - label: ":memo: Annotate ROCm wheel release"
-    id: annotate-rocm-release
-    depends_on:
-      - step: upload-rocm-wheels
-        allow_failure: true
-      - step: input-release-version
-        allow_failure: true
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-rocm-release.sh"
-    env:
-      S3_BUCKET: "vllm-wheels"
-
-  # ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
-  # This is the job to create https://wheels.vllm.ai/rocm/ index allowing
-  # users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
-  - block: "Generate Root Index for ROCm Wheels for Release"
-    key: block-generate-root-index-rocm-wheels
-    depends_on: upload-rocm-wheels
-
-  - label: ":package: Generate Root Index for ROCm Wheels for Release"
-    depends_on: block-generate-root-index-rocm-wheels
-    id: generate-root-index-rocm-wheels
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
-    env:
-      S3_BUCKET: "vllm-wheels"
-      VARIANT: "rocm700"
-
-  # ROCm Job 5: Build ROCm Release Docker Image
-  - label: ":docker: Build release image - x86_64 - ROCm"
-    id: build-rocm-release-image
-    depends_on:
-      - step: build-rocm-base-wheels
-        allow_failure: false
-    agents:
-      queue: cpu_queue_postmerge
-    timeout_in_minutes: 60
-    commands:
-      - |
-        set -euo pipefail
-
-        # Login to ECR
-        aws ecr-public get-login-password --region us-east-1 | \
-          docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
-
-        # Download Docker image from S3 (set by build-rocm-base-wheels)
-        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
-        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
-          echo "ERROR: rocm-docker-image-s3-path metadata not found"
-          exit 1
-        fi
-
-        echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
-        mkdir -p artifacts/rocm-docker-image
-        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
-
-        # Load base Docker image
-        echo "Loading base Docker image..."
-        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
-        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
-        echo "Loaded base image: $${BASE_IMAGE_TAG}"
-
-        # Tag and push the base image to ECR
-        docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
-        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
-        echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
-
-        # Get GPU architectures from meta-data
-        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
-        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
-
-        # Build vLLM ROCm release image using cached base
-        DOCKER_BUILDKIT=1 docker build \
-          --build-arg max_jobs=16 \
-          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
-          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-          --build-arg USE_SCCACHE=1 \
-          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-          --build-arg SCCACHE_REGION_NAME=us-west-2 \
-          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-          --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
-          --target vllm-openai \
-          --progress plain \
-          -f docker/Dockerfile.rocm .
-
-        # Push to ECR
-        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
-        echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
-    env:
-      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"
+      DOCKERHUB_USERNAME: "vllmbot"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -11,36 +11,27 @@ fi
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel (by commit):
 \`\`\`
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-(Optional) For CUDA 13.0:
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
-
-(Optional) For CPU:
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

+To download the wheel (by version):
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`

 To download and upload the image:

 \`\`\`
-# Download images:
-
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
-docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
-docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
-
-# Tag and push images:
-
-## CUDA

 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -48,70 +39,16 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
 docker push vllm/vllm-openai:latest-x86_64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64

-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
-docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
-docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
-docker push vllm/vllm-openai:latest-x86_64-cu130
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
-
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
-docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
-docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
-docker push vllm/vllm-openai:latest-aarch64-cu130
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
-
-## ROCm
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker push vllm/vllm-openai-rocm:latest-base
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-
-## CPU
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-docker push vllm/vllm-openai-cpu:latest-x86_64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker push vllm/vllm-openai-cpu:latest-arm64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-
-# Create multi-arch manifest:
-
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
-
-docker manifest rm vllm/vllm-openai:latest-cu130
-docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
-docker manifest push vllm/vllm-openai:latest-cu130
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
-
-docker manifest rm vllm/vllm-openai-cpu:latest || true
-docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
-docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker manifest push vllm/vllm-openai-cpu:latest
-docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
 \`\`\`
-EOF
+EOF
--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -1,112 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-#
-# Generate Buildkite annotation for ROCm wheel release
-set -ex
-
-# Get build configuration from meta-data
-# Extract ROCm version dynamically from Dockerfile.rocm_base
-# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
-ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
-PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
-PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
-
-# TODO: Enable the nightly build for ROCm
-# Get release version, default to 1.0.0.dev for nightly/per-commit builds
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
-if [ -z "${RELEASE_VERSION}" ]; then
-  RELEASE_VERSION="1.0.0.dev"
-fi
-
-# S3 URLs
-S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
-S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
-S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
-
-# Format ROCm version for path (e.g., "7.1" -> "rocm710")
-ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
-ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
-buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
-## ROCm Wheel and Docker Image Releases
-### Build Configuration
-| Setting | Value |
-|---------|-------|
-| **ROCm Version** | ${ROCM_VERSION} |
-| **Python Version** | ${PYTHON_VERSION} |
-| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
-| **Branch** | \`${BUILDKITE_BRANCH}\` |
-| **Commit** | \`${BUILDKITE_COMMIT}\` |
-
-### :package: Installation
-
-**Install from this build (by commit):**
-
-\`\`\`bash
-pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
-
-# Example for ROCm ${ROCM_VERSION}:
-pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
-\`\`\`
-
-**Install from nightly (if published):**
-
-\`\`\`bash
-pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
-\`\`\`
-
-### :floppy_disk: Download Wheels Directly
-
-\`\`\`bash
-# List all ROCm wheels
-aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
-# Download specific wheels
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
-\`\`\`
-
-### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton**: Triton
- **triton-kernels**: Triton kernels
- **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
- **amd_aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm
-
-### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
-
-### :package: Docker Image Release
-
-To download and upload the image:
-
-\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker push vllm/vllm-openai-rocm:latest-base
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-\`\`\`
-
-EOF
--- a/.buildkite/scripts/cache-rocm-base-wheels.sh
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -1,140 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-#
-# Cache helper for ROCm base wheels
-#
-# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
-# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
-#
-# Usage:
-#   cache-rocm-base-wheels.sh check    - Check if cache exists, outputs "hit" or "miss"
-#   cache-rocm-base-wheels.sh upload   - Upload wheels to cache
-#   cache-rocm-base-wheels.sh download - Download wheels from cache
-#   cache-rocm-base-wheels.sh key      - Output the cache key
-#
-# Environment variables:
-#   S3_BUCKET          - S3 bucket name (default: vllm-wheels)
-#   PYTHON_VERSION     - Python version (affects cache key)
-#   PYTORCH_ROCM_ARCH  - GPU architectures (affects cache key)
-#
-# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
-#       so changes to ROCm version are captured by the Dockerfile hash.
-
-set -euo pipefail
-
-BUCKET="${S3_BUCKET:-vllm-wheels}"
-DOCKERFILE="docker/Dockerfile.rocm_base"
-CACHE_PREFIX="rocm/cache"
-
-# Generate hash from Dockerfile content + build args
-generate_cache_key() {
-    # Include Dockerfile content
-    if [[ ! -f "$DOCKERFILE" ]]; then
-        echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
-        exit 1
-    fi
-    local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
-
-    # Include key build args that affect the output
-    # These should match the ARGs in Dockerfile.rocm_base that change the build output
-    # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
-    local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
-    local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
-
-    echo "${dockerfile_hash}-${args_hash}"
-}
-
-CACHE_KEY=$(generate_cache_key)
-CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
-
-case "${1:-}" in
-    check)
-        echo "Checking cache for key: ${CACHE_KEY}" >&2
-        echo "Cache path: ${CACHE_PATH}" >&2
-        echo "Variables used in cache key:" >&2
-        echo "  PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
-        echo "  PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
-
-        # Check if cache exists by listing objects
-        # We look for at least one .whl file
-        echo "Running: aws s3 ls ${CACHE_PATH}" >&2
-        S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
-        echo "S3 ls output:" >&2
-        echo "$S3_OUTPUT" | head -5 >&2
-
-        if echo "$S3_OUTPUT" | grep -q "\.whl"; then
-            echo "hit"
-        else
-            echo "miss"
-        fi
-        ;;
-
-    upload)
-        echo "========================================"
-        echo "Uploading wheels to cache"
-        echo "========================================"
-        echo "Cache key: ${CACHE_KEY}"
-        echo "Cache path: ${CACHE_PATH}"
-        echo ""
-
-        if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
-            echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
-            exit 1
-        fi
-
-        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
-        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
-            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
-            exit 1
-        fi
-
-        echo "Uploading $WHEEL_COUNT wheels..."
-        aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
-
-        echo ""
-        echo "Cache upload complete!"
-        echo "========================================"
-        ;;
-
-    download)
-        echo "========================================"
-        echo "Downloading wheels from cache"
-        echo "========================================"
-        echo "Cache key: ${CACHE_KEY}"
-        echo "Cache path: ${CACHE_PATH}"
-        echo ""
-
-        mkdir -p artifacts/rocm-base-wheels
-        aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
-
-        echo ""
-        echo "Downloaded wheels:"
-        find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
-
-        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
-        echo ""
-        echo "Total: $WHEEL_COUNT wheels"
-        echo "========================================"
-        ;;
-
-    key)
-        echo "${CACHE_KEY}"
-        ;;
-
-    path)
-        echo "${CACHE_PATH}"
-        ;;
-
-    *)
-        echo "Usage: $0 {check|upload|download|key|path}" >&2
-        echo "" >&2
-        echo "Commands:" >&2
-        echo "  check    - Check if cache exists, outputs 'hit' or 'miss'" >&2
-        echo "  upload   - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
-        echo "  download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
-        echo "  key      - Output the cache key" >&2
-        echo "  path     - Output the full S3 cache path" >&2
-        exit 1
-        ;;
-esac
--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
@@ -1,205 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-#
-# Check if Ray LLM can generate lock files that are compatible with this
-# version of vllm. Downloads Ray's requirement files and runs a full
-# dependency resolution with the installed vllm's constraints to see if
-# a valid lock file can be produced.
-#
-# See: https://github.com/vllm-project/vllm/issues/33599
-
-set -eo pipefail
-
-RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
-
-WORK_DIR=$(mktemp -d)
-trap 'rm -rf "$WORK_DIR"' EXIT
-
-# Fetch all Ray requirement files used in the LLM depset pipeline
-echo ">>> Fetching Ray requirement files"
-RAY_FILES=(
-    "requirements.txt"
-    "requirements/cloud-requirements.txt"
-    "requirements/base-test-requirements.txt"
-    "requirements/llm/llm-requirements.txt"
-    "requirements/llm/llm-test-requirements.txt"
-)
-for FILE in "${RAY_FILES[@]}"; do
-    LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
-    echo "    ${FILE}"
-    curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
-done
-
-# Extract installed vllm deps
-echo ">>> Extracting installed vllm dependency constraints"
-python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
-"""Write out the installed vllm's dependencies as pip constraint lines.
-
-Ray uses vllm[audio], so audio-extra deps are included with their extra
-markers stripped. The resolver cannot evaluate extra markers for a
-package that is not itself being resolved from an index, so we activate
-them manually here.
-"""
-import importlib.metadata
-import re
-import sys
-
-out_path = sys.argv[1]
-raw_reqs = importlib.metadata.requires("vllm") or []
-
-# Ray uses vllm[audio] – activate that extra.
-ACTIVE_EXTRAS = {"audio"}
-EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
-
-lines = []
-for r in raw_reqs:
-    if ";" not in r:
-        # Unconditional dep — always include.
-        lines.append(r.strip())
-        continue
-
-    req_part, _, marker_part = r.partition(";")
-    marker_part = marker_part.strip()
-
-    extra_matches = EXTRA_RE.findall(marker_part)
-    if not extra_matches:
-        # Non-extra marker (python_version, etc.) — keep as-is.
-        lines.append(r.strip())
-        continue
-
-    if not ACTIVE_EXTRAS.intersection(extra_matches):
-        continue  # Skip inactive extras (tensorizer, bench, …).
-
-    # Strip the extra== conditions but keep any remaining markers
-    # (e.g. python_version).
-    cleaned = EXTRA_RE.sub("", marker_part)
-    cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
-    cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
-
-    if cleaned:
-        lines.append(f"{req_part.strip()} ; {cleaned}")
-    else:
-        lines.append(req_part.strip())
-
-with open(out_path, "w") as f:
-    for line in lines:
-        f.write(line + "\n")
-
-print(f"Wrote {len(lines)} constraints to {out_path}")
-PYEOF
-
-echo ">>> Installed vllm deps (first 20 lines):"
-head -20 "${WORK_DIR}/vllm-constraints.txt"
-
-# Remove Ray's vllm pin — the installed vllm's transitive deps
-# (written above) replace it in the resolution. vllm itself cannot
-# be resolved from PyPI for in-development versions, so we test
-# whether Ray's requirements can coexist with vllm's dependency
-# constraints instead.
-sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
-
-# Install uv if needed
-if ! command -v uv &>/dev/null; then
-    echo ">>> Installing uv"
-    pip install uv -q
-fi
-
-# Resolve: given vllm's constraints, can Ray compile a lock file?
-#
-# vllm's dependency constraints are the fixed side — Ray is flexible and
-# can regenerate its lock files. We pass vllm's constraints via -c so
-# the resolver treats them as non-negotiable bounds, then check whether
-# Ray's own requirements can still be satisfied within those bounds.
-echo ""
-echo "============================================================"
-echo ">>> Resolving: Can Ray generate compatible lock files?"
-echo "============================================================"
-
-set +e
-uv pip compile \
-    "${WORK_DIR}/requirements.txt" \
-    "${WORK_DIR}/cloud-requirements.txt" \
-    "${WORK_DIR}/base-test-requirements.txt" \
-    "${WORK_DIR}/llm-requirements.txt" \
-    "${WORK_DIR}/llm-test-requirements.txt" \
-    -c "${WORK_DIR}/vllm-constraints.txt" \
-    --python-version 3.12 \
-    --python-platform x86_64-manylinux_2_31 \
-    --extra-index-url https://download.pytorch.org/whl/cu129 \
-    --index-strategy unsafe-best-match \
-    --unsafe-package setuptools \
-    --unsafe-package ray \
-    --no-header \
-    -o "${WORK_DIR}/resolved.txt" \
-    2>&1
-EXIT_CODE=$?
-set -e
-
-echo ""
-echo "=========================================="
-if [ $EXIT_CODE -eq 0 ]; then
-    echo "SUCCESS: Ray can generate lock files compatible with this vllm."
-    echo ""
-    echo "Key resolved versions:"
-    grep -E '^(protobuf|torch|numpy|transformers)==' \
-        "${WORK_DIR}/resolved.txt" | sort || true
-    echo "=========================================="
-    exit 0
-fi
-
-echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
-echo "This means a fundamental dependency conflict exists that Ray"
-echo "cannot resolve by regenerating its lock files."
-echo "See: https://github.com/vllm-project/vllm/issues/33599"
-echo "=========================================="
-
-# Buildkite annotation
-if [ -f /usr/bin/buildkite-agent ]; then
-    buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
-### :warning: Ray Dependency Compatibility Warning
-This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
-Ray would not be able to regenerate its lock files to accommodate this vllm version.
-
-Please check the **Ray Dependency Compatibility Check** step logs for details.
-See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
-EOF
-fi
-
-# Notify Slack if webhook is configured.
-if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
-    echo ">>> Sending Slack notification"
-    # Single quotes are intentional: the f-string expressions are Python, not shell.
-    # shellcheck disable=SC2016
-    PAYLOAD=$(python3 -c '
-import json, os, sys
-pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
-branch = os.getenv("BUILDKITE_BRANCH", "unknown")
-url = os.getenv("BUILDKITE_BUILD_URL", "#")
-data = {
-    "text": ":warning: Ray Dependency Compatibility Check Failed",
-    "blocks": [{
-        "type": "section",
-        "text": {
-            "type": "mrkdwn",
-            "text": (
-                "*:warning: Ray Dependency Compatibility Check Failed*\n"
-                f"PR #{pr} on branch `{branch}` introduces dependencies "
-                f"that cannot be resolved with Ray'\''s requirements.\n"
-                f"<{url}|View Build>"
-            ),
-        },
-    }],
-}
-print(json.dumps(data))
-')
-
-    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-        -H 'Content-type: application/json' \
-        -d "$PAYLOAD")
-    echo "    Slack webhook response: $HTTP_CODE"
-else
-    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
-fi
-
-exit 1
--- a/.buildkite/scripts/cherry-pick-from-milestone.sh
+++ b/.buildkite/scripts/cherry-pick-from-milestone.sh
@@ -1,242 +0,0 @@
-#!/bin/bash
-#
-# cherry-pick-from-milestone.sh
-# Find commits from a GitHub milestone that are missing from the current branch
-# and output them in chronological order for cherry-picking.
-#
-# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
-#
-
-set -euo pipefail
-
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-usage() {
-    cat <<EOF
-Usage: $(basename "$0") <milestone> [options]
-
-Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
-
-Arguments:
-    milestone       The GitHub milestone name (e.g., v0.14.0)
-
-Options:
-    --dry-run       Show the cherry-pick commands without executing (default)
-    --execute       Actually execute the cherry-picks
-    --main-branch   Specify the main branch name (default: main)
-    --help          Show this help message
-
-Examples:
-    $(basename "$0") v0.14.0
-    $(basename "$0") v0.14.0 --dry-run
-    $(basename "$0") v0.14.0 --execute
-    $(basename "$0") v0.14.0 --main-branch master
-EOF
-    exit 1
-}
-
-log_info() {
-    echo -e "${BLUE}[INFO]${NC} $1"
-}
-
-log_success() {
-    echo -e "${GREEN}[OK]${NC} $1"
-}
-
-log_warn() {
-    echo -e "${YELLOW}[WARN]${NC} $1"
-}
-
-log_error() {
-    echo -e "${RED}[ERROR]${NC} $1" >&2
-}
-
-# Default values
-MILESTONE=""
-DRY_RUN=true
-MAIN_BRANCH="main"
-
-# Parse arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --dry-run)
-            DRY_RUN=true
-            shift
-            ;;
-        --execute)
-            DRY_RUN=false
-            shift
-            ;;
-        --main-branch)
-            MAIN_BRANCH="$2"
-            shift 2
-            ;;
-        --help|-h)
-            usage
-            ;;
-        -*)
-            log_error "Unknown option: $1"
-            usage
-            ;;
-        *)
-            if [[ -z "$MILESTONE" ]]; then
-                MILESTONE="$1"
-            else
-                log_error "Unexpected argument: $1"
-                usage
-            fi
-            shift
-            ;;
-    esac
-done
-
-# Validate milestone argument
-if [[ -z "$MILESTONE" ]]; then
-    log_error "Milestone is required"
-    usage
-fi
-
-# Check if we're in a git repository
-if ! git rev-parse --is-inside-work-tree &>/dev/null; then
-    log_error "Not in a git repository"
-    exit 1
-fi
-
-# Check if gh CLI is available
-if ! command -v gh &>/dev/null; then
-    log_error "GitHub CLI (gh) is not installed"
-    exit 1
-fi
-
-# Check if authenticated with gh
-if ! gh auth status &>/dev/null; then
-    log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
-    exit 1
-fi
-
-CURRENT_BRANCH=$(git branch --show-current)
-log_info "Current branch: ${CURRENT_BRANCH}"
-log_info "Main branch: ${MAIN_BRANCH}"
-log_info "Milestone: ${MILESTONE}"
-echo ""
-
-# Fetch latest from remote
-log_info "Fetching latest from remote..."
-git fetch origin "$MAIN_BRANCH" --quiet
-
-# Get merged PRs from the milestone, sorted by merge date
-log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
-
-# Store PR data in a temp file
-PR_DATA=$(mktemp)
-trap 'rm -f "$PR_DATA"' EXIT
-
-if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
-    --limit 1000 \
-    --json number,title,mergeCommit,mergedAt \
-    --jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
-    log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
-    log_error "This could be due to:"
-    log_error "  - Milestone does not exist"
-    log_error "  - Network/authentication issues"
-    log_error "  - Invalid milestone name format"
-    exit 1
-fi
-
-if [[ ! -s "$PR_DATA" ]]; then
-    log_warn "No merged PRs found for milestone '${MILESTONE}'"
-    exit 0
-fi
-
-TOTAL_PRS=$(wc -l < "$PR_DATA")
-log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
-echo ""
-
-# Find commits that are missing from current branch
-MISSING_COMMITS=()
-MISSING_INFO=()
-
-while IFS=$'\t' read -r sha pr_number title; do
-    # Skip if SHA is empty or null
-    if [[ -z "$sha" || "$sha" == "null" ]]; then
-        log_warn "PR #${pr_number} has no merge commit SHA, skipping"
-        continue
-    fi
-    
-    # Check if this commit is already in the current branch
-    if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
-        log_success "PR #${pr_number} already in branch: ${title:0:60}"
-    else
-        log_warn "PR #${pr_number} MISSING: ${title:0:60}"
-        MISSING_COMMITS+=("$sha")
-        MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
-    fi
-done < "$PR_DATA"
-
-echo ""
-
-if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
-    log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
-    exit 0
-fi
-
-log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
-echo ""
-
-# Output the cherry-pick commands
-echo "=========================================="
-echo "Cherry-pick commands (in chronological order):"
-echo "=========================================="
-echo ""
-
-for info in "${MISSING_INFO[@]}"; do
-    echo "# $info"
-done
-echo ""
-
-echo "# Run these commands to cherry-pick all missing commits:"
-echo "git cherry-pick ${MISSING_COMMITS[*]}"
-echo ""
-
-# Or one by one
-echo "# Or cherry-pick one at a time:"
-for sha in "${MISSING_COMMITS[@]}"; do
-    echo "git cherry-pick $sha"
-done
-echo ""
-
-# Execute if requested
-if [[ "$DRY_RUN" == false ]]; then
-    echo "=========================================="
-    log_info "Executing cherry-picks..."
-    echo "=========================================="
-    
-    for i in "${!MISSING_COMMITS[@]}"; do
-        sha="${MISSING_COMMITS[$i]}"
-        info="${MISSING_INFO[$i]}"
-        
-        echo ""
-        log_info "Cherry-picking: $info"
-        
-        if git cherry-pick "$sha"; then
-            log_success "Successfully cherry-picked $sha"
-        else
-            log_error "Failed to cherry-pick $sha"
-            log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
-            exit 1
-        fi
-    done
-    
-    echo ""
-    log_success "All cherry-picks completed successfully!"
-else
-    echo "=========================================="
-    echo -e "${YELLOW}Dry run mode - no changes made${NC}"
-    echo "Run with --execute to perform the cherry-picks"
-    echo "=========================================="
-fi
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -3,14 +3,7 @@
 set -ex

 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with specified prefix
-# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
-# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
-
-# Get tag prefix from argument, default to "nightly-" if not provided
-TAG_PREFIX="${1:-nightly-}"
-
-echo "Cleaning up tags with prefix: $TAG_PREFIX"
+# This script uses the DockerHub API to list and delete old tags with the "nightly-" prefix

 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -52,7 +45,7 @@ get_all_tags() {
        set -x
        
        # Get both last_updated timestamp and tag name, separated by |
-        local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
+        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
        
        if [ -z "$tags" ]; then
            break
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -16,18 +16,6 @@ from urllib.parse import quote

 import regex as re

-
-def normalize_package_name(name: str) -> str:
-    """
-    Normalize package name according to PEP 503.
-    https://peps.python.org/pep-0503/#normalized-names
-
-    Replace runs of underscores, hyphens, and periods with a single hyphen,
-    and lowercase the result.
-    """
-    return re.sub(r"[-_.]+", "-", name).lower()
-
-
 if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

@@ -90,13 +78,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
            version = version.removesuffix("." + variant)
    else:
        if "+" in version:
-            version_part, suffix = version.split("+", 1)
-            # Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
-            # Git hashes and other suffixes are NOT variants
-            if suffix.startswith(("rocm", "cu", "cpu")):
-                variant = suffix
-                version = version_part
-            # Otherwise keep the full version string (variant stays None)
+            version, variant = version.split("+")

    return WheelFileInfo(
        package_name=package_name,
@@ -112,7 +94,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:

 def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
-    Generate project list HTML content linking to each project & variant subdirectory.
+    Generate project list HTML content linking to each project & variant sub-directory.
    """
    href_tags = []
    for name in sorted(subdir_names):
@@ -168,23 +150,23 @@ def generate_index_and_metadata(
        comment (str | None): Optional comment to include in the generated HTML files.

    First, parse all wheel files to extract metadata.
-    We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
+    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

-    If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
+    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
-            index.html  # project list, linking to "vllm/" and other packages, and all variant subdirectories
+            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json # machine-readable metadata for all wheels in this package
-            cpu/ # cpu variant subdirectory
+            cpu/ # cpu variant sub-directory
                index.html
                vllm/
                    index.html
@@ -194,7 +176,7 @@ def generate_index_and_metadata(
                vllm/
                    index.html
                    metadata.json
-            cu130/ # cu130 variant subdirectory
+            cu130/ # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
@@ -224,26 +206,6 @@ def generate_index_and_metadata(
        print("No wheel files found, skipping index generation.")
        return

-    # For ROCm builds: inherit variant from vllm wheel
-    # All ROCm wheels should share the same variant as vllm
-    rocm_variant = None
-    for file in parsed_files:
-        if (
-            file.package_name == "vllm"
-            and file.variant
-            and file.variant.startswith("rocm")
-        ):
-            rocm_variant = file.variant
-            print(f"Detected ROCm variant from vllm: {rocm_variant}")
-            break
-
-    # Apply ROCm variant to all wheels without a variant
-    if rocm_variant:
-        for file in parsed_files:
-            if file.variant is None:
-                file.variant = rocm_variant
-                print(f"Inherited variant '{rocm_variant}' for {file.filename}")
-
    # Group by variant
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
@@ -294,8 +256,8 @@ def generate_index_and_metadata(

        variant_dir.mkdir(parents=True, exist_ok=True)

-        # gather all package names in this variant (normalized per PEP 503)
-        packages = set(normalize_package_name(f.package_name) for f in files)
+        # gather all package names in this variant
+        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
@@ -307,10 +269,8 @@ def generate_index_and_metadata(
                f.write(project_list_str)

        for package in packages:
-            # filter files belonging to this package only (compare normalized names)
-            package_files = [
-                f for f in files if normalize_package_name(f.package_name) == package
-            ]
+            # filter files belonging to this package only
+            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
@@ -331,7 +291,6 @@ if __name__ == "__main__":
    """
    Arguments:
        --version <version> : version string for the current build (e.g., commit hash)
-        --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
        --output-dir <output_directory> : directory to store generated index files
        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
@@ -359,12 +318,6 @@ if __name__ == "__main__":
        required=True,
        help="Directory to store generated index files",
    )
-    parser.add_argument(
-        "--wheel-dir",
-        type=str,
-        default=None,
-        help="Directory containing wheel files (default to be same as `version`)",
-    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
@@ -381,13 +334,8 @@ if __name__ == "__main__":
    args = parser.parse_args()

    version = args.version
-    # Allow rocm/ prefix, reject other slashes and all backslashes
-    if "\\" in version:
-        raise ValueError("Version string must not contain backslashes.")
-    if "/" in version and not version.startswith("rocm/"):
-        raise ValueError(
-            "Version string must not contain slashes (except for 'rocm/' prefix)."
-        )
+    if "/" in version or "\\" in version:
+        raise ValueError("Version string must not contain slashes.")
    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
@@ -424,7 +372,7 @@ if __name__ == "__main__":

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

-    # keep only "official" files for a non-nightly version (specified by cli args)
+    # keep only "official" files for a non-nightly version (specifed by cli args)
    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
    if PY_VERSION_RE.match(version):
        # upload-wheels.sh ensures no "dev" is in args.version
@@ -436,25 +384,9 @@ if __name__ == "__main__":
        print("Nightly version detected, keeping all wheel files.")

    # Generate index and metadata, assuming wheels and indices are stored as:
-    # s3://vllm-wheels/{wheel_dir}/<wheel files>
+    # s3://vllm-wheels/{version}/<wheel files>
    # s3://vllm-wheels/<anything>/<index files>
-    #
-    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
-    #   - rocm/{commit}/  (same as wheels)
-    #   - rocm/nightly/
-    #   - rocm/{version}/
-    # All these are under the "rocm/" prefix, so relative paths should be
-    # relative to "rocm/", not the bucket root.
-    if args.wheel_dir:
-        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
-        wheel_dir = args.wheel_dir.strip().rstrip("/")
-    elif version.startswith("rocm/"):
-        # For rocm/commit, wheel_base_dir should be just the commit part
-        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
-        wheel_dir = version.split("/", 1)[1]
-    else:
-        wheel_dir = version
-    wheel_base_dir = Path(output_dir).parent / wheel_dir
+    wheel_base_dir = Path(output_dir).parent / version
    index_base_dir = Path(output_dir)

    generate_index_and_metadata(
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -1,57 +1,25 @@
 #!/bin/bash

-# This script runs tests inside the corresponding ROCm docker container.
-# It handles both single-node and multi-node test configurations.
-#
-# Multi-node detection: Instead of matching on fragile group names, we detect
-# multi-node jobs structurally by looking for the bracket command syntax
-# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
-#
-###############################################################################
-# QUOTING / COMMAND PASSING
-#
-# Passing commands as positional arguments ($*) is fragile when the command
-# string itself contains double quotes, e.g.:
-#
-#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
-#
-# The outer shell resolves the nested quotes *before* this script runs, so
-# the script receives mangled input it cannot fully recover.
-#
-# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
-#
-#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
-#   bash run-amd-test.sh
-#
-# Single-quoted assignment preserves all inner double quotes verbatim.
-# The $* path is kept for backward compatibility but callers should migrate.
-###############################################################################
+# This script runs tests inside the corresponding ROCm docker container.
 set -o pipefail

 # Export Python path
 export PYTHONPATH=".."

-###############################################################################
-# Helper Functions
-###############################################################################
+# Wait until the GPU state file reports "clean" before continuing
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done

-wait_for_clean_gpus() {
-  local timeout=${1:-300}
-  local start=$SECONDS
-  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
-  while true; do
-    if grep -q clean /opt/amdgpu/etc/gpu_state; then
-      echo "GPUs state is \"clean\""
-      return
-    fi
-    if (( SECONDS - start >= timeout )); then
-      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
-      exit 1
-    fi
-    sleep 3
-  done
-}
+echo "--- ROCm info"
+rocminfo

+# cleanup older docker images
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -60,12 +28,15 @@ cleanup_docker() {
    exit 1
  fi
  echo "Docker root directory: $docker_root"
-
+  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
@@ -73,432 +44,193 @@ cleanup_docker() {
  fi
 }

-cleanup_network() {
-  local max_nodes=${NUM_NODES:-2}
-  for node in $(seq 0 $((max_nodes - 1))); do
-    if docker ps -a -q -f name="node${node}" | grep -q .; then
-      docker stop "node${node}" || true
-    fi
-  done
-  if docker network ls | grep -q docker-net; then
-    docker network rm docker-net || true
-  fi
-}
-
-is_multi_node() {
-  local cmds="$1"
-  # Primary signal: NUM_NODES environment variable set by the pipeline
-  if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
-    return 0
-  fi
-  # Fallback: detect the bracket syntax structurally
-  # Pattern: [...] && [...] (per-node command arrays)
-  if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
-    return 0
-  fi
-  return 1
-}
-
-handle_pytest_exit() {
-  local exit_code=$1
-  if [ "$exit_code" -eq 5 ]; then
-    echo "Pytest exit code 5 (no tests collected) - treating as success."
-    exit 0
-  fi
-  exit "$exit_code"
-}
-
-###############################################################################
-# Pytest marker/keyword re-quoting
-#
-# When commands are passed through Buildkite -> shell -> $* -> bash -c,
-# quotes around multi-word pytest -m/-k expressions get stripped:
-#   pytest -v -s -m 'not cpu_test' v1/core
-# becomes:
-#   pytest -v -s -m not cpu_test v1/core
-#
-# pytest then interprets "cpu_test" as a file path, not part of the marker.
-#
-# This function detects unquoted expressions after -m/-k and re-quotes them
-# by collecting tokens until a recognizable boundary is reached:
-#   - test path (contains '/')
-#   - test file (ends with '.py')
-#   - another pytest flag (--xxx or -x single-char flags)
-#   - command separator (&& || ; |)
-#   - environment variable assignment (FOO=bar)
-#
-# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
-# unquoted since they have no spaces and work fine.
-#
-# Already-quoted expressions (containing literal single quotes) are passed
-# through untouched to avoid double-quoting values injected by
-# apply_rocm_test_overrides.
-#
-# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
-# double-quotes stripped by the calling shell (see header comment).
-# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
-###############################################################################
-re_quote_pytest_markers() {
-  local input="$1"
-  local output=""
-  local collecting=false
-  local marker_buf=""
-
-  # Strip backslash-newline continuations, then flatten remaining newlines
-  local flat="${input//$'\\\n'/ }"
-  flat="${flat//$'\n'/ }"
-
-  # Disable globbing to prevent *.py etc. from expanding during read -ra
-  local restore_glob
-  restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
-  set -o noglob
-  local -a words
-  read -ra words <<< "$flat"
-  eval "$restore_glob"
-
-  for word in "${words[@]}"; do
-    if $collecting; then
-      # If the token we're about to collect already contains a literal
-      # single quote, the expression was already quoted upstream.
-      # Flush and stop collecting.
-      if [[ "$word" == *"'"* ]]; then
-        if [[ -n "$marker_buf" ]]; then
-          # Should not normally happen (partial buf + quote), flush raw
-          output+="${marker_buf} "
-          marker_buf=""
-        fi
-        output+="${word} "
-        collecting=false
-        continue
-      fi
-
-      local is_boundary=false
-      case "$word" in
-        # Line-continuation artifact
-        "\\")
-          is_boundary=true ;;
-        # Command separators
-        "&&"|"||"|";"|"|")
-          is_boundary=true ;;
-        # Long flags (--ignore, --shard-id, etc.)
-        --*)
-          is_boundary=true ;;
-        # Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
-        # like "not" which don't start with "-". Also skip -k/-m which
-        # would start a new marker (handled below).
-        -[a-zA-Z])
-          is_boundary=true ;;
-        # Test path (contains /)
-        */*)
-          is_boundary=true ;;
-        # Test file (ends with .py, possibly with ::method)
-        *.py|*.py::*)
-          is_boundary=true ;;
-        # Environment variable assignment preceding a command (FOO=bar)
-        *=*)
-          # Only treat as boundary if it looks like VAR=value, not
-          # pytest filter expressions like num_gpus=2 inside markers
-          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
-            is_boundary=true
-          fi
-          ;;
-      esac
-
-      if $is_boundary; then
-        # Flush the collected marker expression
-        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
-          output+="'${marker_buf}' "
-        else
-          output+="${marker_buf} "
-        fi
-        collecting=false
-        marker_buf=""
-        # Check if this boundary word itself starts a new -m/-k
-        if [[ "$word" == "-m" || "$word" == "-k" ]]; then
-          output+="${word} "
-          collecting=true
-        # Drop stray backslash tokens silently
-        elif [[ "$word" == "\\" ]]; then
-          :
-        else
-          output+="${word} "
-        fi
-      else
-        # Accumulate into marker buffer
-        if [[ -n "$marker_buf" ]]; then
-          marker_buf+=" ${word}"
-        else
-          marker_buf="${word}"
-        fi
-      fi
-    elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
-      output+="${word} "
-      collecting=true
-      marker_buf=""
-    else
-      output+="${word} "
-    fi
-  done
-
-  # Flush any trailing marker expression (marker at end of command)
-  if $collecting && [[ -n "$marker_buf" ]]; then
-    if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
-      output+="'${marker_buf}'"
-    else
-      output+="${marker_buf}"
-    fi
-  fi
-
-  echo "${output% }"
-}
-
-###############################################################################
-# ROCm-specific pytest command rewrites
-#
-# These apply ignore flags and environment overrides for tests that are not
-# yet supported or behave differently on ROCm hardware. Kept as a single
-# function so new exclusions are easy to add in one place.
-###############################################################################
-
-apply_rocm_test_overrides() {
-  local cmds="$1"
-
-  # --- Model registry filter ---
-  if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
-    cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
-  fi
-
-  # --- LoRA: disable custom paged attention ---
-  if [[ $cmds == *"pytest -v -s lora"* ]]; then
-    cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-  fi
-
-  # --- Kernel ignores ---
-  if [[ $cmds == *" kernels/core"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/core/test_fused_quant_layernorm.py \
-    --ignore=kernels/core/test_permute_cols.py"
-  fi
-
-  if [[ $cmds == *" kernels/attention"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/attention/test_attention_selector.py \
-    --ignore=kernels/attention/test_encoder_decoder_attn.py \
-    --ignore=kernels/attention/test_flash_attn.py \
-    --ignore=kernels/attention/test_flashinfer.py \
-    --ignore=kernels/attention/test_prefix_prefill.py \
-    --ignore=kernels/attention/test_cascade_flash_attn.py \
-    --ignore=kernels/attention/test_mha_attn.py \
-    --ignore=kernels/attention/test_lightning_attn.py \
-    --ignore=kernels/attention/test_attention.py"
-  fi
-
-  if [[ $cmds == *" kernels/quantization"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/quantization/test_int8_quant.py \
-    --ignore=kernels/quantization/test_machete_mm.py \
-    --ignore=kernels/quantization/test_block_fp8.py \
-    --ignore=kernels/quantization/test_block_int8.py \
-    --ignore=kernels/quantization/test_marlin_gemm.py \
-    --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-    --ignore=kernels/quantization/test_int8_kernel.py"
-  fi
-
-  if [[ $cmds == *" kernels/mamba"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/mamba/test_mamba_mixer2.py \
-    --ignore=kernels/mamba/test_causal_conv1d.py \
-    --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
-  fi
-
-  if [[ $cmds == *" kernels/moe"* ]]; then
-    cmds="${cmds} \
-    --ignore=kernels/moe/test_moe.py \
-    --ignore=kernels/moe/test_cutlass_moe.py \
-    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
-  fi
-
-  # --- Entrypoint ignores ---
-  if [[ $cmds == *" entrypoints/openai "* ]]; then
-    cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
-    --ignore=entrypoints/openai/test_audio.py \
-    --ignore=entrypoints/openai/test_shutdown.py \
-    --ignore=entrypoints/openai/test_completion.py \
-    --ignore=entrypoints/openai/test_models.py \
-    --ignore=entrypoints/openai/test_lora_adapters.py \
-    --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-    --ignore=entrypoints/openai/test_root_path.py \
-    --ignore=entrypoints/openai/test_tokenization.py \
-    --ignore=entrypoints/openai/test_prompt_validation.py "}
-  fi
-
-  if [[ $cmds == *" entrypoints/llm "* ]]; then
-    cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
-    --ignore=entrypoints/llm/test_chat.py \
-    --ignore=entrypoints/llm/test_accuracy.py \
-    --ignore=entrypoints/llm/test_init.py \
-    --ignore=entrypoints/llm/test_prompt_validation.py "}
-  fi
-
-  # Clean up escaped newlines from --ignore appends
-  cmds=$(echo "$cmds" | sed 's/ \\ / /g')
-
-  echo "$cmds"
-}
-
-###############################################################################
-# Main
-###############################################################################
-
-# --- GPU initialization ---
-echo "--- Confirming Clean Initial State"
-wait_for_clean_gpus
-
-echo "--- ROCm info"
-rocminfo
-
-# --- Docker housekeeping ---
+# Call the cleanup docker function
 cleanup_docker

 echo "--- Resetting GPUs"
-echo "reset" > /opt/amdgpu/etc/gpu_state
-wait_for_clean_gpus

-# --- Pull test image ---
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
 echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"

 remove_docker_container() {
-  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT

-# --- Prepare commands ---
 echo "--- Running container"

 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

-# ---- Command source selection ----
-# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
-# Fall back to $* for backward compatibility, but warn that inner
-# double-quotes will have been stripped by the calling shell.
-if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
-  commands="${VLLM_TEST_COMMANDS}"
-  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
-else
-  commands="$*"
-  if [[ -z "$commands" ]]; then
-    echo "Error: No test commands provided." >&2
-    echo "Usage:" >&2
-    echo "  Preferred:  VLLM_TEST_COMMANDS='...' bash $0" >&2
-    echo "  Legacy:     bash $0 \"commands here\"" >&2
-    exit 1
-  fi
-  echo "Commands sourced from positional args (legacy mode)"
-  echo "WARNING: Inner double-quotes in the command string may have been"
-  echo "  stripped by the calling shell. If you see syntax errors, switch to:"
-  echo "  export VLLM_TEST_COMMANDS='your commands here'"
-  echo "  bash $0"
+commands=$@
+echo "Commands:$commands"
+
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
+
+if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-echo "Raw commands: $commands"
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}

-# Fix quoting before ROCm overrides (so overrides see correct structure)
-commands=$(re_quote_pytest_markers "$commands")
-echo "After re-quoting: $commands"
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi

-commands=$(apply_rocm_test_overrides "$commands")
-echo "Final commands: $commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels/core"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/core/test_fused_quant_layernorm.py \
+  --ignore=kernels/core/test_permute_cols.py"
+fi

+if [[ $commands == *" kernels/attention"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/attention/test_attention_selector.py \
+  --ignore=kernels/attention/test_encoder_decoder_attn.py \
+  --ignore=kernels/attention/test_flash_attn.py \
+  --ignore=kernels/attention/test_flashinfer.py \
+  --ignore=kernels/attention/test_prefix_prefill.py \
+  --ignore=kernels/attention/test_cascade_flash_attn.py \
+  --ignore=kernels/attention/test_mha_attn.py \
+  --ignore=kernels/attention/test_lightning_attn.py \
+  --ignore=kernels/attention/test_attention.py"
+fi
+
+if [[ $commands == *" kernels/quantization"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/quantization/test_int8_quant.py \
+  --ignore=kernels/quantization/test_machete_mm.py \
+  --ignore=kernels/quantization/test_block_fp8.py \
+  --ignore=kernels/quantization/test_block_int8.py \
+  --ignore=kernels/quantization/test_marlin_gemm.py \
+  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
+  --ignore=kernels/quantization/test_int8_kernel.py"
+fi
+
+if [[ $commands == *" kernels/mamba"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/mamba/test_mamba_mixer2.py \
+  --ignore=kernels/mamba/test_causal_conv1d.py \
+  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
+fi
+
+if [[ $commands == *" kernels/moe"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/moe/test_moe.py \
+  --ignore=kernels/moe/test_cutlass_moe.py \
+  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+fi
+
+#ignore certain Entrypoints/openai tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_sleep.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_lora_adapters.py \
+  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
+  --ignore=entrypoints/openai/test_root_path.py \
+  --ignore=entrypoints/openai/test_tokenization.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
+fi
+
+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" entrypoints/llm "* ]]; then
+  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
+  --ignore=entrypoints/llm/test_chat.py \
+  --ignore=entrypoints/llm/test_accuracy.py \
+  --ignore=entrypoints/llm/test_init.py \
+  --ignore=entrypoints/llm/test_prompt_validation.py "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
+PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

-# Verify GPU access
+# Test that we're launching on the machine that has
+# proper access to GPUs
 render_gid=$(getent group render | cut -d: -f3)
 if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
 fi

-# --- RDMA device passthrough (conditional) ---
-# If the host has RDMA devices, pass them through so tests like
-# test_moriio_connector can access ibverbs. On hosts without RDMA
-# hardware the tests will gracefully skip via _rdma_available().
-RDMA_FLAGS=""
-if [ -d /dev/infiniband ]; then
-  echo "RDMA devices detected on host, enabling passthrough"
-  RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
-else
-  echo "No RDMA devices found on host, RDMA tests will be skipped"
-fi
-
-# --- Route: multi-node vs single-node ---
-if is_multi_node "$commands"; then
-  echo "--- Multi-node job detected"
-  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
-
-  # Parse the bracket syntax:  prefix ; [node0_cmds] && [node1_cmds]
-  #   BASH_REMATCH[1] = prefix (everything before first bracket)
-  #   BASH_REMATCH[2] = comma-separated node0 commands
-  #   BASH_REMATCH[3] = comma-separated node1 commands
-  if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
-    prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
-    echo "PREFIX: ${prefix}"
-
-    export composite_command="(command rocm-smi || true)"
-    saved_IFS=$IFS
-    IFS=','
-    read -ra node0 <<< "${BASH_REMATCH[2]}"
-    read -ra node1 <<< "${BASH_REMATCH[3]}"
-    IFS=$saved_IFS
-
-    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
-      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
+# If the command contains a shard flag, run all shards in parallel because the host has 8 GPUs.
+if [[ $commands == *"--shard-id="* ]]; then
+  # assign job count as the number of shards used
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # assign shard-id for each shard
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
+    echo "Shard ${GPU} commands:$commands_gpu"
+    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+    docker run \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        --network=host \
+        --shm-size=16gb \
+        --group-add "$render_gid" \
+        --rm \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
+        -e HF_TOKEN \
+        -e AWS_ACCESS_KEY_ID \
+        -e AWS_SECRET_ACCESS_KEY \
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        -e "PYTHONPATH=${MYPYTHONPATH}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)
+  done
+  #wait for all processes to finish and collect exit codes
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
+    STATUS+=($?)
+  done
+  for st in "${STATUS[@]}"; do
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit "${st}"
    fi
-
-    for i in "${!node0[@]}"; do
-      command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
-      command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
-
-      step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
-      echo "COMMANDS: ${step_cmd}"
-      composite_command="${composite_command} && ${step_cmd}"
-    done
-
-    /bin/bash -c "${composite_command}"
-    exit_code=$?
-    cleanup_network
-    handle_pytest_exit "$exit_code"
-  else
-    echo "Multi-node job detected but failed to parse bracket command syntax."
-    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
-    echo "Got: $commands"
-    cleanup_network
-    exit 111
-  fi
+  done
 else
-  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-    $RDMA_FLAGS \
-    --network=host \
-    --shm-size=16gb \
-    --group-add "$render_gid" \
-    --rm \
-    -e HF_TOKEN \
-    -e AWS_ACCESS_KEY_ID \
-    -e AWS_SECRET_ACCESS_KEY \
-    -v "${HF_CACHE}:${HF_MOUNT}" \
-    -e "HF_HOME=${HF_MOUNT}" \
-    -e "PYTHONPATH=${MYPYTHONPATH}" \
-    --name "${container_name}" \
-    "${image_name}" \
-    /bin/bash -c "${commands}"
-
-  exit_code=$?
-  handle_pytest_exit "$exit_code"
+          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+          --network=host \
+          --shm-size=16gb \
+          --group-add "$render_gid" \
+          --rm \
+          -e HF_TOKEN \
+          -e AWS_ACCESS_KEY_ID \
+          -e AWS_SECRET_ACCESS_KEY \
+          -v "${HF_CACHE}:${HF_MOUNT}" \
+          -e "HF_HOME=${HF_MOUNT}" \
+          -e "PYTHONPATH=${MYPYTHONPATH}" \
+          --name "${container_name}" \
+          "${image_name}" \
+          /bin/bash -c "${commands}"
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -1,43 +0,0 @@
-#!/bin/bash
-set -euox pipefail
-export VLLM_CPU_CI_ENV=0
-
-echo "--- PP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
-vllm bench serve \
-    --backend vllm \
-    --dataset-name random \
-    --model meta-llama/Llama-3.2-3B-Instruct \
-    --num-prompts 20 \
-    --result-dir ./test_results \
-    --result-filename tp_pp.json \
-    --save-result \
-    --endpoint /v1/completions
-kill -s SIGTERM $server_pid; wait $server_pid || true
-failed_req=$(jq '.failed' ./test_results/tp_pp.json)
-if [ "$failed_req" -ne 0 ]; then
-  echo "Some requests were failed!"
-  exit 1
-fi
-
-echo "--- DP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
-vllm bench serve \
-    --backend vllm \
-    --dataset-name random \
-    --model meta-llama/Llama-3.2-3B-Instruct \
-    --num-prompts 20 \
-    --result-dir ./test_results \
-    --result-filename dp_pp.json \
-    --save-result \
-    --endpoint /v1/completions
-kill -s SIGTERM $server_pid; wait $server_pid || true
-failed_req=$(jq '.failed' ./test_results/dp_pp.json)
-if [ "$failed_req" -ne 0 ]; then
-  echo "Some requests were failed!"
-  exit 1
-fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
-    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }

 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -2,19 +2,118 @@

 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -euox pipefail
+set -ex

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+# used for TP/PP E2E test
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
-IMAGE_NAME="cpu-test-$NUMA_NODE"
-TIMEOUT_VAL=$1
-TEST_COMMAND=$2

-# building the docker image
-echo "--- :docker: Building Docker image"
-docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
+# Setup cleanup
+remove_docker_container() {
+    set -e;
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
-        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2  # only $2 (NUMA node) is read here; $1 (core range) is passed by the caller but unused
+
+  # List installed packages in both the -avx2 container and the default container
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+    set -e
+    pip list"
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"
+
+  # Offline inference smoke test, run in the -avx2 container only
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/test_onednn.py"
+
+  # Run basic model tests (generation is run twice: default and with VLLM_CPU_SGL_KERNEL=1)
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    # Note: disable until supports V1
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    pytest -x -v -s tests/models/language/generation -m cpu_model
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
+
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
+                --ignore=tests/models/multimodal/generation/test_pixtral.py \
+                -m cpu_model"
+
+  # Run compressed-tensor test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
+
+  # Run AWQ/GPTQ test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/quantization/test_cpu_wna16.py"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/lora/test_qwen2vl.py"
+
+  # Online serving, tp+pp: start server, wait for port 8000, benchmark, then stop and reap the server
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid; wait $server_pid || true'
+
+  # Online serving, tp+dp: same flow; relies on the previous server having fully released port 8000
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid; wait $server_pid || true'
+}
+
+# All CPU tests are expected to finish within the 2.5h timeout enforced below.
+export -f cpu_tests
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -1,49 +1,21 @@
 #!/bin/bash

-# This script builds the HPU docker image and runs the offline inference inside the container.
+# This script builds the HPU docker image and runs the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-#
-# vllm-gaudi compatibility pinning:
-#   The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
-#   When upstream vllm changes its API, the plugin may break before it has been updated.
-#   To handle this, the vllm-gaudi repository maintains a file:
-#     vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
-#   The first line of that file controls what version of vllm is used inside the Docker image:
-#     - "latest"        : no checkout override; the current Buildkite CI commit is used as-is.
-#     - "<commit SHA>"  : vllm is checked out to that specific commit before building, pinning
-#                         the test to a known-compatible baseline.
-#   To unpin (resume testing against the live vllm tip), set the file content back to "latest".
 set -exuo pipefail

-# Fetch the vllm community commit reference from vllm-gaudi (first line only).
-VLLM_COMMUNITY_COMMIT=$(curl -s \
-  https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
-  | head -1 | tr -d '\n')
-
-echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
-
 # Try building the docker image
-image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
-container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
-cat <<EOF | docker build -t "${image_name}" -f - .
+cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
 FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm

-# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
-# to the version known to be compatible with vllm-gaudi. When the value is "latest",
-# the current checkout (the Buildkite CI commit) is used unchanged.
-RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
-      cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
-    fi
-
 WORKDIR /workspace/vllm

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
-RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
+RUN VLLM_TARGET_DEVICE=empty pip install .
 RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

 # install development dependencies (for testing)
@@ -64,20 +36,15 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f "${container_name}" || true; }
+remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers

 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name="${container_name}" --network=host \
+docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
-  -e VLLM_SKIP_WARMUP=true \
-  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-  -e PT_HPU_LAZY_MODE=1 \
-  "${image_name}" \
-  /bin/bash -c '
-  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
-'
+  hpu-plugin-v1-test-env \
+  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"

 EXITCODE=$?
 if [ $EXITCODE -eq 0 ]; then
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -41,7 +41,6 @@ get_config() {
        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
        exit 1
    fi
-    # shellcheck source=/dev/null
    source "${TEST_RUN_CONFIG_FILE}"
    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
    return 0
@@ -49,8 +48,9 @@ get_config() {

 # get test running configuration.
 fetch_vllm_test_cfg
+get_config
 # Check if the function call was successful. If not, exit the script.
-if ! get_config; then
+if [ $? -ne 0 ]; then
  exit 1
 fi

@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
 echo "agent_idx: ${agent_idx}"
 builder_name="cachebuilder${agent_idx}"
 builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p "${builder_cache_dir}"
+mkdir -p ${builder_cache_dir}

 # Try building the docker image
 cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
-    --builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
-                           --cache-to type=local,dest="${builder_cache_dir}",mode=max \
-    --progress=plain --load -t "${image_name}" -f - .
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
+    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
+                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
+    --progress=plain --load -t ${image_name} -f - .
 FROM ${BASE_IMAGE_NAME}

 # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
 # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
 # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
 #   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns one argument per line: --device, /dev/davinciX, ...
+#   returns --device /dev/davinci0 --device /dev/davinci1
 parse_and_gen_devices() {
    local input="$1"
    local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
        return 1
    fi

+    local devices=""
    local i=0
    while (( i < cards_num )); do
        local dev_idx=$(((index - 1)*cards_num + i ))
-        printf '%s\n' "--device"
-        printf '%s\n' "/dev/davinci${dev_idx}"
+        devices="$devices --device /dev/davinci${dev_idx}"
        ((i++))
    done
+
+    # trim leading space
+    devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Output devices: assigned to the caller variable
+    printf '%s' "$devices"
 }

-mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1

 # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
 # This test checks whether the OOT platform interface is functioning properly in conjunction with
 # the hardware plugin vllm-ascend.
 model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p "${model_cache_dir}"
+mkdir -p ${model_cache_dir}
 docker run \
-    "${device_args[@]}" \
+    ${devices} \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v "${model_cache_dir}":/root/.cache/modelscope \
+    -v ${model_cache_dir}:/root/.cache/modelscope \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
+    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
+    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

 # Try building the docker image
-docker build -t "${image_name}" -f docker/Dockerfile.xpu .
+docker build -t ${image_name} -f docker/Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() {
@@ -38,18 +38,15 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    cd tests
-    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
+    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/push-nightly-builds.sh
+++ b/.buildkite/scripts/push-nightly-builds.sh
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
-# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
-# otherwise they will be cleaned up together with the main "nightly" tags.
-
-TAG_VARIANT="$1"
-if [ -n "$TAG_VARIANT" ]; then
-    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
-    TAG_NAME="$TAG_VARIANT-nightly"
-else
-    ORIG_TAG_SUFFIX=""
-    TAG_NAME="nightly"
-fi
-
-ORIG_TAG_NAME="$BUILDKITE_COMMIT"
-
-echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
-
-# pull original arch-dependent images from AWS ECR Public
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
-# tag arch-dependent images
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
-# push arch-dependent images to DockerHub
-docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
-docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
-# push arch-independent manifest to DockerHub
-docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
-docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
-docker manifest push vllm/vllm-openai:"$TAG_NAME"
-docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@@ -2,17 +2,6 @@

 set -euox pipefail

-# To detect ROCm
-# Check multiple indicators:
-if [ -e /dev/kfd ] || \
-    [ -d /opt/rocm ] || \
-    command -v rocm-smi &> /dev/null || \
-    [ -n "${ROCM_HOME:-}" ]; then
-    IS_ROCM=1
-else
-    IS_ROCM=0
-fi
-
 if [[ $# -lt 4 ]]; then
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
@@ -37,18 +26,13 @@ for command in "${COMMANDS[@]}"; do
    echo "$command"
 done

-
 start_network() {
    docker network create --subnet=192.168.10.0/24 docker-net
 }

 start_nodes() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
-        if [ "$IS_ROCM" -eq 1 ]; then
-            GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
-        else
-            GPU_DEVICES='--gpus "device='
-        fi
+        GPU_DEVICES='"device='
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
@@ -56,9 +40,7 @@ start_nodes() {
                GPU_DEVICES+=','
            fi
        done
-        if [ "$IS_ROCM" -eq 0 ]; then
-            GPU_DEVICES+='"'
-        fi
+        GPU_DEVICES+='"'

        # start the container in detached mode
        # things to note:
@@ -67,7 +49,7 @@ start_nodes() {
        # 3. map the huggingface cache directory to the container
        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
        #    starting from 192.168.10.11)
-        docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
+        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
            --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
            /bin/bash -c "tail -f /dev/null"
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Setup-and-run script for Prime-RL integration tests
+# This script prepares the environment for running Prime-RL tests with nightly vLLM, then runs them
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
+PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+    exit 0
+fi
+
+echo "Setting up Prime-RL integration test environment..."
+
+# Clean up any existing Prime-RL directory so the clone below starts fresh
+if [ -d "${PRIME_RL_DIR}" ]; then
+    echo "Removing existing Prime-RL directory..."
+    rm -rf "${PRIME_RL_DIR}"
+fi
+
+# Install UV if not available
+if ! command -v uv &> /dev/null; then
+    echo "Installing UV package manager..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    source "$HOME/.local/bin/env"
+fi
+
+# Clone Prime-RL repository at specific branch for reproducible tests
+PRIME_RL_BRANCH="integ-vllm-main"
+echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
+git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
+cd "${PRIME_RL_DIR}"
+
+echo "Setting up UV project environment..."
+export UV_PROJECT_ENVIRONMENT=/usr/local
+[ -x /usr/local/bin/python ] || ln -sf /usr/bin/python3 /usr/local/bin/python  # idempotent: keep a working interpreter, repair a missing/dangling link
+
+# Remove vllm pin from pyproject.toml
+echo "Removing vllm pin from pyproject.toml..."
+sed -i '/vllm==/d' pyproject.toml
+
+# Sync Prime-RL dependencies
+echo "Installing Prime-RL dependencies..."
+uv sync --inexact && uv sync --inexact --all-extras
+
+# Verify installation
+echo "Verifying installations..."
+uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
+
+echo "Prime-RL integration test environment setup complete!"
+
+echo "Running Prime-RL integration tests..."
+export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
+uv run pytest -vs tests/integration/test_rl.py -m gpu
+
+echo "Prime-RL integration tests completed!"
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -43,6 +43,7 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
@@ -51,14 +52,13 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
-    --all2all-backend "$BACK" \
-    --port "$PORT" &
+    --port $PORT &
  SERVER_PID=$!
-  wait_for_server "$PORT"
+  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
@@ -1,57 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# Nightly e2e test for prefetch offloading with a MoE model.
-# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
-# and validates GSM8K accuracy matches baseline (no offloading).
-#
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.25}
-NUM_Q=${2:-1319}
-PORT=${3:-8030}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
-  local port=$1
-  timeout 600 bash -c '
-    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
-      sleep 1
-    done'
-}
-
-MODEL="deepseek-ai/DeepSeek-V2-Lite"
-
-cleanup() {
-  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
-    kill "${SERVER_PID}" 2>/dev/null || true
-    for _ in {1..20}; do
-      kill -0 "${SERVER_PID}" 2>/dev/null || break
-      sleep 0.5
-    done
-    kill -9 "${SERVER_PID}" 2>/dev/null || true
-  fi
-}
-trap cleanup EXIT
-
-vllm serve "$MODEL" \
-  --max-model-len 2048 \
-  --offload-group-size 8 \
-  --offload-num-in-group 2 \
-  --offload-prefetch-step 1 \
-  --offload-params w13_weight w2_weight \
-  --port "$PORT" &
-SERVER_PID=$!
-wait_for_server "$PORT"
-
-TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
-OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
-python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
-python3 - <<PY
-import json; acc=json.load(open('${OUT}'))['accuracy']
-print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
-assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
-PY
-
-cleanup
-SERVER_PID=
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -44,23 +44,23 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
-    --all2all-backend "$BACK" \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
-    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
-    --data-parallel-size "${DATA_PARALLEL_SIZE}" \
+    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
-    --port "$PORT" &
+    --port $PORT &
  SERVER_PID=$!
-  wait_for_server "$PORT"
+  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -18,18 +18,15 @@ wait_for_server() {

 MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

-# Set BACKENDS and platform-specific args based on platform
+# Set BACKENDS based on platform
 if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
-  PLATFORM_ARGS=("--no-async-scheduling")
-  echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
 else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-  PLATFORM_ARGS=()
 fi

 cleanup() {
@@ -46,25 +43,24 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
-    --all2all-backend "$BACK" \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
-    "${PLATFORM_ARGS[@]}" \
-    --port "$PORT" &
+    --port $PORT &
  SERVER_PID=$!
-  wait_for_server "$PORT"
+  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -9,11 +9,10 @@ ENV_FILE=$1

 # For testing on local vm, use `set -a` to export all variables
 source /etc/environment
-# shellcheck source=/dev/null
-source "$ENV_FILE"
+source $ENV_FILE

 remove_docker_container() { 
-    docker rm -f "$CONTAINER_NAME" || true;
+    docker rm -f $CONTAINER_NAME || true;
 }

 trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
 echo "starting docker...$CONTAINER_NAME"
 echo    
 docker run \
- -v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
- --env-file "$ENV_FILE" \
+ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
+ --env-file $ENV_FILE \
 -e HF_TOKEN="$HF_TOKEN" \
- -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
- -e MODEL="$MODEL" \
+ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
+ -e MODEL=$MODEL \
 -e WORKSPACE=/workspace \
- --name "$CONTAINER_NAME" \
+ --name $CONTAINER_NAME \
 -d \
 --privileged \
 --network host \
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,21 +42,21 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-vllm serve "$MODEL" \
+vllm serve $MODEL \
 --seed 42 \
- --max-num-seqs "$MAX_NUM_SEQS" \
- --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
- --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
+ --max-num-seqs $MAX_NUM_SEQS \
+ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
 --no-enable-prefix-caching \
- --download_dir "$DOWNLOAD_DIR" \
- --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
+ --download_dir $DOWNLOAD_DIR \
+ --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &


 echo "wait for 20 minutes.."
 echo
 # sleep 1200
 # wait for 10 minutes...
-for _ in {1..120}; do
+for i in {1..120}; do
    # TODO: detect other type of errors.
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
        echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
 echo
 vllm bench serve \
    --backend vllm \
-    --model "$MODEL"  \
+    --model $MODEL  \
    --dataset-name sonnet \
    --dataset-path benchmarks/sonnet_4x.txt \
-    --sonnet-input-len "$INPUT_LEN" \
-    --sonnet-output-len "$OUTPUT_LEN" \
+    --sonnet-input-len $INPUT_LEN \
+    --sonnet-output-len $OUTPUT_LEN \
    --ignore-eos > "$BM_LOG"

 echo "completed..."
--- a/.buildkite/scripts/trigger-ci-build.sh
+++ b/.buildkite/scripts/trigger-ci-build.sh
@@ -1,227 +0,0 @@
-#!/bin/bash
-#
-# trigger-ci-build.sh
-# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
-# with RUN_ALL=1 and NIGHTLY=1 environment variables.
-#
-# Usage: ./trigger-ci-build.sh [options]
-#
-# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
-#
-# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
-#
-
-set -euo pipefail
-
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-# Default configuration
-PIPELINE="ci"
-DRY_RUN=true
-
-usage() {
-    cat <<EOF
-Usage: $(basename "$0") [options]
-
-Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
-Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
-
-SAFETY: Dry-run by default. Use --execute to actually trigger a build.
-
-Options:
-    --execute       Actually trigger the build (default: dry-run)
-    --pipeline      Buildkite pipeline slug (default: ${PIPELINE})
-    --commit        Override commit SHA (default: current HEAD)
-    --branch        Override branch name (default: current branch)
-    --message       Custom build message (default: auto-generated)
-    --help          Show this help message
-
-Prerequisites:
-    - bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
-    - bk configured: bk configure
-
-Examples:
-    $(basename "$0")                        # Dry-run, show what would happen
-    $(basename "$0") --execute              # Actually trigger the build
-    $(basename "$0") --pipeline ci-shadow   # Dry-run with different pipeline
-EOF
-    exit 1
-}
-
-log_info() {
-    echo -e "${BLUE}[INFO]${NC} $1"
-}
-
-log_success() {
-    echo -e "${GREEN}[OK]${NC} $1"
-}
-
-log_warn() {
-    echo -e "${YELLOW}[WARN]${NC} $1"
-}
-
-log_error() {
-    echo -e "${RED}[ERROR]${NC} $1" >&2
-}
-
-# Parse arguments
-COMMIT=""
-BRANCH=""
-MESSAGE=""
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --execute)
-            DRY_RUN=false
-            shift
-            ;;
-        --pipeline)
-            PIPELINE="$2"
-            shift 2
-            ;;
-        --commit)
-            COMMIT="$2"
-            shift 2
-            ;;
-        --branch)
-            BRANCH="$2"
-            shift 2
-            ;;
-        --message)
-            MESSAGE="$2"
-            shift 2
-            ;;
-        --help|-h)
-            usage
-            ;;
-        -*)
-            log_error "Unknown option: $1"
-            usage
-            ;;
-        *)
-            log_error "Unexpected argument: $1"
-            usage
-            ;;
-    esac
-done
-
-# Check if bk CLI is installed
-if ! command -v bk &>/dev/null; then
-    log_error "Buildkite CLI (bk) is not installed"
-    echo ""
-    echo "Install with:"
-    echo "  brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
-    echo ""
-    echo "Then configure:"
-    echo "  bk configure"
-    exit 1
-fi
-
-# Check if we're in a git repository
-if ! git rev-parse --is-inside-work-tree &>/dev/null; then
-    log_error "Not in a git repository"
-    exit 1
-fi
-
-# Get current commit and branch if not overridden
-if [[ -z "$COMMIT" ]]; then
-    COMMIT=$(git rev-parse HEAD)
-fi
-
-if [[ -z "$BRANCH" ]]; then
-    BRANCH=$(git branch --show-current)
-    if [[ -z "$BRANCH" ]]; then
-        # Detached HEAD state - try to get branch from ref
-        BRANCH=$(git rev-parse --abbrev-ref HEAD)
-    fi
-fi
-
-# Generate default message if not provided
-if [[ -z "$MESSAGE" ]]; then
-    COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
-    MESSAGE="[Manual] ${COMMIT_MSG}"
-fi
-
-# Safety check: Verify the commit exists on the remote
-log_info "Verifying commit exists on remote..."
-git fetch origin --quiet 2>/dev/null || true
-
-# Check if commit is reachable from any remote branch
-REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
-if [[ -z "$REMOTE_BRANCHES" ]]; then
-    log_error "Commit ${COMMIT} does not exist on any remote branch!"
-    echo ""
-    echo "The CI system will fail to checkout this commit."
-    echo "Please push your changes first:"
-    echo ""
-    echo "  git push origin ${BRANCH}"
-    echo ""
-    exit 1
-fi
-
-log_success "Commit found on remote branches:"
-echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/  /'
-if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
-    echo "  ... and more"
-fi
-echo ""
-
-log_info "Pipeline: ${PIPELINE}"
-log_info "Branch: ${BRANCH}"
-log_info "Commit: ${COMMIT}"
-log_info "Message: ${MESSAGE}"
-log_info "Environment: RUN_ALL=1, NIGHTLY=1"
-echo ""
-
-# Build the command
-CMD=(bk build create
-    -y
-    -w
-    -i
-    --pipeline "${PIPELINE}"
-    --commit "${COMMIT}"
-    --branch "${BRANCH}"
-    --message "${MESSAGE}"
-    --env "RUN_ALL=1"
-    --env "NIGHTLY=1"
-)
-
-if [[ "$DRY_RUN" == true ]]; then
-    echo "=========================================="
-    log_warn "DRY-RUN MODE - No build will be triggered"
-    echo "=========================================="
-    echo ""
-    echo "Command that would be executed:"
-    echo ""
-    # Escape single quotes in values for safe shell display
-    escape_for_shell() {
-        printf '%s' "$1" | sed "s/'/'\\\\''/g"
-    }
-    echo "  bk build create \\"
-    echo "    -y \\"
-    echo "    -w \\"
-    echo "    -i \\"
-    echo "    --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
-    echo "    --commit '$(escape_for_shell "${COMMIT}")' \\"
-    echo "    --branch '$(escape_for_shell "${BRANCH}")' \\"
-    echo "    --message '$(escape_for_shell "${MESSAGE}")' \\"
-    echo "    --env 'RUN_ALL=1' \\"
-    echo "    --env 'NIGHTLY=1'"
-    echo ""
-    echo "=========================================="
-    echo -e "${YELLOW}To actually trigger this build, run:${NC}"
-    echo ""
-    echo "  $0 --execute"
-    echo "=========================================="
-    exit 0
-fi
-
-log_info "Triggering build..."
-
-# Execute the command - bk will print the URL and open browser
-"${CMD[@]}"
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -1,73 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-BUCKET="vllm-wheels"
-SUBPATH=$BUILDKITE_COMMIT
-S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
-
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
-GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
-
-echo "Release version from Buildkite: $RELEASE_VERSION"
-
-if [[ -z "$GIT_VERSION" ]]; then
-    echo "[FATAL] Not on a git tag, cannot create release."
-    exit 1
-else
-    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
-fi
-# sanity check for version mismatch
-if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
-  if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
-    echo "[WARNING] Force release and ignore version mismatch"
-  else
-    echo "[FATAL] Release version from Buildkite does not match Git version."
-    exit 1
-  fi
-fi
-PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
-
-# check pypi token
-if [[ -z "$PYPI_TOKEN" ]]; then
-  echo "[FATAL] PYPI_TOKEN is not set."
-  exit 1
-else
-  export TWINE_USERNAME="__token__"
-  export TWINE_PASSWORD="$PYPI_TOKEN"
-fi
-
-set -x # avoid printing secrets above
-
-# install twine from pypi
-python3 -m venv /tmp/vllm-release-env
-source /tmp/vllm-release-env/bin/activate
-pip install twine
-python3 -m twine --version
-
-# copy release wheels to local directory
-DIST_DIR=/tmp/vllm-release-dist
-echo "Existing wheels on S3:"
-aws s3 ls "$S3_COMMIT_PREFIX"
-echo "Copying wheels to local directory"
-mkdir -p $DIST_DIR
-# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
-aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
-echo "Wheels copied to local directory"
-# generate source distribution using setup.py
-python setup.py sdist --dist-dir=$DIST_DIR
-ls -la $DIST_DIR
-
-SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
-echo "Found sdist: $SDIST_FILE"
-
-# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
-PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
-if [[ -z "$PYPI_WHEEL_FILES" ]]; then
-  echo "No default variant wheels found, quitting..."
-  exit 1
-fi
-
-python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-echo "Wheels and source distribution uploaded to PyPI"
--- a/.buildkite/scripts/upload-rocm-wheels.sh
+++ b/.buildkite/scripts/upload-rocm-wheels.sh
@@ -1,151 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-#
-# Upload ROCm wheels to S3 with proper index generation
-#
-# Required environment variables:
-#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
-#   S3_BUCKET (default: vllm-wheels)
-#
-# S3 path structure:
-#   s3://vllm-wheels/rocm/{commit}/     - All wheels for this commit
-#   s3://vllm-wheels/rocm/nightly/      - Index pointing to latest nightly
-#   s3://vllm-wheels/rocm/{version}/    - Index for release versions
-
-set -ex
-
-# ======== Configuration ========
-BUCKET="${S3_BUCKET:-vllm-wheels}"
-ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
-S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
-INDICES_OUTPUT_DIR="rocm-indices"
-PYTHON="${PYTHON_PROG:-python3}"
-
-# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
-MANYLINUX_VERSION="manylinux_2_35"
-
-echo "========================================"
-echo "ROCm Wheel Upload Configuration"
-echo "========================================"
-echo "S3 Bucket: $BUCKET"
-echo "S3 Path: $ROCM_SUBPATH"
-echo "Commit: $BUILDKITE_COMMIT"
-echo "Branch: $BUILDKITE_BRANCH"
-echo "========================================"
-
-# ======== Part 0: Setup Python ========
-
-# Detect if python3.12+ is available
-has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
-if [[ "$has_new_python" -eq 0 ]]; then
-    # Use new python from docker
-    # Use --user to ensure files are created with correct ownership (not root)
-    docker pull python:3-slim
-    PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
-fi
-
-echo "Using python interpreter: $PYTHON"
-echo "Python version: $($PYTHON --version)"
-
-# ======== Part 1: Collect and prepare wheels ========
-
-# Collect all wheels
-mkdir -p all-rocm-wheels
-cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
-cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
-
-WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
-echo "Total wheels to upload: $WHEEL_COUNT"
-
-if [ "$WHEEL_COUNT" -eq 0 ]; then
-    echo "ERROR: No wheels found to upload!"
-    exit 1
-fi
-
-# Rename linux to manylinux in wheel filenames
-for wheel in all-rocm-wheels/*.whl; do
-    if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
-        new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
-        mv -- "$wheel" "$new_wheel"
-        echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
-    fi
-done
-
-echo ""
-echo "Wheels to upload:"
-ls -lh all-rocm-wheels/
-
-# ======== Part 2: Upload wheels to S3 ========
-
-echo ""
-echo "Uploading wheels to $S3_COMMIT_PREFIX"
-for wheel in all-rocm-wheels/*.whl; do
-    aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
-done
-
-# ======== Part 3: Generate and upload indices ========
-
-# List existing wheels in commit directory
-echo ""
-echo "Generating indices..."
-obj_json="rocm-objects.json"
-aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
-
-mkdir -p "$INDICES_OUTPUT_DIR"
-
-# Use the existing generate-nightly-index.py
-# HACK: Replace regex module with stdlib re (same as CUDA script)
-sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
-
-$PYTHON .buildkite/scripts/generate-nightly-index.py \
-    --version "$ROCM_SUBPATH" \
-    --current-objects "$obj_json" \
-    --output-dir "$INDICES_OUTPUT_DIR" \
-    --comment "ROCm commit $BUILDKITE_COMMIT"
-
-# Upload indices to commit directory
-echo "Uploading indices to $S3_COMMIT_PREFIX"
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
-
-# Update rocm/nightly/ if on main branch and not a PR
-if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
-    echo "Updating rocm/nightly/ index..."
-    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
-fi
-
-# Extract version from vLLM wheel and update version-specific index
-VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
-if [ -n "$VLLM_WHEEL" ]; then
-    VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-    echo "Version in wheel: $VERSION"
-    PURE_VERSION="${VERSION%%+*}"
-    PURE_VERSION="${PURE_VERSION%%.rocm}"
-    echo "Pure version: $PURE_VERSION"
-
-    if [[ "$VERSION" != *"dev"* ]]; then
-        echo "Updating rocm/$PURE_VERSION/ index..."
-        aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
-    fi
-fi
-
-# ======== Part 4: Summary ========
-
-echo ""
-echo "========================================"
-echo "ROCm Wheel Upload Complete!"
-echo "========================================"
-echo ""
-echo "Wheels available at:"
-echo "  s3://$BUCKET/$ROCM_SUBPATH/"
-echo ""
-echo "Install command (by commit):"
-echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
-echo ""
-if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
-    echo "Install command (nightly):"
-    echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
-fi
-echo ""
-echo "Wheel count: $WHEEL_COUNT"
-echo "========================================"
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -76,15 +76,16 @@ mkdir -p "$INDICES_OUTPUT_DIR"
 # this indices have relative paths that could work as long as it is next to the wheel directory in s3
 # i.e., the wheels are always in s3://vllm-wheels/<commit>/
 # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
-alias_args=()
-if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
-    alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+else
+    alias_arg=""
 fi

 # HACK: we do not need regex module here, but it is required by pre-commit hook
 # To avoid any external dependency, we simply replace it back to the stdlib re module
 sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg

 # copy indices to /<commit>/ unconditionally
 echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,8 @@ fi
 # re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
 if [[ "$version" != *"dev"* ]]; then
    echo "Re-generating indices for /$pure_version/"
-    rm -rf "${INDICES_OUTPUT_DIR:?}/*"
+    rm -rf "$INDICES_OUTPUT_DIR/*"
    mkdir -p "$INDICES_OUTPUT_DIR"
-    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
-    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
 fi
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -4,10 +4,8 @@ depends_on:
 steps:
 - label: V1 attention (H100)
  timeout_in_minutes: 30
-  device: h100
+  gpu: h100
  source_file_dependencies:
-    - vllm/config/attention.py
-    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
@@ -15,11 +13,9 @@ steps:

 - label: V1 attention (B200)
  timeout_in_minutes: 30
-  device: b200
+  gpu: b200
  source_file_dependencies:
-    - vllm/config/attention.py
-    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
-    - pytest -v -s v1/attention
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -14,8 +14,3 @@ steps:
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
--- a/.buildkite/test_areas/benchmarks.yaml
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -17,15 +17,3 @@ steps:
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
-
- label: Attention Benchmarks Smoke Test (B200)
-  device: b200
-  num_gpus: 2
-  optional: true
-  working_dir: "/vllm-workspace/"
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - benchmarks/attention_benchmarks/
-  - vllm/v1/attention/
-  commands:
-  - python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -2,200 +2,56 @@ group: Compile
 depends_on: 
  - image-build
 steps:
- label: Sequence Parallel Correctness Tests (2 GPUs)
-  timeout_in_minutes: 50
+- label: Fusion and Compile Tests (B200)
+  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  num_devices: 2
-  source_file_dependencies:
-  - vllm/model_executor/layers/
-  - vllm/compilation/
-  - vllm/v1/worker/
-  - vllm/v1/cudagraph_dispatcher.py
-  - tests/compile/correctness_e2e/test_sequence_parallel.py
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
-
- label: Sequence Parallel Correctness Tests (2xH100)
-  timeout_in_minutes: 50
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
-
- label: AsyncTP Correctness Tests (2xH100)
-  timeout_in_minutes: 50
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
-
- label: Distributed Compile Unit Tests (2xH100)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
-  source_file_dependencies:
-  - vllm/compilation/
-  - vllm/model_executor/layers
-  - tests/compile/passes/distributed/
-  commands:
-  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -s -v tests/compile/passes/distributed
-
- label: Fusion and Compile Unit Tests (B200)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: b200
+  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/attention/attention.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
-  - tests/compile/passes/test_fusion_attn.py
-  - tests/compile/passes/test_silu_mul_quant_fusion.py
-  - tests/compile/passes/distributed/test_fusion_all_reduce.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
-    # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
    - nvidia-smi
-    - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
-    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_devices=2 is not set
-    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    # TODO(luka) move to H100 once pass tests run on H100
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Fusion E2E Quick (H100)
-  timeout_in_minutes: 15
+- label: Fusion E2E (2 GPUs)(B200)
+  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 1
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
-
- label: Fusion E2E Config Sweep (H100)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 1
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/compilation/
-    # can affect pattern matching
-    - vllm/model_executor/layers/layernorm.py
-    - vllm/model_executor/layers/activation.py
-    - vllm/model_executor/layers/attention/attention.py
-    - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run just llama3 (fp8) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
-
- label: Fusion E2E Config Sweep (B200)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/"
-  device: b200
-  num_devices: 1
+  gpu: b200
  optional: true
-  commands:
-    - nvidia-smi
-    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
-
- label: Fusion E2E TP2 Quick (H100)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
  commands:
    - nvidia-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/compilation/
-    # can affect pattern matching
-    - vllm/model_executor/layers/layernorm.py
-    - vllm/model_executor/layers/activation.py
-    - vllm/model_executor/layers/attention/attention.py
-    - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run just llama3 (fp8 & bf16) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
-
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  device: h100
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/compilation/
-    # can affect pattern matching
-    - vllm/model_executor/layers/layernorm.py
-    - vllm/model_executor/layers/activation.py
-    - vllm/model_executor/layers/attention/attention.py
-    - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run just llama3 (fp8 & bf16) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
-
- label: Fusion E2E TP2 (B200)
-  timeout_in_minutes: 20
-  working_dir: "/vllm-workspace/"
-  device: b200
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
-  commands:
-    - nvidia-smi
-    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    # for ar-rms-quant-fp4, also sweep llama3
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@@ -9,7 +9,6 @@ steps:
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py
-    - pytest -v -s cuda/test_platform_no_cuda_init.py

 - label: Cudagraph
  timeout_in_minutes: 20
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
@@ -16,9 +16,9 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

 - label: Distributed (2 GPUs)
-  timeout_in_minutes: 60
+  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
@@ -47,13 +47,14 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

 - label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
@@ -62,7 +63,6 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
-  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -97,19 +97,14 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
-  # OLD rlhf examples
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-  # NEW rlhf examples
-  - cd new_weight_syncing
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py

 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
-  device: h100
-  num_devices: 8
+  gpu: h100
+  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
@@ -125,9 +120,9 @@ steps:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: Distributed Tests (4 GPUs)(A100)
-  device: a100
+  gpu: a100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
@@ -138,23 +133,26 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Distributed Tests (2 GPUs)(H100)
-  timeout_in_minutes: 15
-  device: h100
+- label: Distributed Tests (2 GPUs)(H200)
+  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_devices: 2
+  num_gpus: 2
  commands:
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    # - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
-    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
    - pytest -v -s tests/v1/distributed/test_dbo.py

 - label: Distributed Tests (2 GPUs)(B200)
-  device: b200
+  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_devices: 2
+  num_gpus: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -163,10 +161,8 @@ steps:
 - label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  num_nodes: 2
-  no_plugin: true
-  optional: true # TODO: revert once infra issue solved
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -175,45 +171,23 @@ steps:
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
-    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"

 - label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh

- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-    - tests/v1/kv_connector/nixl_integration/
-  commands:
-    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-    - tests/v1/kv_connector/nixl_integration/
-  commands:
-    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
- label: Pipeline + Context Parallelism (4 GPUs)
+- label: Pipeline + Context Parallelism (4 GPUs))
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -222,4 +196,4 @@ steps:
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -4,36 +4,56 @@ depends_on:
 steps:
 - label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
-  device: h100
+  gpu: h100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

 - label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
-  device: h100
+  gpu: h100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
-  device: b200
+  gpu: b200
  optional: true
-  num_devices: 2
+  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
-  timeout_in_minutes: 60
-  device: h100
+- label: Prime-RL Integration (2 GPUs)
+  timeout_in_minutes: 30
  optional: true
-  num_devices: 1
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -14,7 +14,7 @@ steps:
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

- label: V1 e2e + engine (1 GPU)
+- label: V1 e2e + engine
  timeout_in_minutes: 45
  source_file_dependencies:
    - vllm/
@@ -23,48 +23,4 @@ steps:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
-    # Run this test standalone for now;
-    # need to untangle use (implicit) use of spawn/fork across the tests.
-    - pytest -v -s v1/engine/test_preprocess_error_handling.py
-    # Run the rest of v1/engine tests
-    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-      commands:
-      - pytest -v -s v1/e2e
-      - pytest -v -s v1/engine
-
- label: V1 e2e (2 GPUs)
-  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-    - vllm/
-    - tests/v1/e2e
-  commands:
-    # Only run tests that need exactly 2 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
-  mirror:
-    amd:
-      device: mi325_2
-      depends_on:
-      - image-build-amd
-
- label: V1 e2e (4 GPUs)
-  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
-  optional: true
-  num_devices: 4
-  source_file_dependencies:
-    - vllm/
-    - tests/v1/e2e
-  commands:
-    # Only run tests that need 4 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
-  mirror:
-    amd:
-      device: mi325_4
-      depends_on:
-      - image-build-amd
+    - pytest -v -s v1/engine
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -10,7 +10,7 @@ steps:
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

 - label: Entrypoints Integration (LLM)
  timeout_in_minutes: 40
@@ -24,13 +24,8 @@ steps:
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

- label: Entrypoints Integration (API Server 1)
+- label: Entrypoints Integration (API Server)
  timeout_in_minutes: 130
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
@@ -39,22 +34,10 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

- label: Entrypoints Integration (API Server 2)
-  timeout_in_minutes: 130
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/
-  - tests/entrypoints/rpc
-  - tests/entrypoints/instrumentator
-  - tests/tool_use
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/instrumentator
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
-  - pytest -v -s tool_use

 - label: Entrypoints Integration (Pooling)
  timeout_in_minutes: 50
@@ -65,20 +48,7 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

- label: Entrypoints Integration (Responses API)
-  timeout_in_minutes: 50
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/
-  - tests/entrypoints/openai/responses
-  commands:
-  - pytest -v -s entrypoints/openai/responses

 - label: Entrypoints V1
  timeout_in_minutes: 50
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -14,25 +14,10 @@ steps:
 - label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
-  - pytest -v -s distributed/test_eplb_spec_decode.py
-
- label: Elastic EP Scaling Test
-  timeout_in_minutes: 20
-  device: b200
-  optional: true
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 4
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/compilation/
-  - tests/distributed/
-  commands:
-  - pytest -v -s distributed/test_elastic_ep.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -15,9 +15,8 @@ steps:
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
+  - vllm/attention
  - vllm/v1/attention
-    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
-  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -44,8 +43,7 @@ steps:
  - vllm/envs.py
  - vllm/config
  commands:
-    - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

 - label: Kernels Mamba Test
@@ -59,8 +57,8 @@ steps:

 - label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
-  device: h100
-  num_devices: 1
+  gpu: h100
+  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
@@ -71,7 +69,7 @@ steps:
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
-    - pytest -v -s kernels/quantization/test_block_fp8.py
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
    - pytest -v -s kernels/moe/test_deepgemm.py
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py
@@ -79,7 +77,7 @@ steps:
 - label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
-  device: b200
+  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
@@ -87,13 +85,13 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
-  - vllm/v1/attention/selector.py
  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@@ -116,54 +114,4 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
-    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
-    # e2e
-    - pytest -v -s tests/models/quantization/test_nvfp4.py
-
- label: Kernels Helion Test
-  timeout_in_minutes: 30
-  device: h100
-  source_file_dependencies:
-  - vllm/utils/import_utils.py
-  - tests/kernels/helion/
-  commands:
-    - pip install helion
-    - pytest -v -s kernels/helion/
-
- 
- label: Kernels FP8 MoE Test (1 H100)
-  timeout_in_minutes: 90
-  device: h100
-  num_devices: 1
-  optional: true
-  commands:
-    - pytest -v -s kernels/moe/test_cutlass_moe.py
-    - pytest -v -s kernels/moe/test_flashinfer.py
-    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
-    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
-    - pytest -v -s kernels/moe/test_moe.py
-    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
-    - pytest -v -s kernels/moe/test_block_int8.py
-    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
-    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
-
- label: Kernels FP8 MoE Test (2 H100s)
-  timeout_in_minutes: 90
-  device: h100
-  num_devices: 2
-  optional: true
-  commands:
-    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
-    - pytest -v -s kernels/moe/test_deepep_moe.py
-
- label: Kernels Fp4 MoE Test (B200)
-  timeout_in_minutes: 60
-  device: b200
-  num_devices: 1
-  optional: true
-  commands:
-    - pytest -v -s kernels/moe/test_cutedsl_moe.py
-    - pytest -v -s kernels/moe/test_flashinfer_moe.py
-    - pytest -v -s kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -9,24 +9,24 @@ steps:
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

-# - label: LM Eval Large Models (4 GPUs)(A100)
-#   device: a100
-#   optional: true
-#   num_devices: 4
-#   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-#   source_file_dependencies:
-#   - csrc/
-#   - vllm/model_executor/layers/quantization
-#   commands:
-#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+- label: LM Eval Large Models (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (4 GPUs)(H100)
-  device: h100
+  gpu: h100
  optional: true
-  num_devices: 4
+  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -37,65 +37,10 @@ steps:

 - label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
-  device: b200
+  gpu: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
-
- label: LM Eval Large Models (H200)
-  timeout_in_minutes: 60
-  device: h200
-  optional: true
-  num_devices: 8
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
-
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
-  
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
-  device: b200
-  optional: true
-  num_devices: 2
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
-
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
-  device: b200
-  optional: true
-  num_devices: 2
-  commands:
-    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
-
- label: GPQA Eval (GPT-OSS) (H100)
-  timeout_in_minutes: 120
-  device: h100
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  - tests/evals/gpt_oss/
-  commands:
-    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
-
- label: GPQA Eval (GPT-OSS) (B200)
-  timeout_in_minutes: 120
-  device: b200
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  - tests/evals/gpt_oss/
-  commands:
-    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -14,7 +14,7 @@ steps:

 - label: LoRA TP (Distributed)
  timeout_in_minutes: 30
-  num_devices: 4
+  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
@@ -22,8 +22,6 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    # Alot of these tests are on the edge of OOMing
-    - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -9,7 +9,6 @@ steps:
    - tests/v1
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
@@ -17,8 +16,7 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    # TODO: create another `optional` test group for slow tests
-    - pytest -v -s -m 'not slow_test' v1/spec_decode
+    - pytest -v -s v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
@@ -27,19 +25,13 @@ steps:
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: V1 Others (CPU)
-  depends_on:
-    - image-build-cpu
+  depends_on: ~
  source_file_dependencies:
    - vllm/
    - tests/v1
-  device: cpu
+  no_gpu: true
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
@@ -79,7 +71,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     # for pooling models
-    - python3 pooling/embed/vision_embedding_offline.py --seed 0
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -90,7 +82,7 @@ steps:

 - label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
@@ -115,48 +107,53 @@ steps:
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
-  - tests/detokenizer
  - tests/multimodal
  - tests/utils_
  commands:
-  - pytest -v -s detokenizer
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

 - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
-  depends_on: 
-  - image-build-cpu
+  depends_on: ~
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
-  - tests/test_pooling_params.py
-  - tests/test_ray_env.py
  - tests/multimodal
-  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
-  device: cpu
+  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
-  - pytest -v -s test_pooling_params.py
-  - pytest -v -s test_ray_env.py
  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s renderers
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
  - pytest -v -s config

+- label: GPT-OSS Eval (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
-  device: h100
+  gpu: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers
@@ -165,18 +162,4 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pip install pytest-timeout pytest-forked
    - pytest -v -s v1/determinism/test_batch_invariance.py
-    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
-  
- label: Acceptance Length Test (Large Models) # optional
-  timeout_in_minutes: 25
-  gpu: h100
-  optional: true
-  num_gpus: 1
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/v1/spec_decode/
-  - vllm/model_executor/models/mlp_speculator.py
-  - tests/v1/spec_decode/test_acceptance_length.py
-  commands:
-    - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
-    - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -4,22 +4,22 @@ depends_on:
 steps:
 - label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
-  - tests/models/registry.py
  commands:
    # Run a subset of model initialization tests
    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

 - label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/test_initialization.py
-  - tests/models/registry.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
    # subset of supported models (the complement of the small subset in the above
@@ -31,27 +31,18 @@ steps:
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
-  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-    
+    - pytest -v -s models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
-  depends_on: 
-  - image-build-cpu
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
-  device: cpu
+  no_gpu: true
  commands:
    - pytest -v -s models/test_utils.py models/test_vision.py

--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -4,6 +4,7 @@ depends_on:
 steps:
 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -15,6 +16,7 @@ steps:

 - label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:

 - label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -37,7 +40,7 @@ steps:
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    # Shard hybrid language model tests
    - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:

 - label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
@@ -52,21 +56,13 @@ steps:
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-      commands:
-      - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
-      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-      - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (PPL)
  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
@@ -76,20 +72,17 @@ steps:

 - label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: Language Models Test (MTEB)
  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -14,14 +14,11 @@ steps:
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Processor Test (CPU)
-  depends_on: 
-  - image-build-cpu
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
-  - tests/models/registry.py
-  device: cpu
+  no_gpu: true
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
@@ -31,7 +28,6 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
-  - tests/models/registry.py
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
@@ -72,3 +68,12 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
@@ -19,10 +19,6 @@ steps:
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
-  # test bge_m3_sparse io_processor plugin
-  - pip install -e ./plugins/bge_m3_sparse_plugin
-  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
-  - pip uninstall bge_m3_sparse_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -3,7 +3,7 @@ depends_on:
  - image-build
 steps:
 - label: PyTorch Compilation Unit Tests
-  timeout_in_minutes: 10
+  timeout_in_minutes: 30
  source_file_dependencies:
    - vllm/
    - tests/compile
@@ -13,20 +13,10 @@ steps:
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  # However, find does not normally propagate error codes, so we combine it with xargs
-  # (using -0 for proper path handling)
-  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
-
- label: PyTorch Compilation Passes Unit Tests
-  timeout_in_minutes: 20
-  source_file_dependencies:
-    - vllm/
-    - tests/compile/passes
-  commands:
-  - pytest -s -v compile/passes --ignore compile/passes/distributed
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"

 - label: PyTorch Fullgraph Smoke Test
-  timeout_in_minutes: 35
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -38,13 +28,16 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

 - label: PyTorch Fullgraph
-  timeout_in_minutes: 30
+  timeout_in_minutes: 40
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -16,14 +16,14 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  device: b200
+  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
--- a/.buildkite/test_areas/ray_compat.yaml
+++ b/.buildkite/test_areas/ray_compat.yaml
@@ -1,16 +0,0 @@
-group: Ray Compatibility
-depends_on:
-  - image-build
-steps:
- label: Ray Dependency Compatibility Check
-  # Informational only — does not block the pipeline.
-  # If this fails, it means the PR introduces a dependency that
-  # conflicts with Ray's dependency constraints.
-  # See https://github.com/vllm-project/vllm/issues/33599
-  soft_fail: true
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - requirements/
-  - setup.py
-  commands:
-  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@@ -12,10 +12,3 @@ steps:
  commands:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-      commands:
-      - pytest -v -s samplers
--- a/.buildkite/test_areas/tool_use.yaml
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -0,0 +1,13 @@
+group: Tool use
+depends_on: 
+  - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental]
+  fast_check: false
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s tool_use
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Weight Loading Multiple GPU  # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
+  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
@@ -13,13 +13,13 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

-# - label: Weight Loading Multiple GPU - Large Models # optional
-#   working_dir: "/vllm-workspace/tests"
-#   num_devices: 2
-#   device: a100
-#   optional: true
-#   source_file_dependencies:
-#   - vllm/
-#   - tests/weight_loading
-#   commands:
-#     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+- label: Weight Loading Multiple GPU - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
--- a/.github/.bc-linter.yml
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+#   - "vllm/v1/attention/*.py"
+#   - "vllm/v1/core/*.py"
+exclude:
+  - "**/*.py"
+
+scan:
+  functions: true        # check free functions and methods
+  classes: true          # check classes/dataclasses
+  public_only: true      # ignore names starting with "_" at any level
+
+annotations:
+  include:               # decorators that force‑include a symbol
+    - name: "bc_linter_include"  # matched by simple name or dotted suffix
+      propagate_to_members: false # for classes, include methods/inner classes
+  exclude:               # decorators that force‑exclude a symbol
+    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
+      propagate_to_members: true  # for classes, exclude methods/inner classes
+
+excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,66 +2,42 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
-/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
-/vllm/lora @jeejeelee
-/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
+/vllm/attention @LucasWilkinson
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/model_executor/layers/batch_invariant.py @yewentao256 
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
-/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
+/vllm/vllm_flash_attn @LucasWilkinson
+/vllm/lora @jeejeelee
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @heheda12345
-
-# Entrypoints
-/vllm/entrypoints/anthropic @mgoin @DarkLight1337
-/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
-/vllm/entrypoints/mcp @heheda12345
-/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
-/vllm/entrypoints/openai/realtime @njhill
-/vllm/entrypoints/openai/speech_to_text @NickLucche
-/vllm/entrypoints/pooling @noooop
-/vllm/entrypoints/sagemaker @DarkLight1337
-/vllm/entrypoints/serve @njhill
-/vllm/entrypoints/*.py @njhill
-/vllm/entrypoints/chat_utils.py @DarkLight1337
-/vllm/entrypoints/llm.py @DarkLight1337
-
-# Input/Output Processing
-/vllm/sampling_params.py @njhill @NickLucche
-/vllm/pooling_params.py @noooop @DarkLight1337
-/vllm/tokenizers @DarkLight1337 @njhill
-/vllm/renderers @DarkLight1337 @njhill
-/vllm/reasoning @aarnphm @chaunceyjiang
-/vllm/tool_parsers @aarnphm @chaunceyjiang
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
-/vllm/v1/attention @LucasWilkinson @MatthewBonanni
-/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
-/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
+/vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
-/vllm/v1/kv_offload @ApostaC @orozery
-/vllm/v1/engine @njhill
-/vllm/v1/executor @njhill
-/vllm/v1/worker @njhill
-/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
+/vllm/v1/offloading @ApostaC

 # Model runner V2
-/vllm/v1/worker/gpu @WoosukKwon @njhill
-/vllm/v1/worker/gpu/kv_connector.py @orozery
+/vllm/v1/worker/gpu @WoosukKwon

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 
@@ -77,13 +53,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
-/tests/v1/kv_connector @ApostaC @orozery
-/tests/v1/kv_offload @ApostaC @orozery
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
 /tests/v1/determinism @yewentao256 

 # Transformers modeling backend
@@ -136,19 +112,19 @@ mkdocs.yaml @hmellor
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
 /vllm/model_executor/models/voxtral*.py @patrickvonplaten
 /vllm/model_executor/models/pixtral*.py @patrickvonplaten
-/vllm/tokenizers/mistral.py @patrickvonplaten
 /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
+/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

 # Kernels
-/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep
+/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/attention/ops/triton_unified_attention.py @tdoublep

 # ROCm related: specify owner with write access to notify AMD folks for careful code review
 /vllm/**/*rocm* @tjtanaa
 /docker/Dockerfile.rocm* @gshtras @tjtanaa
 /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
 /csrc/rocm @gshtras @tjtanaa
 /requirements/*rocm* @tjtanaa
@@ -173,8 +149,10 @@ mkdocs.yaml @hmellor
 /examples/pooling @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
-/vllm/model_executor/layers/pooler @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop

 # Security guide and policies
 /docs/usage/security.md @russellb
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -222,10 +222,10 @@ pull_request_rules:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
      - files~=^requirements/rocm.*\.txt
+      - files~=^vllm/attention/backends/rocm.*\.py
+      - files~=^vllm/attention/ops/rocm.*\.py
      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
-      - files~=^vllm/v1/attention/backends/rocm.*\.py
      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
-      - files~=^vllm/v1/attention/ops/rocm.*\.py
      - files~=^tests/kernels/.*_rocm.*\.py
      - files=vllm/platforms/rocm.py
      - title~=(?i)AMD
@@ -235,20 +235,6 @@ pull_request_rules:
      add:
        - rocm

- name: label-cpu
-  description: Automatically apply cpu label
-  conditions:
-    - label != stale
-    - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
-  actions:
-    label:
-      add:
-        - cpu
-    assign:
-      users:
-        - "fadara01"
-        - "aditew01"
-
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
@@ -259,7 +245,8 @@ pull_request_rules:
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/structured_outputs/structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
      - files~=^vllm/v1/structured_output/
@@ -348,18 +335,6 @@ pull_request_rules:
      add:
        - tool-calling

- name: auto-rebase if approved, ready, and 40 commits behind main
-  conditions:
-    - base = main
-    - label=ready
-    - "#approved-reviews-by >= 1"
-    - "#commits-behind >= 40"
-    - -closed
-    - -draft
-    - -conflict
-  actions:
-    rebase: {}
-
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
    - label != stale
@@ -413,18 +388,6 @@ pull_request_rules:
      remove:
        - needs-rebase

- name: label-bug
-  description: Automatically apply bug label
-  conditions:
-    - label != stale
-    - or:
-      - title~=(?i)\bbug\b
-      - title~=(?i)\bbugfix\b
-  actions:
-    label:
-      add:
-        - bug
-
 - name: label-kv-connector
  description: Automatically apply kv-connector label
  conditions:
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  bc_lint:
+    if: github.repository_owner == 'vllm-project'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Run BC Lint Action
+        uses: pytorch/test-infra/.github/actions/bc-lint@main
+        with:
+          repo: ${{ github.event.pull_request.head.repo.full_name }}
+          base_sha: ${{ github.event.pull_request.base.sha }}
+          head_sha: ${{ github.event.pull_request.head.sha }}
+          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+          config_dir: .github
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -19,7 +19,6 @@ jobs:
        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'
-          cache: 'pip'

      - name: Install Python dependencies
        run: |
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -29,9 +29,8 @@ jobs:

      - name: Install dependencies and build vLLM
        run: |
-          uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
-          uv pip install -e . --no-build-isolation
+          uv pip install -e .
        env:
          CMAKE_BUILD_PARALLEL_LEVEL: 4

--- a/.gitignore
+++ b/.gitignore
@@ -3,15 +3,10 @@

 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
-!vllm/vllm_flash_attn/__init__.py
-!vllm/vllm_flash_attn/flash_attn_interface.py

 # OpenAI triton kernels copied from source
 vllm/third_party/triton_kernels/*

-# FlashMLA interface copied from source
-vllm/third_party/flashmla/flash_mla_interface.py
-
 # triton jit
 .triton

@@ -196,9 +191,6 @@ CLAUDE.md
 AGENTS.md
 .codex/

-# Cursor
-.cursor/
-
 # DS Store
 .DS_Store

@@ -235,11 +227,3 @@ ep_kernels_workspace/

 # Allow tracked library source folders under submodules (e.g., benchmarks/lib)
 !vllm/benchmarks/lib/
-
-# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
-vllm/grpc/vllm_engine_pb2.py
-vllm/grpc/vllm_engine_pb2_grpc.py
-vllm/grpc/vllm_engine_pb2.pyi
-
-# Ignore generated cpu headers 
-csrc/cpu/cpu_attn_dispatch_generated.h
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -121,9 +121,24 @@ repos:
    name: Update Dockerfile dependency graph
    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
-  - id: check-forbidden-imports
-    name: Check for forbidden imports
-    entry: python tools/pre_commit/check_forbidden_imports.py
+  - id: enforce-import-regex-instead-of-re
+    name: Enforce import regex as re
+    entry: python tools/pre_commit/enforce_regex_import.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [regex]
+  # forbid importing triton directly
+  - id: forbid-direct-triton-import
+    name: "Forbid direct 'import triton'"
+    entry: python tools/pre_commit/check_triton_import.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [regex]
+  - id: check-pickle-imports
+    name: Prevent new pickle/cloudpickle imports
+    entry: python tools/pre_commit/check_pickle_imports.py
    language: python
    types: [python]
    additional_dependencies: [regex]
@@ -132,22 +147,6 @@ repos:
    entry: python tools/pre_commit/validate_config.py
    language: python
    additional_dependencies: [regex]
-  - id: validate-docker-versions
-    name: Validate docker/versions.json matches Dockerfile
-    entry: python tools/generate_versions_json.py --check
-    language: python
-    files: ^docker/(Dockerfile|versions\.json)$
-    pass_filenames: false
-    additional_dependencies: [dockerfile-parse]
-  - id: attention-backend-docs
-    name: Check attention backend documentation is up to date
-    entry: python tools/pre_commit/generate_attention_backend_docs.py --check
-    language: python
-  - id: check-boolean-context-manager
-    name: Check for boolean ops in with-statements
-    entry: python tools/pre_commit/check_boolean_context_manager.py
-    language: python
-    types: [python]
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
TJian	f34eca5f01	[ROCm] [Bugfix] Fix torch sdpa hallucination (#30789 ) Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com> (cherry picked from commit `2410132bb1`)	2025-12-16 17:16:25 -08:00
Wentao Ye	4cd332f3cf	[CI] Skip ci failure test (#30804 ) Signed-off-by: yewentao256 <zhyanwentao@126.com> (cherry picked from commit `b6ec077e05`)	2025-12-16 17:16:08 -08:00
Roger Wang	16484d394c	[Core][MM] Optimize encoder cache manager by operating with embeddings only (#30475 ) Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Sun Kim <sunytokki@gmail.com> (cherry picked from commit `f5f51e5931`)	2025-12-16 17:15:49 -08:00
Isotr0py	e397bd6592	[CI/Build] Skip broken ViT backend functionality test tempoarily (#30782 ) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> (cherry picked from commit `4de08ad698`)	2025-12-16 17:15:26 -08:00
Isotr0py	6a88d590bb	[Bugfix] Fix broken ViT attention selection for Blackwell device (#30731 ) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> (cherry picked from commit `e94384bbad`)	2025-12-16 17:13:54 -08:00
Shanshan Shen	ad8c073131	[CustomOp] Extract ApplyRotaryEmb as CustomOp and unify the dispatch logic (#29873 ) Signed-off-by: shen-shanshan <467638484@qq.com> Co-authored-by: gcanlin <canlinguosdu@gmail.com> Co-authored-by: TJian <tunjian.tan@embeddedllm.com> (cherry picked from commit `3bd9c49158`)	2025-12-16 17:13:23 -08:00