[Bugfix] Disable TRTLLM attention when KV transfer is enabled (#33192 )

Signed-off-by: Zhanqiu Hu <zh338@cornell.edu>
[CI][torch.compile] Reduce e2e fusion test time (#33293 )
2026-02-05 00:49:18 +00:00 · 2026-02-04 19:09:03 -05:00 · 2026-02-05 08:05:13 +08:00 · 2026-02-04 23:40:22 +00:00 · 2026-02-04 17:02:46 -05:00 · 2026-02-04 21:59:59 +00:00
2858 changed files with 340298 additions and 120938 deletions
--- a/tests/compile/piecewise/init.py
+++ b/tests/compile/piecewise/init.py
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,25 @@
+name: vllm_ci
+job_dirs:
+  - ".buildkite/image_build"
+  - ".buildkite/test_areas"
+  - ".buildkite/hardware_tests"
+run_all_patterns:
+  - "docker/Dockerfile"
+  - "CMakeLists.txt"
+  - "requirements/common.txt"
+  - "requirements/cuda.txt"
+  - "requirements/build.txt"
+  - "requirements/test.txt"
+  - "setup.py"
+  - "csrc/"
+  - "cmake/"
+run_all_exclude_patterns:
+  - "docker/Dockerfile."
+  - "csrc/cpu/"
+  - "csrc/rocm/"
+  - "cmake/hipify.py"
+  - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+  main: "vllm-ci-postmerge-repo"
+  premerge: "vllm-ci-test-repo"
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """<!DOCTYPE html>
-<html>
-    <body>
-    <h1>Links for vLLM</h1/>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
-    </body>
-</html>
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
-    print(f"Generated index.html for {args.wheel}")
-    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
-    if "x86_64" in filename:
-        x86_wheel = filename
-        arm_wheel = filename.replace("x86_64", "aarch64").replace(
-            "manylinux1", "manylinux2014"
-        )
-    elif "aarch64" in filename:
-        x86_wheel = filename.replace("aarch64", "x86_64").replace(
-            "manylinux2014", "manylinux1"
-        )
-        arm_wheel = filename
-    else:
-        raise ValueError(f"Unsupported wheel: {filename}")
-    # cloudfront requires escaping the '+' character
-    f.write(
-        template.format(
-            x86_wheel=x86_wheel,
-            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
-            arm_wheel=arm_wheel,
-            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
-        )
-    )
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -0,0 +1,29 @@
+group: Hardware
+steps:
+  - label: "AMD: :docker: build image"
+    depends_on: []
+    device: amd_cpu
+    no_plugin: true
+    commands:
+    - >
+      docker build
+      --build-arg max_jobs=16
+      --build-arg REMOTE_VLLM=1
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
+      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
+      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+      -f docker/Dockerfile.rocm
+      --target test
+      --no-cache
+      --progress plain .
+    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 1
+        - exit_status: -10  # Agent was lost
+          limit: 1
+        - exit_status: 1  # Machine occasionally fail
+          limit: 1
--- a/.buildkite/hardware_tests/arm.yaml
+++ b/.buildkite/hardware_tests/arm.yaml
@@ -0,0 +1,8 @@
+group: Hardware
+steps:
+  - label: "Arm CPU Test"
+    soft_fail: true
+    device: arm_cpu
+    no_plugin: true
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/ascend_npu.yaml
+++ b/.buildkite/hardware_tests/ascend_npu.yaml
@@ -0,0 +1,10 @@
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Ascend NPU Test"
+    soft_fail: true
+    timeout_in_minutes: 20
+    no_plugin: true
+    device: ascend_npu
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
--- a/.buildkite/hardware_tests/gh200.yaml
+++ b/.buildkite/hardware_tests/gh200.yaml
@@ -0,0 +1,10 @@
+group: Hardware
+steps:
+  - label: "GH200 Test"
+    soft_fail: true
+    device: gh200
+    no_plugin: true
+    optional: true
+    commands: 
+    - nvidia-smi 
+    - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -0,0 +1,24 @@
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Intel CPU Test"
+    soft_fail: true
+    device: intel_cpu
+    no_plugin: true
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
+  
+  - label: "Intel HPU Test"
+    soft_fail: true
+    device: intel_hpu
+    no_plugin: true
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
+
+  - label: "Intel GPU Test"
+    depends_on: []
+    soft_fail: true
+    device: intel_gpu
+    no_plugin: true
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,256 @@
+#!/bin/bash
+set -euo pipefail
+
+# replace invalid characters in Docker image tags and truncate to 128 chars
+clean_docker_tag() {
+    local input="$1"
+    echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
+}
+
+print_usage_and_exit() {
+    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+    exit 1
+}
+
+print_instance_info() {
+    echo ""
+    echo "=== Debug: Instance Information ==="
+    # Get IMDSv2 token
+    if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
+            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
+        AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
+        INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
+        INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
+        AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
+        echo "AMI ID:        ${AMI_ID}"
+        echo "Instance Type: ${INSTANCE_TYPE}"
+        echo "Instance ID:   ${INSTANCE_ID}"
+        echo "AZ:            ${AZ}"
+    else
+        echo "Not running on EC2 or IMDS not available"
+    fi
+    # Check for warm cache AMI (marker file baked into custom AMI)
+    if [[ -f /etc/vllm-ami-info ]]; then
+        echo "Cache:         warm (custom vLLM AMI)"
+        cat /etc/vllm-ami-info
+    else
+        echo "Cache:         cold (standard AMI)"
+    fi
+    echo "==================================="
+    echo ""
+}
+
+setup_buildx_builder() {
+    echo "--- :buildkite: Setting up buildx builder"
+    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
+        # Custom AMI with standalone buildkitd - use remote driver for warm cache
+        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
+        echo "Using remote driver to connect to buildkitd (warm cache available)"
+        if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
+            echo "Using existing baked-vllm-builder"
+            docker buildx use baked-vllm-builder
+        else
+            echo "Creating baked-vllm-builder with remote driver"
+            docker buildx create \
+                --name baked-vllm-builder \
+                --driver remote \
+                --use \
+                "unix://${BUILDKIT_SOCKET}"
+        fi
+        docker buildx inspect --bootstrap
+    elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
+        # Existing builder available
+        echo "Using existing builder: ${BUILDER_NAME}"
+        docker buildx use "${BUILDER_NAME}"
+        docker buildx inspect --bootstrap
+    else
+        # No local buildkitd, no existing builder - create new docker-container builder
+        echo "No local buildkitd found, using docker-container driver"
+        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
+        docker buildx inspect --bootstrap
+    fi
+
+    # builder info
+    echo "Active builder:"
+    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
+}
+
+check_and_skip_if_image_exists() {
+    if [[ -n "${IMAGE_TAG:-}" ]]; then
+        echo "--- :mag: Checking if image exists"
+        if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
+            echo "Image already exists: ${IMAGE_TAG}"
+            echo "Skipping build"
+            exit 0
+        fi
+        echo "Image not found, proceeding with build"
+    fi
+}
+
+ecr_login() {
+    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+}
+
+prepare_cache_tags() {
+    # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
+    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
+    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
+
+    if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+            cache="${MAIN_CACHE_ECR}:latest"
+        else
+            clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
+            cache="${TEST_CACHE_ECR}:${clean_branch}"
+        fi
+        CACHE_TO="$cache"
+        CACHE_FROM="$cache"
+        CACHE_FROM_BASE_BRANCH="$cache"
+    else
+        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
+        CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
+        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
+            CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
+        else
+            clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
+            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
+        fi
+    fi
+
+    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
+    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
+}
+
+resolve_parent_commit() {
+    if [[ -z "${PARENT_COMMIT:-}" ]]; then
+        PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
+        if [[ -n "${PARENT_COMMIT}" ]]; then
+            echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
+            export PARENT_COMMIT
+        else
+            echo "Could not determine parent commit (may be first commit in repo)"
+        fi
+    else
+        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
+    fi
+}
+
+print_bake_config() {
+    echo "--- :page_facing_up: Resolved bake configuration"
+    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
+    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
+    echo "--- :arrow_down: Uploading bake config to Buildkite"
+    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
+}
+
+#################################
+#         Main Script           #
+#################################
+print_instance_info
+
+if [[ $# -lt 7 ]]; then
+    print_usage_and_exit
+fi
+
+# input args
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+IMAGE_TAG=$7
+IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
+
+# build config
+TARGET="test-ci"
+VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
+BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
+CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
+CI_HCL_PATH="/tmp/ci.hcl"
+BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
+
+prepare_cache_tags
+ecr_login
+
+# Environment info (for docs and human readers)
+#   VLLM_CI_BRANCH      - ci-infra branch to use (default: main)
+#   VLLM_BAKE_FILE_PATH      - Path to vLLM's bake file (default: docker/docker-bake.hcl)
+#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
+#
+# Build configuration (exported as environment variables for bake):
+export BUILDKITE_COMMIT
+export PARENT_COMMIT
+export IMAGE_TAG
+export IMAGE_TAG_LATEST
+export CACHE_FROM
+export CACHE_FROM_BASE_BRANCH
+export CACHE_FROM_MAIN
+export CACHE_TO
+export VLLM_USE_PRECOMPILED
+export VLLM_MERGE_BASE_COMMIT
+
+# print args
+echo "--- :mag: Arguments"
+echo "REGISTRY: ${REGISTRY}"
+echo "REPO: ${REPO}"
+echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
+echo "BRANCH: ${BRANCH}"
+echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
+echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
+echo "IMAGE_TAG: ${IMAGE_TAG}"
+echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
+
+# print build configuration
+echo "--- :mag: Build configuration"
+echo "TARGET: ${TARGET}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
+echo "BUILDER_NAME: ${BUILDER_NAME}"
+echo "CI_HCL_URL: ${CI_HCL_URL}"
+echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
+
+echo "--- :mag: Cache tags"
+echo "CACHE_TO: ${CACHE_TO}"
+echo "CACHE_FROM: ${CACHE_FROM}"
+echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
+echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
+
+check_and_skip_if_image_exists
+
+echo "--- :docker: Setting up Docker buildx bake"
+echo "Target: ${TARGET}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
+echo "CI HCL path: ${CI_HCL_PATH}"
+
+if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
+    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
+    echo "Make sure you're running from the vLLM repository root"
+    exit 1
+fi
+
+echo "--- :arrow_down: Downloading ci.hcl"
+curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
+echo "Downloaded to ${CI_HCL_PATH}"
+
+if [[ ! -f "${CI_HCL_PATH}" ]]; then
+    echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
+    exit 1
+fi
+
+setup_buildx_builder
+
+resolve_parent_commit
+export PARENT_COMMIT
+
+print_bake_config
+
+echo "--- :docker: Building ${TARGET}"
+docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
+
+echo "--- :white_check_mark: Build complete"
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,58 @@
+group: Abuild
+steps:
+  - label: ":docker: Build image"
+    key: image-build
+    depends_on: []
+    commands:
+    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU image"
+    key: image-build-cpu
+    depends_on: []
+    commands:
+    - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build HPU image"
+    soft_fail: true
+    depends_on: []
+    key: image-build-hpu
+    commands:
+    - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+  
+  - label: ":docker: Build CPU arm64 image"
+    key: cpu-arm64-image-build
+    depends_on: []
+    optional: true
+    commands:
+    - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --build-arg VLLM_CPU_AMXBF16=true \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build \
+  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+  --progress plain \
+  https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@@ -1,12 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
-model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.595
+    value: 0.419
  - name: "exact_match,flexible-extract"
-    value: 0.582
+    value: 0.416
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -0,0 +1,12 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+rtol: 0.05
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -0,0 +1,15 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.695
+  - name: "exact_match,flexible-extract"
+    value: 0.447
+limit: 1319
+num_fewshot: 5
+max_model_len: 262144
+enforce_eager: false
+apply_chat_template: true
+fewshot_as_multiturn: true
+trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
@@ -0,0 +1,19 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.7142
+  - name: "exact_match,flexible-extract"
+    value: 0.4579
+env_vars:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+limit: 1319
+num_fewshot: 5
+max_model_len: 262144
+kv_cache_dtype: fp8
+enforce_eager: false
+apply_chat_template: true
+fewshot_as_multiturn: true
+trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+  - name: "mmlu_pro"
+    metrics:
+      - name: "exact_match,custom-extract"
+        value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -0,0 +1,2 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
@@ -0,0 +1,5 @@
+Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install "lm-eval[api]>=0.4.9.2"
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    t ) 
+        TP_SIZE="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install "lm-eval[api]>=0.4.9.2"
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -9,32 +9,79 @@ pytest -s -v test_lm_eval_correctness.py \
    --tp-size=1
 """

+import os
+from contextlib import contextmanager
+
 import lm_eval
 import numpy as np
 import yaml

-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+    if not new_env:
+        # Fast path: nothing to do
+        yield
+        return
+
+    old_values = {}
+    new_keys = []
+
+    try:
+        for key, value in new_env.items():
+            if key in os.environ:
+                old_values[key] = os.environ[key]
+            else:
+                new_keys.append(key)
+            os.environ[key] = str(value)
+        yield
+    finally:
+        # Restore / clean up
+        for key, value in old_values.items():
+            os.environ[key] = value
+        for key in new_keys:
+            os.environ.pop(key, None)


 def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
-    )
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=model_args,
-        tasks=[task["name"] for task in eval_config["tasks"]],
-        num_fewshot=eval_config["num_fewshot"],
-        limit=eval_config["limit"],
-        batch_size="auto",
+        f"max_model_len={max_model_len},"
+        "allow_deprecated_quantization=True,"
    )
+
+    env_vars = eval_config.get("env_vars", None)
+    with scoped_env_vars(env_vars):
+        results = lm_eval.simple_evaluate(
+            model=backend,
+            model_args=model_args,
+            tasks=[task["name"] for task in eval_config["tasks"]],
+            num_fewshot=eval_config["num_fewshot"],
+            limit=eval_config["limit"],
+            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+            # text models. however, this is regressing measured strict-match for
+            # existing text models in CI, so only apply it for mm, or explicitly set
+            apply_chat_template=eval_config.get(
+                "apply_chat_template", backend == "vllm-vlm"
+            ),
+            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+            gen_kwargs=eval_config.get("gen_kwargs"),
+            batch_size=batch_size,
+        )
    return results


@@ -43,6 +90,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):

    results = launch_lm_eval(eval_config, tp_size)

+    rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
@@ -50,8 +99,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
+                f"ground_truth={ground_truth:.3f} | "
+                f"measured={measured_value:.3f} | rtol={rtol}"
            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)

    assert success
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -1,184 +0,0 @@
-steps:
-  - label: "Wait for container to be ready"
-    key: wait-for-container-image
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: badouralix/curl-jq
-            command:
-            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
-  
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@@ -1,28 +0,0 @@
-# Nightly benchmark annotation
-
-## Description
-
-This file contains the downloading link for benchmarking results.
-
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
-
-Please download the visualization scripts in the post
-
-## Results reproduction
-
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
-    - Download `nightly-benchmarks.zip`.
-    - In the same folder, run the following code:
-
-    ```bash
-    export HF_TOKEN=<your HF token>
-    apt update
-    apt install -y git
-    unzip nightly-benchmarks.zip
-    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-    ```
-
-And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -1,39 +0,0 @@
-
-# Nightly benchmark
-
-This benchmark aims to:
-
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
-
-Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-
-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
-
-## Setup
-
- Docker images:
-    - vLLM: `vllm/vllm-openai:v0.6.2`
-    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
-    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
-    - 8x Nvidia A100 GPUs
- Workload:
-    - Dataset
-        - ShareGPT dataset
-        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-    - Models: llama-3 8B, llama-3 70B.
-        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-## Known issues
-
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,196 +0,0 @@
-common_pod_spec: &common_pod_spec
-  priorityClassName: perf-benchmark
-  nodeSelector:
-    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  volumes:
-    - name: devshm
-      emptyDir:
-        medium: Memory
-    - name: hf-cache
-      hostPath:
-        path: /root/.cache/huggingface
-        type: Directory
-
-common_container_settings: &common_container_settings
-  command:
-    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  resources:
-    limits:
-      nvidia.com/gpu: 8
-  volumeMounts:
-    - name: devshm
-      mountPath: /dev/shm
-    - name: hf-cache
-      mountPath: /root/.cache/huggingface
-  env:
-    - name: VLLM_USAGE_SOURCE
-      value: ci-test
-    - name: HF_HOME
-      value: /root/.cache/huggingface
-    - name: VLLM_SOURCE_CODE_LOC
-      value: /workspace/build/buildkite/vllm/performance-benchmark
-    - name: HF_TOKEN
-      valueFrom:
-        secretKeyRef:
-          name: hf-token-secret
-          key: token
-
-steps:
-  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-
-
-
-  - label: "A100 vllm step 10"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: vllm/vllm-openai:v0.6.2
-                <<: *common_container_settings
-
-
-
-  - label: "A100 sglang benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: lmsysorg/sglang:v0.3.2-cu121
-                <<: *common_container_settings
-
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: openmmlab/lmdeploy:v0.6.1-cu12
-                <<: *common_container_settings
-
-
-
-
-  - label: "A100 trt llama-8B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama8B"
-
-
-  - label: "A100 trt llama-70B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama70B"
-
-
-  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
-  # - label: "A100 trt benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-  #               <<: *common_container_settings
-
-
-  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
-  #               <<: *common_container_settings
-        
-  - wait
-
-  - label: "Collect the results"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-            - image: vllm/vllm-openai:v0.5.0.post1
-              command:
-              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-              resources:
-                limits:
-                  nvidia.com/gpu: 8
-              volumeMounts:
-              - name: devshm
-                mountPath: /dev/shm
-              env:
-              - name: VLLM_USAGE_SOURCE
-                value: ci-test
-              - name: VLLM_SOURCE_CODE_LOC
-                value: /workspace/build/buildkite/vllm/performance-benchmark
-              - name: HF_TOKEN
-                valueFrom:
-                  secretKeyRef:
-                    name: hf-token-secret
-                    key: token
-
-  - block: ":rocket: check the results!"
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -1,307 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import json
-import os
-from importlib import util
-
-import pandas as pd
-
-plotly_found = util.find_spec("plotly.express") is not None
-
-
-def compare_data_columns(
-    files, name_column, data_column, info_cols, drop_column, debug=False
-):
-    """
-    Align concatenation by keys derived from info_cols instead of row order.
-    - Pick one canonical key list: subset of info_cols present in ALL files.
-    - For each file: set index to those keys, aggregate duplicates
-    - (mean for metric, first for names).
-    - Concat along axis=1 (indexes align), then reset_index so callers can
-    - group by columns.
-    - If --debug, add a <file_label>_name column per file.
-    """
-    print("\ncompare_data_column:", data_column)
-
-    frames = []
-    raw_data_cols = []
-    compare_frames = []
-
-    # 1) choose a canonical key list from info_cols that exists in ALL files
-    cols_per_file = []
-    for f in files:
-        try:
-            df_tmp = pd.read_json(f, orient="records")
-        except Exception as err:
-            raise ValueError(f"Failed to read {f}") from err
-        cols_per_file.append(set(df_tmp.columns))
-
-    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
-    if not key_cols:
-        # soft fallback: use any info_cols present in the first file
-        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
-    if not key_cols:
-        raise ValueError(
-            "No common key columns found from info_cols across the input files."
-        )
-
-    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
-    meta_added = False
-
-    for file in files:
-        df = pd.read_json(file, orient="records")
-
-        # Keep rows that actually have the compared metric (same as original behavior)
-        if drop_column in df.columns:
-            df = df.dropna(subset=[drop_column], ignore_index=True)
-
-        # Stabilize numeric key columns (harmless if missing)
-        for c in (
-            "Input Len",
-            "Output Len",
-            "TP Size",
-            "PP Size",
-            "# of max concurrency.",
-            "qps",
-        ):
-            if c in df.columns:
-                df[c] = pd.to_numeric(df[c], errors="coerce")
-
-        # Ensure all key columns exist
-        for c in key_cols:
-            if c not in df.columns:
-                df[c] = pd.NA
-
-        # Set index = key_cols and aggregate duplicates → unique MultiIndex
-        df_idx = df.set_index(key_cols, drop=False)
-
-        # meta (key columns), unique per key
-        meta = df_idx[key_cols]
-        if not meta.index.is_unique:
-            meta = meta.groupby(level=key_cols, dropna=False).first()
-
-        # metric series for this file, aggregated to one row per key
-        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
-        if not s.index.is_unique:
-            s = s.groupby(level=key_cols, dropna=False).mean()
-        s.name = file_label  # column label like original
-
-        # add meta once (from first file) so keys are the leftmost columns
-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
-
-        # (NEW) debug: aligned test-name column per file
-        if debug and name_column in df_idx.columns:
-            name_s = df_idx[name_column]
-            if not name_s.index.is_unique:
-                name_s = name_s.groupby(level=key_cols, dropna=False).first()
-            name_s.name = f"{file_label}_name"
-            frames.append(name_s)
-
-        frames.append(s)
-        raw_data_cols.append(file_label)
-        compare_frames.append(s)
-
-        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
-        if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            ratio = current / base
-            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
-            frames.append(ratio)
-
-    # 4) concat on columns with aligned MultiIndex;
-    # then reset_index to return keys as columns
-    concat_df = pd.concat(frames, axis=1)
-    concat_df = concat_df.reset_index(drop=True).reset_index()
-    if "index" in concat_df.columns:
-        concat_df = concat_df.drop(columns=["index"])
-
-    # Ensure key/info columns appear first (in your info_cols order)
-    front = [c for c in info_cols if c in concat_df.columns]
-    rest = [c for c in concat_df.columns if c not in front]
-    concat_df = concat_df[front + rest]
-
-    print(raw_data_cols)
-    return concat_df, raw_data_cols
-
-
-def split_json_by_tp_pp(
-    input_file: str = "benchmark_results.json", output_root: str = "."
-) -> list[str]:
-    """
-    Split a benchmark JSON into separate folders by (TP Size, PP Size).
-
-    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
-    Returns: list of file paths written.
-    """
-    # Load JSON data into DataFrame
-    with open(input_file, encoding="utf-8") as f:
-        data = json.load(f)
-
-    # If the JSON is a dict with a list under common keys, use that list
-    if isinstance(data, dict):
-        for key in ("results", "serving_results", "benchmarks", "data"):
-            if isinstance(data.get(key), list):
-                data = data[key]
-                break
-
-    df = pd.DataFrame(data)
-
-    # Keep only "serving" tests
-    name_col = next(
-        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
-    )
-    if name_col:
-        df = df[
-            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
-        ].copy()
-
-    # Handle alias column names
-    rename_map = {
-        "tp_size": "TP Size",
-        "tensor_parallel_size": "TP Size",
-        "pp_size": "PP Size",
-        "pipeline_parallel_size": "PP Size",
-    }
-    df.rename(
-        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
-    )
-
-    # Ensure TP/PP columns exist (default to 1 if missing)
-    if "TP Size" not in df.columns:
-        df["TP Size"] = 1
-    if "PP Size" not in df.columns:
-        df["PP Size"] = 1
-
-    # make sure TP/PP are numeric ints with no NaN
-    df["TP Size"] = (
-        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-    df["PP Size"] = (
-        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-
-    # Split into separate folders
-    saved_paths: list[str] = []
-    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
-        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
-        os.makedirs(folder_name, exist_ok=True)
-        filepath = os.path.join(folder_name, "benchmark_results.json")
-        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
-        print(f"Saved: {filepath}")
-        saved_paths.append(filepath)
-
-    return saved_paths
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-f", "--file", action="append", type=str, help="input file name"
-    )
-    parser.add_argument(
-        "--debug", action="store_true", help="show all information for debugging"
-    )
-    parser.add_argument(
-        "--plot",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help="plot perf diagrams or not --no-plot --plot",
-    )
-    parser.add_argument(
-        "-x",
-        "--xaxis",
-        type=str,
-        default="# of max concurrency.",
-        help="column name to use as X Axis in comparison graph",
-    )
-    args = parser.parse_args()
-
-    drop_column = "P99"
-    name_column = "Test name"
-    info_cols = [
-        "Model",
-        "Dataset Name",
-        "Input Len",
-        "Output Len",
-        "TP Size",
-        "PP Size",
-        "# of max concurrency.",
-        "qps",
-    ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-    html_msgs_for_data_cols = [
-        "Compare Output Tokens /n",
-        "Median TTFT /n",
-        "Median TPOT /n",
-    ]
-
-    if len(args.file) == 1:
-        files = split_json_by_tp_pp(args.file[0], output_root="splits")
-        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
-    else:
-        files = args.file
-    print("comparing : " + ", ".join(files))
-    debug = args.debug
-    plot = args.plot
-    # For Plot feature, assign y axis from one of info_cols
-    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
-    with open("perf_comparison.html", "w") as text_file:
-        for i in range(len(data_cols_to_compare)):
-            output_df, raw_data_cols = compare_data_columns(
-                files,
-                name_column,
-                data_cols_to_compare[i],
-                info_cols,
-                drop_column,
-                debug=debug,
-            )
-
-            # For Plot feature, insert y axis from one of info_cols
-            raw_data_cols.insert(0, info_cols[y_axis_index])
-
-            filtered_info_cols = info_cols[:-2]
-            existing_group_cols = [
-                c for c in filtered_info_cols if c in output_df.columns
-            ]
-            if not existing_group_cols:
-                raise ValueError(
-                    f"No valid group-by columns  "
-                    f"Expected subset: {filtered_info_cols}, "
-                    f"but DataFrame has: {list(output_df.columns)}"
-                )
-            output_df_sorted = output_df.sort_values(by=existing_group_cols)
-            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
-            for name, group in output_groups:
-                html = group.to_html()
-                text_file.write(html_msgs_for_data_cols[i])
-                text_file.write(html)
-
-                if plot and plotly_found:
-                    import plotly.express as px
-
-                    df = group[raw_data_cols]
-                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                    # Melt DataFrame for plotting
-                    df_melted = df_sorted.melt(
-                        id_vars=info_cols[y_axis_index],
-                        var_name="Configuration",
-                        value_name=data_cols_to_compare[i],
-                    )
-                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                    # Create Plotly line chart
-                    fig = px.line(
-                        df_melted,
-                        x=info_cols[y_axis_index],
-                        y=data_cols_to_compare[i],
-                        color="Configuration",
-                        title=title,
-                        markers=True,
-                    )
-                    # Export to HTML
-                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-
-from transformers import AutoTokenizer
-
-
-def main(model, cachedir):
-    # Load the tokenizer and save it to the specified directory
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    tokenizer.save_pretrained(cachedir)
-    print(f"Tokenizer saved to {cachedir}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
-    )
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-    parser.add_argument(
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
-    )
-
-    args = parser.parse_args()
-    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -1,97 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
-    )
-    parser.add_argument(
-        "--results-folder",
-        type=str,
-        required=True,
-        help="The folder where the results are stored.",
-    )
-    parser.add_argument(
-        "--description", type=str, required=True, help="Description of the results."
-    )
-
-    args = parser.parse_args()
-    return args
-
-
-def get_perf(df, method, model, metric):
-    means = []
-
-    for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
-        filtered_df = df[target]
-
-        if filtered_df.empty:
-            means.append(0.0)
-        else:
-            means.append(filtered_df[metric].values[0])
-
-    return np.array(means)
-
-
-def get_perf_w_std(df, method, model, metric):
-    if metric in ["TTFT", "ITL"]:
-        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
-        mean = mean.tolist()
-        std = get_perf(df, method, model, "Std " + metric + " (ms)")
-        if std.mean() == 0:
-            std = None
-        success = get_perf(df, method, model, "Successful req.")
-        if std is not None:
-            std = std / np.sqrt(success)
-            std = std.tolist()
-
-    else:
-        assert metric == "Tput"
-        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
-        )
-        mean = mean.tolist()
-        std = None
-
-    return mean, std
-
-
-def main(args):
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file) as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
-
-    with open(args.description) as f:
-        description = f.read()
-
-    description = description.format(nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from lmdeploy.serve.openai.api_client import APIClient
-
-api_client = APIClient("http://localhost:8000")
-model_name = api_client.available_models[0]
-
-print(model_name)
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -1,78 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-
-main() {
-
-    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-    (which jq) || (apt-get update && apt-get -y install jq)
-    (which zip) || (apt-get install -y zip)
-
-    if [ ! -f /workspace/buildkite-agent ]; then
-        echo "buildkite-agent binary not found. Skip plotting the results."
-        exit 0
-    fi
-
-    # initial annotation
-    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
-
-    # download results
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    mkdir -p results/
-    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
-    ls
-    ls results/
-
-    # upload benchmark results
-    zip -r results.zip results/
-    /workspace/buildkite-agent artifact upload "results.zip"
-
-    # upload benchmarking scripts
-    cd "$VLLM_SOURCE_CODE_LOC/"
-    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
-    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    # upload benchmarking pipeline
-    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
-    
-
-
-    # The figures should be generated by a separate process outside the CI/CD pipeline
-
-    # # generate figures
-    # python3 -m pip install tabulate pandas matplotlib
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
-    #     --description $description \
-    #     --results-folder results/ 
-
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sharegpt
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_2048_128
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_128_2048
-    
-    # # upload results and figures
-    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -1,464 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-set -x
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
-  echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-  # check if HF_TOKEN is available and valid
-  if [[ -z "$HF_TOKEN" ]]; then
-    echo "Error: HF_TOKEN is not set."
-    exit 1
-  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-    echo "Error: HF_TOKEN does not start with 'hf_'."
-    exit 1
-  else
-    echo "HF_TOKEN is set and valid."
-  fi
-}
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-get_current_llm_serving_engine() {
-
-  if which lmdeploy >/dev/null; then
-    echo "Container: lmdeploy"
-    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
-    return
-  fi
-
-  if [ -e /tgi-entrypoint.sh ]; then
-    echo "Container: tgi"
-    export CURRENT_LLM_SERVING_ENGINE=tgi
-    return
-  fi
-
-  if which trtllm-build >/dev/null; then
-    echo "Container: tensorrt-llm"
-    export CURRENT_LLM_SERVING_ENGINE=trt
-    return
-  fi
-
-  if [ -e /sgl-workspace ]; then
-    echo "Container: sglang"
-    export CURRENT_LLM_SERVING_ENGINE=sglang
-    return
-  fi
-
-  if [ -e /vllm-workspace ]; then
-    echo "Container: vllm"
-    # move to a completely irrelevant directory, to avoid import vllm from current folder
-    export CURRENT_LLM_SERVING_ENGINE=vllm
-
-    return
-  fi
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-kill_gpu_processes() {
-  pkill -f '[p]ython'
-  pkill -f '[p]ython3'
-  pkill -f '[t]ritonserver'
-  pkill -f '[p]t_main_thread'
-  pkill -f '[t]ext-generation'
-  pkill -f '[l]mdeploy'
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pkill -f '[V]LLM'
-
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    sleep 1
-  done
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/v1/completions > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-ensure_installed() {
-  # Ensure that the given command is installed by apt-get
-  local cmd=$1
-  if ! which "$cmd" >/dev/null; then
-    apt-get update && apt-get install -y "$cmd"
-  fi
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # prepare tokenizer
-    # this is required for lmdeploy.
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-
-
-    # change model name for lmdeploy (it will not follow standard hf name)
-    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
-      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ $backend = "trt" ]]; then
-        backend="tensorrt-llm"
-      fi
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-
-      if [[ "$dataset_name" = "sharegpt" ]]; then
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      elif [[ "$dataset_name" = "sonnet" ]]; then
-
-        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
-        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
-        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --sonnet-input-len $sonnet_input_len \
-          --sonnet-output-len $sonnet_output_len \
-          --sonnet-prefix-len $sonnet_prefix_len \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      else
-
-        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
-        exit 1
-
-      fi
-
-
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      server_command="None"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-  done
-
-  kill_gpu_processes
-}
-
-run_genai_perf_tests() {
-  # run genai-perf tests
-
-  # $1: a json file specifying genai-perf test cases
-  local genai_perf_test_file
-  genai_perf_test_file=$1
-
-  # Iterate over genai-perf tests
-  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps=$num_prompts
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-      #TODO: add output dir.
-      client_command="genai-perf profile \
-        -m $model \
-        --service-kind openai \
-        --backend "$backend" \
-        --endpoint-type chat \
-        --streaming \
-        --url localhost:$port \
-        --request-rate $qps \
-        --num-prompts $num_prompts \
-      "
-
-    echo "Client command: $client_command"
-
-    eval "$client_command"
-
-    #TODO: process/record outputs
-    done
-  done
-
-  kill_gpu_processes
-
-}
-
-prepare_dataset() {
-
-  # download sharegpt dataset
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-
-}
-
-main() {
-
-  # check if the environment variable is successfully injected from yaml
-
-  check_gpus
-  check_hf_token
-  get_current_llm_serving_engine
-
-  pip install -U transformers
-
-  pip install -r requirements/dev.txt
-  which genai-perf
-
-  # check storage
-  df -h
-
-  ensure_installed wget
-  ensure_installed curl
-  ensure_installed jq
-  # genai-perf dependency
-  ensure_installed libb64-0d
-
-  prepare_dataset
-
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-
-  # run the test
-  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
-
-  # run genai-perf tests
-  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
-  mv artifacts/ $RESULTS_FOLDER/
-
-  # upload benchmark results to buildkite
-  python3 -m pip install tabulate pandas
-  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import datetime
-import json
-import os
-from pathlib import Path
-
-import pandas as pd
-from tabulate import tabulate
-
-results_folder = Path("results/")
-
-# serving results and the keys that will be printed into markdown
-serving_results = []
-serving_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    "completed": "Successful req.",
-    "request_throughput": "Tput (req/s)",
-    "mean_ttft_ms": "Mean TTFT (ms)",
-    "std_ttft_ms": "Std TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
-    "mean_itl_ms": "Mean ITL (ms)",
-    "std_itl_ms": "Std ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "std_tpot_ms": "Std TPOT (ms)",
-    "median_tpot_ms": "Median TPOT (ms)",
-    "total_token_throughput": "Total Token Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
-    "total_input_tokens": "Total input tokens",
-    "total_output_tokens": "Total output tokens",
-    "engine": "Engine",
-}
-
-if __name__ == "__main__":
-    # collect results
-    for test_file in results_folder.glob("*.json"):
-        with open(test_file) as f:
-            raw_result = json.loads(f.read())
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands")) as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    serving_results = pd.DataFrame.from_dict(serving_results)
-
-    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            columns=serving_column_mapping
-        )
-
-    serving_md_table_with_headers = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
-
-    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
-
-    # document benchmarking results in markdown
-    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
-        # document results with header.
-        # for those who wants to reproduce our benchmark.
-        f.write(serving_md_table_with_headers)
-        f.write("\n")
-
-    # document benchmarking results in json
-    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
-        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -1,23 +0,0 @@
-#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-else
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-fi
-
-TIMEOUT_SECONDS=10
-
-retries=0
-while [ $retries -lt 1000 ]; do
-    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
-        exit 0
-    fi
-
-    echo "Waiting for image to be available..."
-
-    retries=$((retries + 1))
-    sleep 5
-done
-
-exit 1
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@@ -1,30 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-    {
-        "test_name": "latency_llama8B_tp4",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,610 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -1,820 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_bf16_pp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_pp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@@ -1,168 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp4_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp4_random_1024_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 1024,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 100
-        }
-    },
-    {
-        "test_name": "serving_llama8B_pp6_random_1024_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 6,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 1024,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 100
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@@ -1,32 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    },
-    {
-        "test_name": "throughput_llama8B_tp4",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -2,60 +2,41 @@

 ## Introduction

-This directory contains two sets of benchmark for vllm.
-
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.

 **Benchmarking Duration**: about 1hr.

 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.

-## Nightly benchmark quick overview
-
-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
-
-**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
-
-**Benchmarking Duration**: about 3.5hrs.
-
 ## Trigger the benchmark

-Performance benchmark will be triggered when:
-
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
-
-Manually Trigger the benchmark
+The benchmark needs to be triggered manually:

 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```

 Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

-Nightly benchmark will be triggered when:
-
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
-
 ## Performance benchmark details

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
->
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
+
 ### Latency test

 Here is an example of one test inside `latency-tests.json`:
@@ -128,6 +109,65 @@ The number of this test is less stable compared to the delay and latency benchma

 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
+
+<details>
+<summary> An Example of default parameters field </summary>
+
+```json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "server_environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+    },
+    "server_parameters": {
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "block_size": 128,
+      "disable_log_stats": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 128,
+      "num_prompts": 200,
+      "ignore-eos": ""
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama3B_tp2_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 2,
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+      }
+    },
+    {
+      "test_name": "serving_qwen3_tp4_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-14B",
+        "tensor_parallel_size": 4,
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-14B",
+      }
+    },
+  ]
+}
+```
+
+</details>
+
 ### Visualizing the results

 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
@@ -136,42 +176,6 @@ If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.

-The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
-When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
-If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
+#### Performance Results Comparison  

-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-
-|   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
-|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
-| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 |
-| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 |
-
-A comparison diagram will be generated below the table.
-Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
-<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
-
-## Nightly test details
-
-See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
-
-### Workflow
-
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
-
-### Nightly tests
-
-In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
-
-### Docker containers
-
-The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
-
-WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
+Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
--- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
@@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).

@@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.

@@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -0,0 +1,825 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import argparse
+import html as _html
+import json
+import os
+from dataclasses import dataclass
+from importlib import util
+
+import pandas as pd
+
+pd.options.display.float_format = "{:.2f}".format
+plotly_found = util.find_spec("plotly.express") is not None
+
+DEFAULT_INFO_COLS = [
+    "Model",
+    "Dataset Name",
+    "Input Len",
+    "Output Len",
+    #    "TP Size",
+    #    "PP Size",
+    "# of max concurrency.",
+    "qps",
+]
+
+# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
+pd.set_option("display.precision", 2)
+pd.set_option("display.float_format", lambda x: f"{x:.2f}")
+
+
+# -----------------------------
+# Core data compare
+# -----------------------------
+def compare_data_columns(
+    files: list[str],
+    name_column: str,
+    data_column: str,
+    info_cols: list[str],
+    drop_column: str,
+    debug: bool = False,
+):
+    """
+    Align concatenation by keys derived from info_cols instead of row order.
+    - Pick one canonical key list: subset of info_cols present in ALL files.
+    - For each file: set index to those keys, aggregate duplicates
+      (mean for metric, first for names).
+    - Concat along axis=1 (indexes align), then reset_index so callers can
+      group by columns.
+    - If --debug, add a <file_label>_name column per file.
+    """
+    print("\ncompare_data_column:", data_column)
+
+    frames = []
+    raw_data_cols: list[str] = []
+    compare_frames = []
+
+    cols_per_file: list[set] = []
+    for f in files:
+        try:
+            df_tmp = pd.read_json(f, orient="records")
+        except Exception as err:
+            raise ValueError(f"Failed to read {f}") from err
+        cols_per_file.append(set(df_tmp.columns))
+
+    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
+    if not key_cols:
+        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
+    if not key_cols:
+        raise ValueError(
+            "No common key columns found from info_cols across the input files."
+        )
+
+    meta_added = False
+
+    for file in files:
+        df = pd.read_json(file, orient="records")
+
+        if drop_column in df.columns:
+            df = df.dropna(subset=[drop_column], ignore_index=True)
+
+        for c in (
+            "Input Len",
+            "Output Len",
+            "TP Size",
+            "PP Size",
+            "# of max concurrency.",
+            "qps",
+        ):
+            if c in df.columns:
+                df[c] = pd.to_numeric(df[c], errors="coerce")
+
+        for c in key_cols:
+            if c not in df.columns:
+                df[c] = pd.NA
+
+        df_idx = df.set_index(key_cols, drop=False)
+
+        meta = df_idx[key_cols]
+        if not meta.index.is_unique:
+            meta = meta.groupby(level=key_cols, dropna=False).first()
+
+        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
+        s = df_idx[data_column]
+        if not s.index.is_unique:
+            s = s.groupby(level=key_cols, dropna=False).mean()
+        s.name = file_label
+
+        if not meta_added:
+            frames.append(meta)
+            meta_added = True
+
+        if debug and name_column in df_idx.columns:
+            name_s = df_idx[name_column]
+            if not name_s.index.is_unique:
+                name_s = name_s.groupby(level=key_cols, dropna=False).first()
+            name_s.name = f"{file_label}_name"
+            frames.append(name_s)
+
+        frames.append(s)
+        raw_data_cols.append(file_label)
+        compare_frames.append(s)
+
+        if len(compare_frames) >= 2:
+            base = compare_frames[0]
+            current = compare_frames[-1]
+            if "P99" in data_column or "Median" in data_column:
+                ratio = base / current
+            else:
+                ratio = current / base
+            ratio = ratio.mask(base == 0)
+            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+            frames.append(ratio)
+
+    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
+
+    front = [c for c in info_cols if c in concat_df.columns]
+    rest = [c for c in concat_df.columns if c not in front]
+    concat_df = concat_df[front + rest]
+
+    print(raw_data_cols)
+    return concat_df, raw_data_cols
+
+
+# -----------------------------
+# Split helper
+# -----------------------------
+def split_json_by_tp_pp(
+    input_file: str = "benchmark_results.json", output_root: str = "."
+) -> list[str]:
+    with open(input_file, encoding="utf-8") as f:
+        data = json.load(f)
+
+    if isinstance(data, dict):
+        for key in ("results", "serving_results", "benchmarks", "data"):
+            if isinstance(data.get(key), list):
+                data = data[key]
+                break
+
+    df = pd.DataFrame(data)
+
+    name_col = next(
+        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
+    )
+    if name_col:
+        df = df[
+            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
+        ].copy()
+
+    rename_map = {
+        "tp_size": "TP Size",
+        "tensor_parallel_size": "TP Size",
+        "pp_size": "PP Size",
+        "pipeline_parallel_size": "PP Size",
+    }
+    df.rename(
+        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
+    )
+
+    if "TP Size" not in df.columns:
+        df["TP Size"] = 1
+    if "PP Size" not in df.columns:
+        df["PP Size"] = 1
+
+    df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
+    df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
+
+    saved_paths: list[str] = []
+    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
+        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
+        os.makedirs(folder_name, exist_ok=True)
+        filepath = os.path.join(folder_name, "benchmark_results.json")
+        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
+        print(f"Saved: {filepath}")
+        saved_paths.append(filepath)
+
+    return saved_paths
+
+
+# -----------------------------
+# Styling helpers
+# -----------------------------
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
+def _highlight_threshold(
+    df: pd.DataFrame, threshold: float
+) -> pd.io.formats.style.Styler:
+    conc_col = _find_concurrency_col(df)
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
+    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+
+    return df.style.map(
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,
+    )
+
+
+def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
+    ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
+    if not ratio_cols:
+        return styler
+
+    styler = styler.apply(
+        lambda _: ["background-color: #fff3b0"] * len(styler.data),
+        subset=ratio_cols,
+        axis=0,
+    )
+
+    styler = styler.set_table_styles(
+        [
+            {
+                "selector": f"th.col_heading.level0.col{i}",
+                "props": [("background-color", "#fff3b0")],
+            }
+            for i, col in enumerate(styler.data.columns)
+            if col in ratio_cols
+        ],
+        overwrite=False,
+    )
+    return styler
+
+
+def _apply_two_decimals(
+    styler: pd.io.formats.style.Styler,
+) -> pd.io.formats.style.Styler:
+    df = styler.data
+    num_cols = df.select_dtypes("number").columns
+    if len(num_cols) == 0:
+        return styler
+    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
+
+
+# -----------------------------
+# Valid max concurrency summary helpers
+# -----------------------------
+def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
+        if c in df.columns
+    ]
+    exclude = set(key_cols + [conc_col, "qps", "QPS"])
+
+    cols: list[str] = []
+    for c in df.columns:
+        if c in exclude:
+            continue
+        lc = str(c).lower()
+        if lc.startswith("ratio"):
+            continue
+        if lc.endswith("_name") or lc == "test name" or lc == "test_name":
+            continue
+        if pd.api.types.is_numeric_dtype(df[c]):
+            cols.append(c)
+    return cols
+
+
+def _max_concurrency_ok(
+    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+):
+    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
+        return pd.NA
+
+    d = df[[conc_col, cfg_col]].copy()
+    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+    d = d.dropna(subset=[conc_col, cfg_col])
+
+    if d.empty:
+        return pd.NA
+
+    ok = d[d[cfg_col] <= threshold]
+    if ok.empty:
+        return pd.NA
+
+    return ok[conc_col].max()
+
+
+def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
+    if (
+        df is None
+        or conc_col not in df.columns
+        or cfg_col not in df.columns
+        or pd.isna(conc_value)
+    ):
+        return pd.NA
+
+    d = df[[conc_col, cfg_col]].copy()
+    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+
+    conc_value = pd.to_numeric(conc_value, errors="coerce")
+    if pd.isna(conc_value):
+        return pd.NA
+
+    hit = d[d[conc_col] == conc_value]
+    if hit.empty:
+        return pd.NA
+    return hit[cfg_col].iloc[0]
+
+
+def build_valid_max_concurrency_summary_html(
+    tput_group_df: pd.DataFrame | None,
+    ttft_group_df: pd.DataFrame | None,
+    tpot_group_df: pd.DataFrame | None,
+    conc_col: str,
+    args,
+) -> str:
+    if ttft_group_df is None and tpot_group_df is None:
+        return ""
+
+    ttft_cols = (
+        _config_value_columns(ttft_group_df, conc_col)
+        if ttft_group_df is not None
+        else []
+    )
+    tpot_cols = (
+        _config_value_columns(tpot_group_df, conc_col)
+        if tpot_group_df is not None
+        else []
+    )
+    tput_cols = (
+        _config_value_columns(tput_group_df, conc_col)
+        if tput_group_df is not None
+        else []
+    )
+
+    if ttft_group_df is not None and tpot_group_df is not None:
+        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
+        if tput_group_df is not None:
+            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
+    else:
+        cfg_cols = ttft_cols or tpot_cols
+
+    if not cfg_cols:
+        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+    rows = []
+    for cfg in cfg_cols:
+        ttft_max = (
+            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_max = (
+            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+        both = (
+            pd.NA
+            if (pd.isna(ttft_max) or pd.isna(tpot_max))
+            else min(ttft_max, tpot_max)
+        )
+
+        tput_at_both = (
+            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
+            if tput_group_df is not None
+            else pd.NA
+        )
+        ttft_at_both = (
+            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_at_both = (
+            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+
+        rows.append(
+            {
+                "Configuration": cfg,
+                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (Both)": both,
+                "Output Tput @ Both (tok/s)": tput_at_both,
+                "TTFT @ Both (ms)": ttft_at_both,
+                "TPOT @ Both (ms)": tpot_at_both,
+            }
+        )
+
+    summary_df = pd.DataFrame(rows)
+
+    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
+    for c in summary_df.columns:
+        if c == "Configuration":
+            continue
+        summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
+
+    both_col = f"Max {conc_col} (Both)"
+
+    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
+    formatters = {}
+    for c in summary_df.columns:
+        if c == "Configuration":
+            continue
+        # default argument binds per-column formatter correctly
+        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
+
+    styler = summary_df.style.format(formatters)
+
+    def _green(v):
+        return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
+
+    if both_col in summary_df.columns:
+        styler = styler.map(_green, subset=[both_col])
+
+    title = (
+        '<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
+        "Valid Max Concurrency Summary"
+        "</div>\n"
+    )
+    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+# -----------------------------
+# Plot helper
+# -----------------------------
+def _add_limit_line(fig, y_value: float, label: str):
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    if plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash",
+                    color="red" if "ttft" in label.lower() else "blue",
+                ),
+                name=label,
+            )
+        )
+
+
+# -----------------------------
+# Refactored main + group-first report
+# -----------------------------
+@dataclass(frozen=True)
+class MetricPlan:
+    data_cols: list[str]
+    drop_column: str
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-f", "--file", action="append", type=str, help="input file name"
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="show all information for debugging"
+    )
+    parser.add_argument(
+        "--plot",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="plot perf diagrams or not --no-plot --plot",
+    )
+    parser.add_argument(
+        "-x",
+        "--xaxis",
+        type=str,
+        default="# of max concurrency.",
+        help="column name to use as X Axis in comparison graph",
+    )
+    parser.add_argument(
+        "-l",
+        "--latency",
+        type=str,
+        default="p99",
+        help="take median|p99 for latency like TTFT/TPOT",
+    )
+    parser.add_argument(
+        "--ttft-max-ms",
+        type=float,
+        default=3000.0,
+        help="Reference limit for TTFT plots (ms)",
+    )
+    parser.add_argument(
+        "--tpot-max-ms",
+        type=float,
+        default=100.0,
+        help="Reference limit for TPOT plots (ms)",
+    )
+    return parser
+
+
+def choose_metrics(latency: str) -> MetricPlan:
+    latency = (latency or "").lower()
+    drop_column = "P99"
+
+    if "median" in latency:
+        return MetricPlan(
+            data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
+            drop_column=drop_column,
+        )
+
+    return MetricPlan(
+        data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
+        drop_column=drop_column,
+    )
+
+
+def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
+    if not args.file:
+        raise ValueError("No input files provided. Use -f/--file.")
+
+    if len(args.file) == 1:
+        files = split_json_by_tp_pp(args.file[0], output_root="splits")
+        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
+    else:
+        files = args.file
+
+    return files, info_cols
+
+
+def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
+    y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
+    return info_cols[y_axis_index]
+
+
+def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
+    filtered_info_cols = info_cols[:4]
+    group_cols = [c for c in filtered_info_cols if c in output_df.columns]
+    if not group_cols:
+        raise ValueError(
+            f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
+            f"but DataFrame has: {list(output_df.columns)}"
+        )
+    return group_cols
+
+
+def normalize_group_key(name):
+    return name if isinstance(name, tuple) else (name,)
+
+
+def group_filename(name, prefix: str = "perf_comparison_") -> str:
+    name_vals = normalize_group_key(name)
+    safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
+    return f"{prefix}{safe}.html"
+
+
+def build_group_suffix(group_cols: list[str], name) -> str:
+    name_vals = normalize_group_key(name)
+    return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
+
+
+def render_metric_table_html(
+    display_group: pd.DataFrame,
+    metric_label: str,
+    group_suffix: str,
+    args,
+) -> str:
+    title = (
+        f'<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
+        f"{_html.escape(metric_label)}"
+        f" — {_html.escape(group_suffix)}"
+        f"</div>\n"
+    )
+
+    metric_name = metric_label.lower()
+    if "ttft" in metric_name:
+        styler = _highlight_threshold(display_group, args.ttft_max_ms)
+    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+        styler = _highlight_threshold(display_group, args.tpot_max_ms)
+    else:
+        styler = display_group.style
+
+    styler = _apply_two_decimals(styler)
+    styler = highlight_ratio_columns(styler)
+
+    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+def maybe_write_plot(
+    main_fh,
+    sub_fh,
+    group_df: pd.DataFrame,
+    raw_data_cols: list[str],
+    metric_label: str,
+    y_axis_col: str,
+    args,
+):
+    if not (args.plot and plotly_found):
+        return
+
+    import plotly.express as px
+
+    df = group_df[raw_data_cols].sort_values(by=y_axis_col)
+    df_melted = df.melt(
+        id_vars=y_axis_col,
+        var_name="Configuration",
+        value_name=metric_label,
+    )
+
+    fig = px.line(
+        df_melted,
+        x=y_axis_col,
+        y=metric_label,
+        color="Configuration",
+        title=f"{metric_label} vs {y_axis_col}",
+        markers=True,
+    )
+
+    # Ensure plot hover + y tick labels are also 2 decimals.
+    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
+    fig.update_yaxes(tickformat=".2f")
+
+    metric_name = metric_label.lower()
+    if "ttft" in metric_name:
+        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+    html = fig.to_html(full_html=True, include_plotlyjs="cdn")
+    main_fh.write(html)
+    sub_fh.write(html)
+
+
+def build_group_keys(
+    df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
+):
+    if sort_cols:
+        df = df.sort_values(by=sort_cols)
+    gb = df.groupby(group_cols, dropna=False)
+    return [k for k, _ in gb]
+
+
+def write_report_group_first(
+    files: list[str], info_cols: list[str], plan: MetricPlan, args
+):
+    name_column = "Test name"
+    y_axis_col = get_y_axis_col(info_cols, args.xaxis)
+
+    print("comparing : " + ", ".join(files))
+
+    metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
+    group_cols_canonical: list[str] | None = None
+
+    for metric_label in plan.data_cols:
+        output_df, raw_data_cols = compare_data_columns(
+            files,
+            name_column,
+            metric_label,
+            info_cols,
+            plan.drop_column,
+            debug=args.debug,
+        )
+
+        raw_data_cols = list(raw_data_cols)
+        raw_data_cols.insert(0, y_axis_col)
+
+        group_cols = get_group_cols(output_df, info_cols)
+        if group_cols_canonical is None:
+            group_cols_canonical = group_cols
+        else:
+            group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
+
+        metric_cache[metric_label] = (
+            output_df.sort_values(by=args.xaxis),
+            raw_data_cols,
+        )
+
+    if not group_cols_canonical:
+        raise ValueError("No canonical group columns found across metrics.")
+
+    first_metric = plan.data_cols[0]
+    first_df_sorted, _ = metric_cache[first_metric]
+    group_keys = build_group_keys(
+        first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
+    )
+
+    metric_groupbys = {
+        metric_label: df.groupby(group_cols_canonical, dropna=False)
+        for metric_label, (df, _) in metric_cache.items()
+    }
+
+    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+        main_fh.write('<meta charset="utf-8">\n')
+        for gkey in group_keys:
+            gkey_tuple = normalize_group_key(gkey)
+            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+            sub_path = group_filename(gkey_tuple)
+            group_header = (
+                '<div style="font-size: 1.4em; font-weight: 700; '
+                'margin: 18px 0 10px 0;">'
+                f"{_html.escape(suffix)}"
+                "</div>\n"
+            )
+
+            main_fh.write(group_header)
+            with open(sub_path, "w", encoding="utf-8") as sub_fh:
+                sub_fh.write('<meta charset="utf-8">\n')
+                sub_fh.write(group_header)
+                tput_group_df = None
+                ttft_group_df = None
+                tpot_group_df = None
+                conc_col = args.xaxis
+
+                for metric_label in plan.data_cols:
+                    gb = metric_groupbys[metric_label]
+                    df_sorted, raw_data_cols = metric_cache[metric_label]
+
+                    try:
+                        group_df = gb.get_group(gkey)
+                    except KeyError:
+                        missing = (
+                            '<div style="font-size: 1.1em; font-weight: 600; '
+                            'margin: 10px 0;">'
+                            f"{_html.escape(metric_label)} — missing for this group"
+                            "</div>\n"
+                        )
+
+                        main_fh.write(missing)
+                        sub_fh.write(missing)
+                        continue
+
+                    if conc_col not in group_df.columns:
+                        conc_col = _find_concurrency_col(group_df)
+
+                    mn = metric_label.lower().strip()
+                    if "tok/s" in mn:
+                        tput_group_df = group_df
+                    elif "ttft" in mn:
+                        ttft_group_df = group_df
+                    elif mn in ("p99", "median") or "tpot" in mn:
+                        tpot_group_df = group_df
+
+                    display_group = group_df.drop(
+                        columns=group_cols_canonical, errors="ignore"
+                    )
+
+                    html = render_metric_table_html(
+                        display_group, metric_label, suffix, args
+                    )
+                    main_fh.write(html)
+                    sub_fh.write(html)
+
+                    maybe_write_plot(
+                        main_fh,
+                        sub_fh,
+                        group_df=group_df,
+                        raw_data_cols=raw_data_cols,
+                        metric_label=metric_label,
+                        y_axis_col=y_axis_col,
+                        args=args,
+                    )
+
+                summary_html = build_valid_max_concurrency_summary_html(
+                    tput_group_df=tput_group_df,
+                    ttft_group_df=ttft_group_df,
+                    tpot_group_df=tpot_group_df,
+                    conc_col=conc_col,
+                    args=args,
+                )
+                if summary_html:
+                    main_fh.write(summary_html)
+                    sub_fh.write(summary_html)
+
+
+def main():
+    args = build_parser().parse_args()
+    info_cols = list(DEFAULT_INFO_COLS)
+    plan = choose_metrics(args.latency)
+    files, info_cols = prepare_input_files(args, info_cols)
+    write_report_group_first(files, info_cols, plan, args)
+
+
+if __name__ == "__main__":
+    main()
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -63,9 +63,11 @@ serving_column_mapping = {
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
+    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
+    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
@@ -368,7 +370,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
        )

    # get markdown tables
@@ -390,8 +392,8 @@ if __name__ == "__main__":
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
-            + "performance-benchmarks-descriptions.md"
+            "../.buildkite/performance-benchmarks/"
+            "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
--- a/.buildkite/performance-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -15,6 +15,8 @@ check_gpus() {
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
+
+  declare -g arch_suffix=''
+
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
  fi
  echo "GPU type is $gpu_type"
 }
@@ -41,7 +49,11 @@ check_cpus() {
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
-  declare -g gpu_type="cpu"
+  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
+    declare -g gpu_type="arm64-cpu"
+  else
+    declare -g gpu_type="cpu"
+  fi
  echo "GPU type is $gpu_type"
 }

@@ -102,7 +114,8 @@ json2envs() {
 wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
+  local timeout_val="1200"
+  timeout "$timeout_val" bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
@@ -138,6 +151,10 @@ kill_gpu_processes() {
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
  fi

  # remove vllm config file
@@ -164,19 +181,20 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

-run_latency_tests() {
-  # run latency tests using `vllm bench latency` command
-  # $1: a json file specifying latency test cases
+run_benchmark_tests() {
+  # run benchmark tests using `vllm bench <test_type>` command
+  # $1: test type (latency or throughput)
+  # $2: a json file specifying test cases

-  local latency_test_file
-  latency_test_file=$1
+  local test_type=$1
+  local test_file=$2

-  # Iterate over latency tests
-  jq -c '.[]' "$latency_test_file" | while read -r params; do
+  # Iterate over tests
+  jq -c '.[]' "$test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^latency_ ]]; then
-      echo "In latency-test.json, test_name must start with \"latency_\"."
+    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
+      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
      exit 1
    fi

@@ -187,15 +205,15 @@ run_latency_tests() {
    fi

    # get arguments
-    latency_params=$(echo "$params" | jq -r '.parameters')
-    latency_args=$(json2args "$latency_params")
-    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    latency_envs=$(json2envs "$latency_environment_variables")
+    bench_params=$(echo "$params" | jq -r '.parameters')
+    bench_args=$(json2args "$bench_params")
+    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    bench_envs=$(json2envs "$bench_environment_variables")

    # check if there is enough GPU to run the test
-    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -208,108 +226,85 @@ run_latency_tests() {
      fi
    fi

-    latency_command=" $latency_envs vllm bench latency \
+    bench_command=" $bench_envs vllm bench $test_type \
      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $latency_args"
+      $bench_args"

    echo "Running test case $test_name"
-    echo "Latency command: $latency_command"
+    echo "${test_type^} command: $bench_command"

-    # recoding benchmarking command ang GPU command
+    # recording benchmarking command and GPU command
    jq_output=$(jq -n \
-      --arg latency "$latency_command" \
+      --arg command "$bench_command" \
      --arg gpu "$gpu_type" \
+      --arg test_type "$test_type" \
      '{
-        latency_command: $latency,
+        ($test_type + "_command"): $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
-    eval "$latency_command"
+    eval "$bench_command"

    kill_gpu_processes

  done
 }

+run_latency_tests() {
+  run_benchmark_tests "latency" "$1"
+}
+
+run_startup_tests() {
+  run_benchmark_tests "startup" "$1"
+}
+
 run_throughput_tests() {
-  # run throughput tests using `vllm bench throughput`
-  # $1: a json file specifying throughput test cases
-
-  local throughput_test_file
-  throughput_test_file=$1
-
-  # Iterate over throughput tests
-  jq -c '.[]' "$throughput_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^throughput_ ]]; then
-      echo "In throughput-test.json, test_name must start with \"throughput_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # get arguments
-    throughput_params=$(echo "$params" | jq -r '.parameters')
-    throughput_args=$(json2args "$throughput_params")
-    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    throughput_envs=$(json2envs "$throughput_environment_variables")
-
-    # check if there is enough GPU to run the test
-    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
-        continue
-      fi
-    else
-      if [[ $gpu_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-        continue
-      fi
-    fi
-
-    throughput_command=" $throughput_envs vllm bench throughput \
-      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $throughput_args"
-
-    echo "Running test case $test_name"
-    echo "Throughput command: $throughput_command"
-    # recoding benchmarking command ang GPU command
-    jq_output=$(jq -n \
-      --arg command "$throughput_command" \
-      --arg gpu "$gpu_type" \
-      '{
-        throughput_command: $command,
-        gpu_type: $gpu
-      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
-
-    # run the benchmark
-    eval "$throughput_command"
-
-    kill_gpu_processes
-
-  done
+  run_benchmark_tests "throughput" "$1"
 }

 run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
+  jq -c '
+    if type == "array" then
+      # Plain format: test cases array
+      .[]
+    elif (type == "object" and has("tests")) then
+      # merge the default parameters into each test cases
+      . as $root
+      | ($root.defaults // {}) as $d
+      | ($root.tests // [])[]
+      # default qps / max_concurrency from defaults if missing
+      | .qps_list = (.qps_list // $d.qps_list)
+      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+      # merge envs / params: test overrides defaults
+      | .server_environment_variables =
+          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+      | .server_parameters =
+          (($d.server_parameters // {}) + (.server_parameters // {}))
+      | .client_parameters =
+          (($d.client_parameters // {}) + (.client_parameters // {}))
+    else
+      error("Unsupported serving test file format: must be array or object with .tests")
+    end
+  ' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -323,28 +318,33 @@ run_serving_tests() {
      continue
    fi

-    # get client and server arguments
+    # get client and server arguments (after merged the default parameters)
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
+
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
+
+    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
+
+    # max_concurrency_list (fallback to num_prompts if missing)
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-        max_concurrency_list="[$num_prompts]"
+      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"

    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -393,6 +393,11 @@ run_serving_tests() {
      fi
    fi

+    # save the compilation mode and optimization level on the serving results
+    # whenever they are set
+    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
+    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
+
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
@@ -406,15 +411,15 @@ run_serving_tests() {
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
-        # pass the tensor parallel size to the client so that it can be displayed
-        # on the benchmark dashboard
+        # pass the tensor parallel size, the compilation mode, and the optimization
+        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
-          --metadata "tensor_parallel_size=$tp" \
+          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
@@ -446,11 +451,12 @@ run_serving_tests() {
 main() {
  local ARCH
  ARCH=''
-  if [ "$ON_CPU" == "1" ];then
-     check_cpus
-     ARCH='-cpu'
+  if [[ "$ON_CPU" == "1" ]]; then
+    check_cpus
+    ARCH="-$gpu_type"
  else
     check_gpus
+     ARCH="$arch_suffix"
  fi
  check_hf_token

@@ -469,11 +475,17 @@ main() {
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
+
+  # dump vllm info via vllm collect-env
+  env_output=$(vllm collect-env)
+
+  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
+  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
--- a/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
--- a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
@@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
@@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "latency_llama8B_tp2",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/latency-tests.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests.json
--- a/.buildkite/performance-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/performance-benchmarks/tests/nightly-tests.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
@@ -0,0 +1,130 @@
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [
+      12,
+      16,
+      24,
+      32,
+      64,
+      128,
+      200
+    ],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -0,0 +1,283 @@
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp1_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp2_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp4_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama3B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_granite2B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen1.7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen4B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen8B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_glm9B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_gemma7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "google/gemma-7b",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "google/gemma-7b",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -0,0 +1,82 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
@@ -0,0 +1,27 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
@@ -0,0 +1,27 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp2",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -0,0 +1,61 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests.json
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,171 +1,713 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
-  - label: "Build arm64 wheel - CUDA 12.9"
-    depends_on: ~
-    id: build-wheel-arm64-cuda-12-9
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - label: "Build wheel - CUDA 12.8"
-    depends_on: ~
-    id: build-wheel-cuda-12-8
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - label: "Build wheel - CUDA 12.6"
-    depends_on: ~
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
-    depends_on: ~
-    id: build-wheel-cuda-12-9
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - label: "Build release image (x86)"
-    depends_on: ~
-    id: build-release-image-x86
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-      # re-tag to default image tag and push, just in case arm64 build fails
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
-  - label: "Build release image (arm64)"
-    depends_on: ~
-    id: build-release-image-arm64
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-
-  # Add job to create multi-arch manifest
-  - label: "Create multi-arch manifest"
-    depends_on:
-      - build-release-image-x86
-      - build-release-image-arm64
-    id: create-multi-arch-manifest
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
-      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-  - label: "Annotate release workflow"
-    depends_on:
-      - create-multi-arch-manifest
-      - build-wheel-cuda-12-8
-    id: annotate-release-workflow
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-release.sh"
-
-  - label: "Build and publish TPU release image"
-    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: tpu_queue_postmerge
-    commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
-      - "docker push vllm/vllm-tpu:nightly"
-      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-
  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version

-  - block: "Build CPU release image"
-    key: block-cpu-release-image-build
+  - group: "Build Python wheels"
+    key: "build-wheels"
+    steps:
+      - label: "Build wheel - aarch64 - CUDA 12.9"
+        depends_on: ~
+        id: build-wheel-arm64-cuda-12-9
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+      - label: "Build wheel - aarch64 - CUDA 13.0"
+        depends_on: ~
+        id: build-wheel-arm64-cuda-13-0
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+      - label: "Build wheel - aarch64 - CPU"
+        depends_on: ~
+        id: build-wheel-arm64-cpu
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+      - label: "Build wheel - x86_64 - CUDA 12.9"
+        depends_on: ~
+        id: build-wheel-x86-cuda-12-9
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+      - label: "Build wheel - x86_64 - CUDA 13.0"
+        depends_on: ~
+        id: build-wheel-x86-cuda-13-0
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+      - label: "Build wheel - x86_64 - CPU"
+        depends_on: ~
+        id: build-wheel-x86-cpu
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+  - group: "Build release Docker images"
+    key: "build-release-images"
+    steps:
+      - label: "Build release image - x86_64 - CUDA 12.9"
+        depends_on: ~
+        id: build-release-image-x86
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+          # re-tag to default image tag and push, just in case arm64 build fails
+          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+      - label: "Build release image - aarch64 - CUDA 12.9"
+        depends_on: ~
+        id: build-release-image-arm64
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+      - label: "Build release image - x86_64 - CUDA 13.0"
+        depends_on: ~
+        id: build-release-image-x86-cuda-13-0
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+          # re-tag to default image tag and push, just in case arm64 build fails
+          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+      - label: "Build release image - aarch64 - CUDA 13.0"
+        depends_on: ~
+        id: build-release-image-arm64-cuda-13-0
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+
+      - block: "Build release image for x86_64 CPU"
+        key: block-cpu-release-image-build
+        depends_on: ~
+
+      - label: "Build release image - x86_64 - CPU"
+        depends_on:
+          - block-cpu-release-image-build
+          - input-release-version
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+      - block: "Build release image for arm64 CPU"
+        key: block-arm64-cpu-release-image-build
+        depends_on: ~
+
+      - label: "Build release image - arm64 - CPU"
+        depends_on: 
+          - block-arm64-cpu-release-image-build
+          - input-release-version
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+        env:
+          DOCKER_BUILDKIT: "1"
+
+  - group: "Publish release images"
+    key: "publish-release-images"
+    steps:
+      - label: "Create multi-arch manifest - CUDA 12.9"
+        depends_on:
+          - build-release-image-x86
+          - build-release-image-arm64
+        id: create-multi-arch-manifest
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+      - label: "Annotate release workflow - CUDA 12.9"
+        depends_on:
+          - create-multi-arch-manifest
+        id: annotate-release-workflow
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "bash .buildkite/scripts/annotate-release.sh"
+
+      - label: "Create multi-arch manifest - CUDA 13.0"
+        depends_on:
+          - build-release-image-x86-cuda-13-0
+          - build-release-image-arm64-cuda-13-0
+        id: create-multi-arch-manifest-cuda-13-0
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
+          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+      - label: "Publish nightly multi-arch image to DockerHub"
+        depends_on:
+          - create-multi-arch-manifest
+        if: build.env("NIGHTLY") == "1"
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "bash .buildkite/scripts/push-nightly-builds.sh"
+          # Clean up old nightly builds (keep only last 14)
+          - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+        plugins:
+          - docker-login#v3.0.0:
+              username: vllmbot
+              password-env: DOCKERHUB_TOKEN
+        env:
+          DOCKER_BUILDKIT: "1"
+          DOCKERHUB_USERNAME: "vllmbot"
+
+      - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
+        depends_on:
+          - create-multi-arch-manifest-cuda-13-0
+        if: build.env("NIGHTLY") == "1"
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+          # Clean up old nightly builds (keep only last 14)
+          - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+        plugins:
+          - docker-login#v3.0.0:
+              username: vllmbot
+              password-env: DOCKERHUB_TOKEN
+        env:
+          DOCKER_BUILDKIT: "1"
+          DOCKERHUB_USERNAME: "vllmbot"
+
+  - group: "Publish wheels"
+    key: "publish-wheels"
+    steps:
+      - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+        key: block-upload-release-wheels
+        depends_on:
+          - input-release-version
+          - build-wheels
+
+      - label: "Upload release wheels to PyPI"
+        depends_on:
+          - block-upload-release-wheels
+        id: upload-release-wheels
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
+
+  # =============================================================================
+  # ROCm Release Pipeline (x86_64 only)
+  # =============================================================================
+  #
+  # vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
+  # To build a specific version, trigger the build from that branch/tag.
+  #
+  # Environment variables for ROCm builds (set via Buildkite UI or schedule):
+  #   ROCM_PYTHON_VERSION: Python version (default: 3.12)
+  #   PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
+  #   ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
+  #   ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
+  #
+  # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
+  #       (currently rocm/dev-ubuntu-22.04:7.1-complete)
+  #
+  # =============================================================================
+
+  # ROCm Input Step - Collect build configuration (manual trigger only)
+  - input: "ROCm Wheel Release Build Configuration"
+    key: input-rocm-config
    depends_on: ~
+    if: build.source == "ui"
+    fields:
+      - text: "Python Version"
+        key: "rocm-python-version"
+        default: "3.12"
+        hint: "Python version (e.g., 3.12)"
+      - text: "GPU Architectures"
+        key: "rocm-pytorch-rocm-arch"
+        default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
+        hint: "Semicolon-separated GPU architectures"
+      - select: "Upload Wheels to S3"
+        key: "rocm-upload-wheels"
+        default: "true"
+        options:
+          - label: "No - Build only (nightly/dev)"
+            value: "false"
+          - label: "Yes - Upload to S3 (release)"
+            value: "true"
+      - select: "Force Rebuild Base Wheels"
+        key: "rocm-force-rebuild"
+        default: "false"
+        hint: "Ignore S3 cache and rebuild base wheels from scratch"
+        options:
+          - label: "No - Use cached wheels if available"
+            value: "false"
+          - label: "Yes - Rebuild even if cache exists"
+            value: "true"

-  - label: "Build and publish CPU release image"
-    depends_on: block-cpu-release-image-build
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - label: "Build and publish nightly multi-arch image to DockerHub"
+  # ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
+  - label: ":rocm: Build ROCm Base Wheels"
+    id: build-rocm-base-wheels
    depends_on:
-      - create-multi-arch-manifest
-    if: build.env("NIGHTLY") == "1"
+      - step: input-rocm-config
+        allow_failure: true  # Allow failure so non-UI builds can proceed (input step is skipped)
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      # Clean up old nightly builds (keep only last 14)
-      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
+      # Set configuration and check cache
+      - |
+        set -euo pipefail
+
+        # Get values from meta-data (set by input step) or use defaults
+        PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
+        export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
+
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        # Check for force rebuild flag
+        ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
+        if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
+          ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
+        fi
+
+        echo "========================================"
+        echo "ROCm Base Wheels Build Configuration"
+        echo "========================================"
+        echo "  PYTHON_VERSION: $${PYTHON_VERSION}"
+        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+        echo "  ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
+        echo "========================================"
+
+        # Save resolved config for later jobs
+        buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
+        buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
+
+        # Check S3 cache for pre-built wheels
+        CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
+        CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
+        echo ""
+        echo "Cache key: $${CACHE_KEY}"
+        echo "Cache path: $${CACHE_PATH}"
+
+        # Save cache key for downstream jobs
+        buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
+
+        CACHE_STATUS="miss"
+        if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
+          CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
+        else
+          echo "Force rebuild requested, skipping cache check"
+        fi
+
+        if [ "$${CACHE_STATUS}" = "hit" ]; then
+          echo ""
+          echo "CACHE HIT! Downloading pre-built wheels..."
+          echo ""
+          .buildkite/scripts/cache-rocm-base-wheels.sh download
+
+          # Set the S3 path for the cached Docker image (for Job 2 to download)
+          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Mark that we used cache (for Docker image handling)
+          buildkite-agent meta-data set "rocm-used-cache" "true"
+
+          echo ""
+          echo "Cache download complete. Skipping Docker build."
+          echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+        else
+          echo ""
+          echo "CACHE MISS. Building from scratch..."
+          echo ""
+
+          # Build full base image (for later vLLM build)
+          DOCKER_BUILDKIT=1 docker buildx build \
+            --file docker/Dockerfile.rocm_base \
+            --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
+            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+            --build-arg USE_SCCACHE=1 \
+            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+            --build-arg SCCACHE_REGION_NAME=us-west-2 \
+            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+            --load \
+            .
+
+          # Build debs_wheel_release stage for wheel extraction
+          DOCKER_BUILDKIT=1 docker buildx build \
+            --file docker/Dockerfile.rocm_base \
+            --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
+            --target debs_wheel_release \
+            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+            --build-arg USE_SCCACHE=1 \
+            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+            --build-arg SCCACHE_REGION_NAME=us-west-2 \
+            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+            --load \
+            .
+
+          # Extract wheels from Docker image
+          mkdir -p artifacts/rocm-base-wheels
+          container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
+          docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
+          docker rm $${container_id}
+          echo "Extracted base wheels:"
+          ls -lh artifacts/rocm-base-wheels/
+
+          # Upload wheels to S3 cache for future builds
+          echo ""
+          echo "Uploading wheels to S3 cache..."
+          .buildkite/scripts/cache-rocm-base-wheels.sh upload
+
+          # Export base Docker image for reuse in vLLM build
+          mkdir -p artifacts/rocm-docker-image
+          docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
+          echo "Docker image size:"
+          ls -lh artifacts/rocm-docker-image/
+
+          # Upload large Docker image to S3 (also cached by cache key)
+          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+          echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
+          aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Save the S3 path for downstream jobs
+          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Mark that we did NOT use cache
+          buildkite-agent meta-data set "rocm-used-cache" "false"
+
+          echo ""
+          echo "Build complete. Wheels cached for future builds."
+        fi
+    artifact_paths:
+      - "artifacts/rocm-base-wheels/*.whl"
    env:
      DOCKER_BUILDKIT: "1"
-      DOCKERHUB_USERNAME: "vllmbot"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 2: Build vLLM ROCm Wheel
+  - label: ":python: Build vLLM ROCm Wheel - x86_64"
+    id: build-rocm-vllm-wheel
+    depends_on:
+      - step: build-rocm-base-wheels
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 180
+    commands:
+      # Download artifacts and prepare Docker image
+      - |
+        set -euo pipefail
+
+        # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
+        # This fixes version detection when tags are moved/force-pushed
+        echo "Fetching latest tags from origin..."
+        git fetch --tags --force origin
+        
+        # Log tag information for debugging version detection
+        echo "========================================"
+        echo "Git Tag Verification"
+        echo "========================================"
+        echo "Current HEAD: $(git rev-parse HEAD)"
+        echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
+        echo ""
+        echo "Recent tags (pointing to commits near HEAD):"
+        git tag -l --sort=-creatordate | head -5
+        echo "setuptools_scm version detection:"
+        pip install -q setuptools_scm 2>/dev/null || true
+        python3 -c "import setuptools_scm; print('  Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo "  (setuptools_scm not available in this environment)"
+        echo "========================================"
+
+        # Download wheel artifacts from current build
+        echo "Downloading wheel artifacts from current build"
+        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+
+        # Download Docker image from S3 (too large for Buildkite artifacts)
+        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
+        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
+          echo "ERROR: rocm-docker-image-s3-path metadata not found"
+          echo "This should have been set by the build-rocm-base-wheels job"
+          exit 1
+        fi
+        echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
+        mkdir -p artifacts/rocm-docker-image
+        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
+
+        # Load base Docker image and capture the tag
+        echo "Loading base Docker image..."
+        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
+        echo "$${LOAD_OUTPUT}"
+        # Extract the actual loaded image tag from "Loaded image: <tag>" output
+        # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
+        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
+        if [ -z "$${BASE_IMAGE_TAG}" ]; then
+          echo "ERROR: Failed to extract image tag from docker load output"
+          echo "Load output was: $${LOAD_OUTPUT}"
+          exit 1
+        fi
+        echo "Loaded base image: $${BASE_IMAGE_TAG}"
+
+        # Prepare base wheels for Docker build context
+        mkdir -p docker/context/base-wheels
+        touch docker/context/base-wheels/.keep
+        cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
+        echo "Base wheels for vLLM build:"
+        ls -lh docker/context/base-wheels/
+
+        # Get GPU architectures from meta-data
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        echo "========================================"
+        echo "Building vLLM wheel with:"
+        echo "  BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
+        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+        echo "  BASE_IMAGE: $${BASE_IMAGE_TAG}"
+        echo "========================================"
+
+        # Build vLLM wheel using local checkout (REMOTE_VLLM=0)
+        DOCKER_BUILDKIT=1 docker build \
+          --file docker/Dockerfile.rocm \
+          --target export_vllm_wheel_release \
+          --output type=local,dest=rocm-dist \
+          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
+          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+          --build-arg REMOTE_VLLM=0 \
+          --build-arg GIT_REPO_CHECK=1 \
+          --build-arg USE_SCCACHE=1 \
+          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+          --build-arg SCCACHE_REGION_NAME=us-west-2 \
+          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+          .
+
+        echo "Built vLLM wheel:"
+        ls -lh rocm-dist/*.whl
+
+        # Copy wheel to artifacts directory
+        mkdir -p artifacts/rocm-vllm-wheel
+        cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
+        echo "Final vLLM wheel:"
+        ls -lh artifacts/rocm-vllm-wheel/
+    artifact_paths:
+      - "artifacts/rocm-vllm-wheel/*.whl"
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 3: Upload Wheels to S3
+  - label: ":s3: Upload ROCm Wheels to S3"
+    id: upload-rocm-wheels
+    depends_on:
+      - step: build-rocm-vllm-wheel
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 60
+    commands:
+      # Download all wheel artifacts and run upload
+      - |
+        set -euo pipefail
+
+        # Check if upload is enabled (from env var, meta-data, or release branch)
+        ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
+        if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
+          # Try to get from meta-data (input form)
+          ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
+        fi
+
+        echo "========================================"
+        echo "Upload check:"
+        echo "  ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
+        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+        echo "========================================"
+
+        # Skip upload if not enabled
+        if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
+          echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
+          echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
+          exit 0
+        fi
+
+        echo "Upload enabled, proceeding..."
+
+        # Download artifacts from current build
+        echo "Downloading artifacts from current build"
+        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+        buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
+
+        # Run upload script
+        bash .buildkite/scripts/upload-rocm-wheels.sh
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 4: Annotate ROCm Wheel Release
+  - label: ":memo: Annotate ROCm wheel release"
+    id: annotate-rocm-release
+    depends_on:
+      - step: upload-rocm-wheels
+        allow_failure: true
+      - step: input-release-version
+        allow_failure: true
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-rocm-release.sh"
+    env:
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
+  # This is the job to create https://wheels.vllm.ai/rocm/ index allowing
+  # users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
+  - block: "Generate Root Index for ROCm Wheels for Release"
+    key: block-generate-root-index-rocm-wheels
+    depends_on: upload-rocm-wheels
+
+  - label: ":package: Generate Root Index for ROCm Wheels for Release"
+    depends_on: block-generate-root-index-rocm-wheels
+    id: generate-root-index-rocm-wheels
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
+    env:
+      S3_BUCKET: "vllm-wheels"
+      VARIANT: "rocm700"
+
+  # ROCm Job 5: Build ROCm Release Docker Image
+  - label: ":docker: Build release image - x86_64 - ROCm"
+    id: build-rocm-release-image
+    depends_on:
+      - step: build-rocm-base-wheels
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 60
+    commands:
+      - |
+        set -euo pipefail
+
+        # Login to ECR
+        aws ecr-public get-login-password --region us-east-1 | \
+          docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+
+        # Download Docker image from S3 (set by build-rocm-base-wheels)
+        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
+        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
+          echo "ERROR: rocm-docker-image-s3-path metadata not found"
+          exit 1
+        fi
+
+        echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
+        mkdir -p artifacts/rocm-docker-image
+        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
+
+        # Load base Docker image
+        echo "Loading base Docker image..."
+        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
+        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
+        echo "Loaded base image: $${BASE_IMAGE_TAG}"
+
+        # Tag and push the base image to ECR
+        docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
+        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
+        echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
+
+        # Get GPU architectures from meta-data
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        # Build vLLM ROCm release image using cached base
+        DOCKER_BUILDKIT=1 docker build \
+          --build-arg max_jobs=16 \
+          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
+          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+          --build-arg USE_SCCACHE=1 \
+          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+          --build-arg SCCACHE_REGION_NAME=us-west-2 \
+          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+          --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
+          --target vllm-openai \
+          --progress plain \
+          -f docker/Dockerfile.rocm .
+
+        # Push to ECR
+        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
+        echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -2,29 +2,41 @@

 set -ex

-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
 fi

 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
+To download the wheel (by commit):
 \`\`\`
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+(Optional) For CUDA 13.0:
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
+
+(Optional) For CPU:
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
 \`\`\`

+
 To download and upload the image:

 \`\`\`
+Download images:
+
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
+
+Tag and push images:

 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -32,15 +44,69 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
 docker push vllm/vllm-openai:latest-x86_64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64

+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+docker push vllm/vllm-openai:latest-x86_64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker push vllm/vllm-openai:latest-aarch64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+
+Create multi-arch manifest:
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+docker push vllm/vllm-openai-rocm:latest-base
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
+
+docker manifest rm vllm/vllm-openai:latest-cu130
+docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker manifest push vllm/vllm-openai:latest-cu130
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
+
+# CPU images (vllm/vllm-openai-cpu)
+docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai-cpu:latest-x86_64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker push vllm/vllm-openai-cpu:latest-arm64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+
+docker manifest rm vllm/vllm-openai-cpu:latest || true
+docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
+docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker manifest push vllm/vllm-openai-cpu:latest
+docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
 \`\`\`
-EOF 
+EOF
--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Generate Buildkite annotation for ROCm wheel release
+set -ex
+
+# Get build configuration from meta-data
+# Extract ROCm version dynamically from Dockerfile.rocm_base
+# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
+ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
+PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
+PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+
+# TODO: Enable the nightly build for ROCm
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
+fi
+
+# S3 URLs
+S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
+S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
+S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
+
+# Format ROCm version for path (e.g., "7.1" -> "rocm710")
+ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
+ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
+buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
+## ROCm Wheel and Docker Image Releases
+### Build Configuration
+| Setting | Value |
+|---------|-------|
+| **ROCm Version** | ${ROCM_VERSION} |
+| **Python Version** | ${PYTHON_VERSION} |
+| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
+| **Branch** | \`${BUILDKITE_BRANCH}\` |
+| **Commit** | \`${BUILDKITE_COMMIT}\` |
+
+### :package: Installation
+
+**Install from this build (by commit):**
+
+\`\`\`bash
+pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
+
+# Example for ROCm ${ROCM_VERSION}:
+pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
+\`\`\`
+
+**Install from nightly (if published):**
+
+\`\`\`bash
+pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
+\`\`\`
+
+### :floppy_disk: Download Wheels Directly
+
+\`\`\`bash
+# List all ROCm wheels
+aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
+# Download specific wheels
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
+\`\`\`
+
+### :gear: Included Packages
+- **vllm**: vLLM with ROCm support
+- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
+- **triton**: Triton
+- **triton-kernels**: Triton kernels
+- **torchvision**: TorchVision for ROCm PyTorch
+- **torchaudio**: Torchaudio for ROCm PyTorch
+- **amdsmi**: AMD SMI Python bindings
+- **aiter**: Aiter for ROCm
+- **flash-attn**: Flash Attention for ROCm
+
+### :warning: Notes
+- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
+- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
+- Platform: Linux x86_64 only
+
+### :package: Docker Image Release
+
+To download and upload the image:
+
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+docker push vllm/vllm-openai-rocm:latest-base
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+\`\`\`
+
+EOF
--- a/.buildkite/scripts/cache-rocm-base-wheels.sh
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Cache helper for ROCm base wheels
+#
+# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
+# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
+#
+# Usage:
+#   cache-rocm-base-wheels.sh check    - Check if cache exists, outputs "hit" or "miss"
+#   cache-rocm-base-wheels.sh upload   - Upload wheels to cache
+#   cache-rocm-base-wheels.sh download - Download wheels from cache
+#   cache-rocm-base-wheels.sh key      - Output the cache key
+#
+# Environment variables:
+#   S3_BUCKET          - S3 bucket name (default: vllm-wheels)
+#   PYTHON_VERSION     - Python version (affects cache key)
+#   PYTORCH_ROCM_ARCH  - GPU architectures (affects cache key)
+#
+# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
+#       so changes to ROCm version are captured by the Dockerfile hash.
+
+set -euo pipefail
+
+BUCKET="${S3_BUCKET:-vllm-wheels}"
+DOCKERFILE="docker/Dockerfile.rocm_base"
+CACHE_PREFIX="rocm/cache"
+
+# Generate hash from Dockerfile content + build args
+generate_cache_key() {
+    # Include Dockerfile content
+    if [[ ! -f "$DOCKERFILE" ]]; then
+        echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
+        exit 1
+    fi
+    local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
+
+    # Include key build args that affect the output
+    # These should match the ARGs in Dockerfile.rocm_base that change the build output
+    # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
+    local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
+    local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
+
+    echo "${dockerfile_hash}-${args_hash}"
+}
+
+CACHE_KEY=$(generate_cache_key)
+CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
+
+case "${1:-}" in
+    check)
+        echo "Checking cache for key: ${CACHE_KEY}" >&2
+        echo "Cache path: ${CACHE_PATH}" >&2
+        echo "Variables used in cache key:" >&2
+        echo "  PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
+        echo "  PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
+
+        # Check if cache exists by listing objects
+        # We look for at least one .whl file
+        echo "Running: aws s3 ls ${CACHE_PATH}" >&2
+        S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
+        echo "S3 ls output:" >&2
+        echo "$S3_OUTPUT" | head -5 >&2
+
+        if echo "$S3_OUTPUT" | grep -q "\.whl"; then
+            echo "hit"
+        else
+            echo "miss"
+        fi
+        ;;
+
+    upload)
+        echo "========================================"
+        echo "Uploading wheels to cache"
+        echo "========================================"
+        echo "Cache key: ${CACHE_KEY}"
+        echo "Cache path: ${CACHE_PATH}"
+        echo ""
+
+        if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
+            echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
+            exit 1
+        fi
+
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
+            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
+            exit 1
+        fi
+
+        echo "Uploading $WHEEL_COUNT wheels..."
+        aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
+
+        echo ""
+        echo "Cache upload complete!"
+        echo "========================================"
+        ;;
+
+    download)
+        echo "========================================"
+        echo "Downloading wheels from cache"
+        echo "========================================"
+        echo "Cache key: ${CACHE_KEY}"
+        echo "Cache path: ${CACHE_PATH}"
+        echo ""
+
+        mkdir -p artifacts/rocm-base-wheels
+        aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
+
+        echo ""
+        echo "Downloaded wheels:"
+        ls -lh artifacts/rocm-base-wheels/
+
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        echo ""
+        echo "Total: $WHEEL_COUNT wheels"
+        echo "========================================"
+        ;;
+
+    key)
+        echo "${CACHE_KEY}"
+        ;;
+
+    path)
+        echo "${CACHE_PATH}"
+        ;;
+
+    *)
+        echo "Usage: $0 {check|upload|download|key|path}" >&2
+        echo "" >&2
+        echo "Commands:" >&2
+        echo "  check    - Check if cache exists, outputs 'hit' or 'miss'" >&2
+        echo "  upload   - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
+        echo "  download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
+        echo "  key      - Output the cache key" >&2
+        echo "  path     - Output the full S3 cache path" >&2
+        exit 1
+        ;;
+esac
--- a/.buildkite/scripts/cherry-pick-from-milestone.sh
+++ b/.buildkite/scripts/cherry-pick-from-milestone.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+#
+# cherry-pick-from-milestone.sh
+# Find commits from a GitHub milestone that are missing from the current branch
+# and output them in chronological order for cherry-picking.
+#
+# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
+#
+
+set -euo pipefail
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+usage() {
+    cat <<EOF
+Usage: $(basename "$0") <milestone> [options]
+
+Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
+
+Arguments:
+    milestone       The GitHub milestone name (e.g., v0.14.0)
+
+Options:
+    --dry-run       Show the cherry-pick commands without executing (default)
+    --execute       Actually execute the cherry-picks
+    --main-branch   Specify the main branch name (default: main)
+    --help          Show this help message
+
+Examples:
+    $(basename "$0") v0.14.0
+    $(basename "$0") v0.14.0 --dry-run
+    $(basename "$0") v0.14.0 --execute
+    $(basename "$0") v0.14.0 --main-branch master
+EOF
+    exit 1
+}
+
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+    echo -e "${GREEN}[OK]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1" >&2
+}
+
+# Default values
+MILESTONE=""
+DRY_RUN=true
+MAIN_BRANCH="main"
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --dry-run)
+            DRY_RUN=true
+            shift
+            ;;
+        --execute)
+            DRY_RUN=false
+            shift
+            ;;
+        --main-branch)
+            MAIN_BRANCH="$2"
+            shift 2
+            ;;
+        --help|-h)
+            usage
+            ;;
+        -*)
+            log_error "Unknown option: $1"
+            usage
+            ;;
+        *)
+            if [[ -z "$MILESTONE" ]]; then
+                MILESTONE="$1"
+            else
+                log_error "Unexpected argument: $1"
+                usage
+            fi
+            shift
+            ;;
+    esac
+done
+
+# Validate milestone argument
+if [[ -z "$MILESTONE" ]]; then
+    log_error "Milestone is required"
+    usage
+fi
+
+# Check if we're in a git repository
+if ! git rev-parse --is-inside-work-tree &>/dev/null; then
+    log_error "Not in a git repository"
+    exit 1
+fi
+
+# Check if gh CLI is available
+if ! command -v gh &>/dev/null; then
+    log_error "GitHub CLI (gh) is not installed"
+    exit 1
+fi
+
+# Check if authenticated with gh
+if ! gh auth status &>/dev/null; then
+    log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
+    exit 1
+fi
+
+CURRENT_BRANCH=$(git branch --show-current)
+log_info "Current branch: ${CURRENT_BRANCH}"
+log_info "Main branch: ${MAIN_BRANCH}"
+log_info "Milestone: ${MILESTONE}"
+echo ""
+
+# Fetch latest from remote
+log_info "Fetching latest from remote..."
+git fetch origin "$MAIN_BRANCH" --quiet
+
+# Get merged PRs from the milestone, sorted by merge date
+log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
+
+# Store PR data in a temp file
+PR_DATA=$(mktemp)
+trap "rm -f $PR_DATA" EXIT
+
+if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
+    --limit 1000 \
+    --json number,title,mergeCommit,mergedAt \
+    --jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
+    log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
+    log_error "This could be due to:"
+    log_error "  - Milestone does not exist"
+    log_error "  - Network/authentication issues"
+    log_error "  - Invalid milestone name format"
+    exit 1
+fi
+
+if [[ ! -s "$PR_DATA" ]]; then
+    log_warn "No merged PRs found for milestone '${MILESTONE}'"
+    exit 0
+fi
+
+TOTAL_PRS=$(wc -l < "$PR_DATA")
+log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
+echo ""
+
+# Find commits that are missing from current branch
+MISSING_COMMITS=()
+MISSING_INFO=()
+
+while IFS=$'\t' read -r sha pr_number title; do
+    # Skip if SHA is empty or null
+    if [[ -z "$sha" || "$sha" == "null" ]]; then
+        log_warn "PR #${pr_number} has no merge commit SHA, skipping"
+        continue
+    fi
+    
+    # Check if this commit is already in the current branch
+    if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
+        log_success "PR #${pr_number} already in branch: ${title:0:60}"
+    else
+        log_warn "PR #${pr_number} MISSING: ${title:0:60}"
+        MISSING_COMMITS+=("$sha")
+        MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
+    fi
+done < "$PR_DATA"
+
+echo ""
+
+if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
+    log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
+    exit 0
+fi
+
+log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
+echo ""
+
+# Output the cherry-pick commands
+echo "=========================================="
+echo "Cherry-pick commands (in chronological order):"
+echo "=========================================="
+echo ""
+
+for info in "${MISSING_INFO[@]}"; do
+    echo "# $info"
+done
+echo ""
+
+echo "# Run these commands to cherry-pick all missing commits:"
+echo "git cherry-pick ${MISSING_COMMITS[*]}"
+echo ""
+
+# Or one by one
+echo "# Or cherry-pick one at a time:"
+for sha in "${MISSING_COMMITS[@]}"; do
+    echo "git cherry-pick $sha"
+done
+echo ""
+
+# Execute if requested
+if [[ "$DRY_RUN" == false ]]; then
+    echo "=========================================="
+    log_info "Executing cherry-picks..."
+    echo "=========================================="
+    
+    for i in "${!MISSING_COMMITS[@]}"; do
+        sha="${MISSING_COMMITS[$i]}"
+        info="${MISSING_INFO[$i]}"
+        
+        echo ""
+        log_info "Cherry-picking: $info"
+        
+        if git cherry-pick "$sha"; then
+            log_success "Successfully cherry-picked $sha"
+        else
+            log_error "Failed to cherry-pick $sha"
+            log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
+            exit 1
+        fi
+    done
+    
+    echo ""
+    log_success "All cherry-picks completed successfully!"
+else
+    echo "=========================================="
+    echo -e "${YELLOW}Dry run mode - no changes made${NC}"
+    echo "Run with --execute to perform the cherry-picks"
+    echo "=========================================="
+fi
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -3,7 +3,14 @@
 set -ex

 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+# This script uses DockerHub API to list and delete old tags with specified prefix
+# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
+# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
+
+# Get tag prefix from argument, default to "nightly-" if not provided
+TAG_PREFIX="${1:-nightly-}"
+
+echo "Cleaning up tags with prefix: $TAG_PREFIX"

 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -45,7 +52,7 @@ get_all_tags() {
        set -x
        
        # Get both last_updated timestamp and tag name, separated by |
-        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+        local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
        
        if [ -z "$tags" ]; then
            break
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,468 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+import regex as re
+
+
+def normalize_package_name(name: str) -> str:
+    """
+    Normalize package name according to PEP 503.
+    https://peps.python.org/pep-0503/#normalized-names
+
+    Replace runs of underscores, hyphens, and periods with a single hyphen,
+    and lowercase the result.
+    """
+    return re.sub(r"[-_.]+", "-", name).lower()
+
+
+if not sys.version_info >= (3, 12):
+    raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
+<html>
+  <!-- {comment} -->
+  <meta name="pypi:repository-version" content="1.0">
+  <body>
+{items}
+  </body>
+</html>
+"""
+
+
+@dataclass
+class WheelFileInfo:
+    package_name: str
+    version: str
+    build_tag: str | None
+    python_tag: str
+    abi_tag: str
+    platform_tag: str
+    variant: str | None
+    filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+    """
+    Parse wheel file name to extract metadata.
+
+    The format of wheel names:
+        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
+    Example:
+        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+    """
+    wheel_file_re = re.compile(
+        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+    )
+    match = wheel_file_re.match(file)
+    if not match:
+        raise ValueError(f"Invalid wheel file name: {file}")
+
+    package_name = match.group("package_name")
+    version = match.group("version")
+    build_tag = match.group("build_tag")
+    python_tag = match.group("python_tag")
+    abi_tag = match.group("abi_tag")
+    platform_tag = match.group("platform_tag")
+
+    # extract variant from version
+    variant = None
+    if "dev" in version:
+        ver_after_dev = version.split("dev")[-1]
+        if "." in ver_after_dev:
+            variant = ver_after_dev.split(".")[-1]
+            version = version.removesuffix("." + variant)
+    else:
+        if "+" in version:
+            version_part, suffix = version.split("+", 1)
+            # Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
+            # Git hashes and other suffixes are NOT variants
+            if suffix.startswith(("rocm", "cu", "cpu")):
+                variant = suffix
+                version = version_part
+            # Otherwise keep the full version string (variant stays None)
+
+    return WheelFileInfo(
+        package_name=package_name,
+        version=version,
+        build_tag=build_tag,
+        python_tag=python_tag,
+        abi_tag=abi_tag,
+        platform_tag=platform_tag,
+        variant=variant,
+        filename=file,
+    )
+
+
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
+    """
+    Generate project list HTML content linking to each project & variant subdirectory.
+    """
+    href_tags = []
+    for name in sorted(subdir_names):
+        name = name.strip("/").strip(".")
+        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
+    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+
+
+def generate_package_index_and_metadata(
+    wheel_files: list[WheelFileInfo],
+    wheel_base_dir: Path,
+    index_base_dir: Path,
+    comment: str = "",
+) -> tuple[str, str]:
+    """
+    Generate package index HTML content for a specific package, linking to actual wheel files.
+    """
+    href_tags = []
+    metadata = []
+    for file in sorted(wheel_files, key=lambda x: x.filename):
+        relative_path = (
+            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+        )
+        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
+        # NOTE: this is AWS S3 specific behavior!
+        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
+        href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
+        file_meta = asdict(file)
+        file_meta["path"] = file_path_quoted
+        metadata.append(file_meta)
+    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+    metadata_str = json.dumps(metadata, indent=2)
+    return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+    whl_files: list[str],
+    wheel_base_dir: Path,
+    index_base_dir: Path,
+    default_variant: str | None = None,
+    alias_to_default: str | None = None,
+    comment: str = "",
+):
+    """
+    Generate index for all wheel files.
+
+    Args:
+        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+        wheel_base_dir (Path): Base directory for wheel files.
+        index_base_dir (Path): Base directory to store index files.
+        default_variant (str | None): The default variant name, if any.
+        alias_to_default (str | None): Alias variant name for the default variant, if any.
+        comment (str | None): Optional comment to include in the generated HTML files.
+
+    First, parse all wheel files to extract metadata.
+    We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
+    The index for the default variant (if any) is generated in the root index directory.
+
+    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+    is purely a copy of the corresponding variant index, with only the links adjusted.
+    Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+    If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
+    as the default variant index, but the links are adjusted accordingly.
+
+    Index directory structure:
+        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+            index.html  # project list, linking to "vllm/" and other packages, and all variant subdirectories
+            vllm/
+                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+                metadata.json # machine-readable metadata for all wheels in this package
+            cpu/ # cpu variant subdirectory
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            cu129/ # cu129 is actually the alias to default variant
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            cu130/ # cu130 variant subdirectory
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            ...
+
+    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+        [
+            {
+                "package_name": "vllm",
+                "version": "0.10.2rc2",
+                "build_tag": null,
+                "python_tag": "cp38",
+                "abi_tag": "abi3",
+                "platform_tag": "manylinux2014_aarch64",
+                "variant": "cu129",
+                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+            },
+            ...
+        ]
+    """
+
+    parsed_files = [parse_from_filename(f) for f in whl_files]
+
+    if not parsed_files:
+        print("No wheel files found, skipping index generation.")
+        return
+
+    # For ROCm builds: inherit variant from vllm wheel
+    # All ROCm wheels should share the same variant as vllm
+    rocm_variant = None
+    for file in parsed_files:
+        if (
+            file.package_name == "vllm"
+            and file.variant
+            and file.variant.startswith("rocm")
+        ):
+            rocm_variant = file.variant
+            print(f"Detected ROCm variant from vllm: {rocm_variant}")
+            break
+
+    # Apply ROCm variant to all wheels without a variant
+    if rocm_variant:
+        for file in parsed_files:
+            if file.variant is None:
+                file.variant = rocm_variant
+                print(f"Inherited variant '{rocm_variant}' for {file.filename}")
+
+    # Group by variant
+    variant_to_files: dict[str, list[WheelFileInfo]] = {}
+    for file in parsed_files:
+        variant = file.variant or "default"
+        if variant not in variant_to_files:
+            variant_to_files[variant] = []
+        variant_to_files[variant].append(file)
+
+    print(f"Found variants: {list(variant_to_files.keys())}")
+
+    # sanity check for default variant
+    if default_variant:
+        if "default" in variant_to_files:
+            raise ValueError(
+                "All wheel files must have variant suffixes when `default_variant` is specified."
+            )
+        if default_variant not in variant_to_files:
+            raise ValueError(
+                f"Default variant '{default_variant}' not found among wheel files."
+            )
+
+    if alias_to_default:
+        if "default" not in variant_to_files:
+            # e.g. only some wheels are uploaded to S3 currently
+            print(
+                "[WARN] Alias to default variant specified, but no default variant found."
+            )
+        elif alias_to_default in variant_to_files:
+            raise ValueError(
+                f"Alias variant name '{alias_to_default}' already exists among wheel files."
+            )
+        else:
+            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+            print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+    # Generate comment in HTML header
+    comment_str = f" ({comment})" if comment else ""
+    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
+    # Generate index for each variant
+    subdir_names = set()
+    for variant, files in variant_to_files.items():
+        if variant == "default":
+            variant_dir = index_base_dir
+        else:
+            variant_dir = index_base_dir / variant
+            subdir_names.add(variant)
+
+        variant_dir.mkdir(parents=True, exist_ok=True)
+
+        # gather all package names in this variant (normalized per PEP 503)
+        packages = set(normalize_package_name(f.package_name) for f in files)
+        if variant == "default":
+            # these packages should also appear in the "project list"
+            # generate after all variants are processed
+            subdir_names = subdir_names.union(packages)
+        else:
+            # generate project list for this variant directly
+            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
+            with open(variant_dir / "index.html", "w") as f:
+                f.write(project_list_str)
+
+        for package in packages:
+            # filter files belonging to this package only (compare normalized names)
+            package_files = [
+                f for f in files if normalize_package_name(f.package_name) == package
+            ]
+            package_dir = variant_dir / package
+            package_dir.mkdir(parents=True, exist_ok=True)
+            index_str, metadata_str = generate_package_index_and_metadata(
+                package_files, wheel_base_dir, package_dir, comment
+            )
+            with open(package_dir / "index.html", "w") as f:
+                f.write(index_str)
+            with open(package_dir / "metadata.json", "w") as f:
+                f.write(metadata_str)
+
+    # Generate top-level project list index
+    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
+    with open(index_base_dir / "index.html", "w") as f:
+        f.write(project_list_str)
+
+
+if __name__ == "__main__":
+    """
+    Arguments:
+        --version <version> : version string for the current build (e.g., commit hash)
+        --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
+        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
+        --output-dir <output_directory> : directory to store generated index files
+        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
+        --comment <comment_string> : (optional) comment string to include in generated HTML files
+    """
+
+    parser = argparse.ArgumentParser(
+        description="Process nightly build wheel files to generate indices."
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        required=True,
+        help="Version string for the current build (e.g., commit hash)",
+    )
+    parser.add_argument(
+        "--current-objects",
+        type=str,
+        required=True,
+        help="Path to JSON file containing current S3 objects listing in this version directory",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=True,
+        help="Directory to store generated index files",
+    )
+    parser.add_argument(
+        "--wheel-dir",
+        type=str,
+        default=None,
+        help="Directory containing wheel files (default to be same as `version`)",
+    )
+    parser.add_argument(
+        "--alias-to-default",
+        type=str,
+        default=None,
+        help="Alias variant name for the default variant",
+    )
+    parser.add_argument(
+        "--comment",
+        type=str,
+        default="",
+        help="Optional comment string to include in generated HTML files",
+    )
+
+    args = parser.parse_args()
+
+    version = args.version
+    # Allow rocm/ prefix, reject other slashes and all backslashes
+    if "\\" in version:
+        raise ValueError("Version string must not contain backslashes.")
+    if "/" in version and not version.startswith("rocm/"):
+        raise ValueError(
+            "Version string must not contain slashes (except for 'rocm/' prefix)."
+        )
+    current_objects_path = Path(args.current_objects)
+    output_dir = Path(args.output_dir)
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Read current objects JSON
+    with open(current_objects_path) as f:
+        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+    # current_objects looks like from list_objects_v2 S3 API:
+    """
+    "Contents": [
+        {
+            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+            "LastModified": "2025-11-28T14:00:32+00:00",
+            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+            "ChecksumAlgorithm": [
+                "CRC64NVME"
+            ],
+            "ChecksumType": "FULL_OBJECT",
+            "Size": 435649349,
+            "StorageClass": "STANDARD"
+        },
+        ...
+    ]
+    """
+
+    # Extract wheel file keys
+    wheel_files = []
+    for item in current_objects.get("Contents", []):
+        key: str = item["Key"]
+        if key.endswith(".whl"):
+            wheel_files.append(key.split("/")[-1])  # only the filename is used
+
+    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+    # keep only "official" files for a non-nightly version (specified by cli args)
+    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+    if PY_VERSION_RE.match(version):
+        # upload-wheels.sh ensures no "dev" is in args.version
+        wheel_files = list(
+            filter(lambda x: version in x and "dev" not in x, wheel_files)
+        )
+        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+    else:
+        print("Nightly version detected, keeping all wheel files.")
+
+    # Generate index and metadata, assuming wheels and indices are stored as:
+    # s3://vllm-wheels/{wheel_dir}/<wheel files>
+    # s3://vllm-wheels/<anything>/<index files>
+    #
+    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
+    #   - rocm/{commit}/  (same as wheels)
+    #   - rocm/nightly/
+    #   - rocm/{version}/
+    # All these are under the "rocm/" prefix, so relative paths should be
+    # relative to "rocm/", not the bucket root.
+    if args.wheel_dir:
+        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
+        wheel_dir = args.wheel_dir.strip().rstrip("/")
+    elif version.startswith("rocm/"):
+        # For rocm/commit, wheel_base_dir should be just the commit part
+        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
+        wheel_dir = version.split("/", 1)[1]
+    else:
+        wheel_dir = version
+    wheel_base_dir = Path(output_dir).parent / wheel_dir
+    index_base_dir = Path(output_dir)
+
+    generate_index_and_metadata(
+        whl_files=wheel_files,
+        wheel_base_dir=wheel_base_dir,
+        index_base_dir=index_base_dir,
+        default_variant=None,
+        alias_to_default=args.alias_to_default,
+        comment=args.comment.strip(),
+    )
+    print(f"Successfully generated index and metadata in {output_dir}")
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -44,6 +44,17 @@ cleanup_docker() {
  fi
 }

+cleanup_network() {
+  for node in $(seq 0 $((NUM_NODES-1))); do
+    if docker pr -a -q -f name="node${node}" | grep -q .; then
+      docker stop "node${node}"
+    fi
+  done
+  if docker network ls | grep docker-net; then
+    docker network rm docker-net
+  fi
+}
+
 # Call the cleanup docker function
 cleanup_docker

@@ -59,7 +70,7 @@ while true; do
        fi
 done

-echo "--- Pulling container" 
+echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
@@ -76,19 +87,15 @@ mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

 commands=$@
-echo "Commands:$commands"
+echo "Raw commands: $commands"

-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}

 if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
@@ -145,7 +152,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
@@ -163,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+commands=$(echo "$commands" | sed 's/ \\ / /g')
+echo "Final commands: $commands"
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -170,55 +179,52 @@ fi
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13


-PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
-if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used   
-  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
-  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    # assign shard-id for each shard
-    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
-    echo "Shard ${GPU} commands:$commands_gpu"
-    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
-    docker run \
-        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-        --network=host \
-        --shm-size=16gb \
-        --rm \
-        -e HIP_VISIBLE_DEVICES="${GPU}" \
-        -e HF_TOKEN \
-        -e AWS_ACCESS_KEY_ID \
-        -e AWS_SECRET_ACCESS_KEY \
-        -v "${HF_CACHE}:${HF_MOUNT}" \
-        -e "HF_HOME=${HF_MOUNT}" \
-        -e "PYTHONPATH=${MYPYTHONPATH}" \
-        --name "${container_name}_${GPU}" \
-        "${image_name}" \
-        /bin/bash -c "${commands_gpu}" \
-        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
-    PIDS+=($!)
-  done
-  #wait for all processes to finish and collect exit codes
-  for pid in "${PIDS[@]}"; do
-    wait "${pid}"
-    STATUS+=($?)
-  done
-  for st in "${STATUS[@]}"; do
-    if [[ ${st} -ne 0 ]]; then
-      echo "One of the processes failed with $st"
-      exit "${st}"
-    fi
-  done
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
+if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+
+  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
+
+  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
+      prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
+      echo "PREFIX: ${prefix}"
+      export composite_command="(command rocm-smi || true)"
+      myIFS=$IFS
+      IFS=','
+      read -ra node0 <<< ${BASH_REMATCH[2]}
+      read -ra node1 <<< ${BASH_REMATCH[3]}
+      IFS=$myIFS
+      for i in "${!node0[@]}";do 
+        command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
+        command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
+        
+        export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
+        echo "COMMANDS: ${commands}"
+        composite_command=$(echo "${composite_command} && ${commands}")
+      done
+      /bin/bash -c "${composite_command}"
+      cleanup_network
+  else
+      echo "Failed to parse node commands! Exiting."
+      cleanup_network
+      exit 111
+  fi
 else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
          --network=host \
          --shm-size=16gb \
+          --group-add "$render_gid" \
          --rm \
-          -e HIP_VISIBLE_DEVICES=0 \
          -e HF_TOKEN \
          -e AWS_ACCESS_KEY_ID \
          -e AWS_SECRET_ACCESS_KEY \
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=16
+
+# Setup cleanup
+remove_docker_container() {
+    set -e;
+    docker rm -f cpu-test || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
+
+function cpu_tests() {
+  set -e
+
+  docker exec cpu-test bash -c "
+    set -e
+    pip list"
+
+  # offline inference
+  docker exec cpu-test bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run model tests
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
+  # Run kernel tests
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/test_onednn.py
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
+
+  # basic online serving
+  docker exec cpu-test bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model Qwen/Qwen3-0.6B \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+}
+
+# All of CPU tests are expected to be finished less than 40 mins.
+export -f cpu_tests
+timeout 2h bash -c cpu_tests
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,20 +25,22 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
    set -xve
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
+    pip install sentence-transformers datamodel_code_generator tblib 

    # Note: disable Bart until supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -49,6 +49,8 @@ function cpu_tests() {
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
    pytest -x -v -s tests/kernels/test_onednn.py"

  # Run basic model test
@@ -70,20 +72,19 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

-  # Note: disable it until supports V1
-  # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
-  #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
-  #   tests/quantization/test_ipex_quant.py"
+  # Run AWQ/GPTQ test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/quantization/test_cpu_wna16.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
-    tests/lora/test_qwen2vl.py"
+    tests/lora/test_qwenvl.py"

  # online serving: tp+pp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
@@ -116,4 +117,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -5,7 +5,9 @@
 set -exuo pipefail

 # Try building the docker image
-cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
+image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
+container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
+cat <<EOF | docker build -t ${image_name} -f - .
 FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm
@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN VLLM_TARGET_DEVICE=empty pip install .
+RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
+RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
 RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

 # install development dependencies (for testing)
@@ -36,15 +39,20 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
+remove_docker_containers() { docker rm -f ${container_name} || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers

 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
+docker run --rm --runtime=habana --name=${container_name} --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
-  hpu-plugin-v1-test-env \
-  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
+  -e VLLM_SKIP_WARMUP=true \
+  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
+  -e PT_HPU_LAZY_MODE=1 \
+  "${image_name}" \
+  /bin/bash -c '
+  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+'

 EXITCODE=$?
 if [ $EXITCODE -eq 0 ]; then
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"

 RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -20,7 +20,10 @@ trap remove_docker_container EXIT

 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
+    --net=host \
+    --ipc=host \
+    --privileged \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
@@ -32,17 +35,19 @@ docker run \
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
-    pytest -v -s v1/core
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/push-nightly-builds.sh
+++ b/.buildkite/scripts/push-nightly-builds.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+set -ex
+
+# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
+# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
+# otherwise they will be cleaned up together with the main "nightly" tags.
+
+TAG_VARIANT="$1"
+if [ -n "$TAG_VARIANT" ]; then
+    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
+    TAG_NAME="$TAG_VARIANT-nightly"
+else
+    ORIG_TAG_SUFFIX=""
+    TAG_NAME="nightly"
+fi
+
+ORIG_TAG_NAME="$BUILDKITE_COMMIT"
+
+echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
+
+# pull original arch-dependent images from AWS ECR Public
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
+# tag arch-dependent images
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
+# push arch-dependent images to DockerHub
+docker push vllm/vllm-openai:$TAG_NAME-x86_64
+docker push vllm/vllm-openai:$TAG_NAME-aarch64
+# push arch-independent manifest to DockerHub
+docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest push vllm/vllm-openai:$TAG_NAME
+docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@@ -2,6 +2,17 @@

 set -euox pipefail

+# To detect ROCm
+# Check multiple indicators:
+if [ -e /dev/kfd ] || \
+    [ -d /opt/rocm ] || \
+    command -v rocm-smi &> /dev/null || \
+    [ -n "${ROCM_HOME:-}" ]; then
+    IS_ROCM=1
+else
+    IS_ROCM=0
+fi
+
 if [[ $# -lt 4 ]]; then
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
@@ -26,13 +37,18 @@ for command in "${COMMANDS[@]}"; do
    echo "$command"
 done

+
 start_network() {
    docker network create --subnet=192.168.10.0/24 docker-net
 }

 start_nodes() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
-        GPU_DEVICES='"device='
+        if [ "$IS_ROCM" -eq 1 ]; then
+            GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
+        else
+            GPU_DEVICES='--gpus "device='
+        fi
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
@@ -40,7 +56,9 @@ start_nodes() {
                GPU_DEVICES+=','
            fi
        done
-        GPU_DEVICES+='"'
+        if [ "$IS_ROCM" -eq 0 ]; then
+            GPU_DEVICES+='"'
+        fi

        # start the container in detached mode
        # things to note:
@@ -49,7 +67,7 @@ start_nodes() {
        # 3. map the huggingface cache directory to the container
        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
        #    starting from 192.168.10.11)
-        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
+        docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
            --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
            /bin/bash -c "tail -f /dev/null"
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+    exit 0
+fi
+
 echo "Setting up Prime-RL integration test environment..."

 # Clean up any existing Prime-RL directory
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8010}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="deepseek-ai/DeepSeek-V2-lite"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --all2all-backend $BACK \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
+THRESHOLD=${1:-0.8}
+NUM_Q=${2:-1319}
+PORT=${3:-8020}
+DATA_PARALLEL_SIZE=${4:-2}
+TENSOR_PARALLEL_SIZE=${5:-2}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="QWen/Qwen3-30B-A3B-FP8"
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --enable-eplb \
+    --all2all-backend $BACK \
+    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+    --data-parallel-size ${DATA_PARALLEL_SIZE} \
+    --enable-expert-parallel \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8040}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# Set BACKENDS and platform-specific args based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+  PLATFORM_ARGS=("--no-async-scheduling")
+  echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+  PLATFORM_ARGS=()
+fi
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 4 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --all2all-backend $BACK \
+    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --gpu-memory-utilization 0.9 \
+    "${PLATFORM_ARGS[@]}" \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
--- a/.buildkite/scripts/trigger-ci-build.sh
+++ b/.buildkite/scripts/trigger-ci-build.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+#
+# trigger-ci-build.sh
+# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
+# with RUN_ALL=1 and NIGHTLY=1 environment variables.
+#
+# Usage: ./trigger-ci-build.sh [options]
+#
+# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
+#
+# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
+#
+
+set -euo pipefail
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Default configuration
+PIPELINE="ci"
+DRY_RUN=true
+
+usage() {
+    cat <<EOF
+Usage: $(basename "$0") [options]
+
+Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
+Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
+
+SAFETY: Dry-run by default. Use --execute to actually trigger a build.
+
+Options:
+    --execute       Actually trigger the build (default: dry-run)
+    --pipeline      Buildkite pipeline slug (default: ${PIPELINE})
+    --commit        Override commit SHA (default: current HEAD)
+    --branch        Override branch name (default: current branch)
+    --message       Custom build message (default: auto-generated)
+    --help          Show this help message
+
+Prerequisites:
+    - bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
+    - bk configured: bk configure
+
+Examples:
+    $(basename "$0")                        # Dry-run, show what would happen
+    $(basename "$0") --execute              # Actually trigger the build
+    $(basename "$0") --pipeline ci-shadow   # Dry-run with different pipeline
+EOF
+    exit 1
+}
+
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+    echo -e "${GREEN}[OK]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1" >&2
+}
+
+# Parse arguments
+COMMIT=""
+BRANCH=""
+MESSAGE=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --execute)
+            DRY_RUN=false
+            shift
+            ;;
+        --pipeline)
+            PIPELINE="$2"
+            shift 2
+            ;;
+        --commit)
+            COMMIT="$2"
+            shift 2
+            ;;
+        --branch)
+            BRANCH="$2"
+            shift 2
+            ;;
+        --message)
+            MESSAGE="$2"
+            shift 2
+            ;;
+        --help|-h)
+            usage
+            ;;
+        -*)
+            log_error "Unknown option: $1"
+            usage
+            ;;
+        *)
+            log_error "Unexpected argument: $1"
+            usage
+            ;;
+    esac
+done
+
+# Check if bk CLI is installed
+if ! command -v bk &>/dev/null; then
+    log_error "Buildkite CLI (bk) is not installed"
+    echo ""
+    echo "Install with:"
+    echo "  brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
+    echo ""
+    echo "Then configure:"
+    echo "  bk configure"
+    exit 1
+fi
+
+# Check if we're in a git repository
+if ! git rev-parse --is-inside-work-tree &>/dev/null; then
+    log_error "Not in a git repository"
+    exit 1
+fi
+
+# Get current commit and branch if not overridden
+if [[ -z "$COMMIT" ]]; then
+    COMMIT=$(git rev-parse HEAD)
+fi
+
+if [[ -z "$BRANCH" ]]; then
+    BRANCH=$(git branch --show-current)
+    if [[ -z "$BRANCH" ]]; then
+        # Detached HEAD state - try to get branch from ref
+        BRANCH=$(git rev-parse --abbrev-ref HEAD)
+    fi
+fi
+
+# Generate default message if not provided
+if [[ -z "$MESSAGE" ]]; then
+    COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
+    MESSAGE="[Manual] ${COMMIT_MSG}"
+fi
+
+# Safety check: Verify the commit exists on the remote
+log_info "Verifying commit exists on remote..."
+git fetch origin --quiet 2>/dev/null || true
+
+# Check if commit is reachable from any remote branch
+REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
+if [[ -z "$REMOTE_BRANCHES" ]]; then
+    log_error "Commit ${COMMIT} does not exist on any remote branch!"
+    echo ""
+    echo "The CI system will fail to checkout this commit."
+    echo "Please push your changes first:"
+    echo ""
+    echo "  git push origin ${BRANCH}"
+    echo ""
+    exit 1
+fi
+
+log_success "Commit found on remote branches:"
+echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/  /'
+if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
+    echo "  ... and more"
+fi
+echo ""
+
+log_info "Pipeline: ${PIPELINE}"
+log_info "Branch: ${BRANCH}"
+log_info "Commit: ${COMMIT}"
+log_info "Message: ${MESSAGE}"
+log_info "Environment: RUN_ALL=1, NIGHTLY=1"
+echo ""
+
+# Build the command
+CMD=(bk build create
+    -y
+    -w
+    -i
+    --pipeline "${PIPELINE}"
+    --commit "${COMMIT}"
+    --branch "${BRANCH}"
+    --message "${MESSAGE}"
+    --env "RUN_ALL=1"
+    --env "NIGHTLY=1"
+)
+
+if [[ "$DRY_RUN" == true ]]; then
+    echo "=========================================="
+    log_warn "DRY-RUN MODE - No build will be triggered"
+    echo "=========================================="
+    echo ""
+    echo "Command that would be executed:"
+    echo ""
+    # Escape single quotes in values for safe shell display
+    escape_for_shell() {
+        printf '%s' "$1" | sed "s/'/'\\\\''/g"
+    }
+    echo "  bk build create \\"
+    echo "    -y \\"
+    echo "    -w \\"
+    echo "    -i \\"
+    echo "    --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
+    echo "    --commit '$(escape_for_shell "${COMMIT}")' \\"
+    echo "    --branch '$(escape_for_shell "${BRANCH}")' \\"
+    echo "    --message '$(escape_for_shell "${MESSAGE}")' \\"
+    echo "    --env 'RUN_ALL=1' \\"
+    echo "    --env 'NIGHTLY=1'"
+    echo ""
+    echo "=========================================="
+    echo -e "${YELLOW}To actually trigger this build, run:${NC}"
+    echo ""
+    echo "  $0 --execute"
+    echo "=========================================="
+    exit 0
+fi
+
+log_info "Triggering build..."
+
+# Execute the command - bk will print the URL and open browser
+"${CMD[@]}"
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.10+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+    # use new python from docker
+    docker pull python:3-slim
+    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
+# Assume wheels are in artifacts/dist/*.whl
+wheel_files=(artifacts/dist/*.whl)
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+  exit 1
+fi
+wheel="${wheel_files[0]}"
+
+# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# we also accept params as manylinux tag
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="${1:-manylinux_2_31}"
+
+# Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+  echo "Error: Wheel filename does not contain 'linux': $wheel"
+  exit 1
+fi
+new_wheel="${wheel/linux/$manylinux_version}"
+mv -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
+
+# Extract the version from the wheel
+version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
+
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below
+
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
+
+# call script to generate indicies for all existing wheels
+# this indices have relative paths that could work as long as it is next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+else
+    alias_arg=""
+fi
+
+# HACK: we do not need regex module here, but it is required by pre-commit hook
+# To avoid any external dependency, we simply replace it back to the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
+
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR 
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+    echo "Uploading indices to overwrite /nightly/"
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
+fi
+
+# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+    echo "Re-generating indices for /$pure_version/"
+    rm -rf "$INDICES_OUTPUT_DIR/*"
+    mkdir -p "$INDICES_OUTPUT_DIR"
+    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+set -e
+
+BUCKET="vllm-wheels"
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
+GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
+
+echo "Release version from Buildkite: $RELEASE_VERSION"
+
+if [[ -z "$GIT_VERSION" ]]; then
+    echo "[FATAL] Not on a git tag, cannot create release."
+    exit 1
+else
+    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
+fi
+# sanity check for version mismatch
+if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
+  if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
+    echo "[WARNING] Force release and ignore version mismatch"
+  else
+    echo "[FATAL] Release version from Buildkite does not match Git version."
+    exit 1
+  fi
+fi
+PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
+
+# check pypi token
+if [[ -z "$PYPI_TOKEN" ]]; then
+  echo "[FATAL] PYPI_TOKEN is not set."
+  exit 1
+else
+  export TWINE_USERNAME="__token__"
+  export TWINE_PASSWORD="$PYPI_TOKEN"
+fi
+
+set -x # avoid printing secrets above
+
+# install twine from pypi
+python3 -m venv /tmp/vllm-release-env
+source /tmp/vllm-release-env/bin/activate
+pip install twine
+python3 -m twine --version
+
+# copy release wheels to local directory
+DIST_DIR=/tmp/vllm-release-dist
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+echo "Copying wheels to local directory"
+mkdir -p $DIST_DIR
+# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
+aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
+echo "Wheels copied to local directory"
+# generate source tarball
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+ls -la $DIST_DIR
+
+# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
+PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
+if [[ -z "$PYPI_WHEEL_FILES" ]]; then
+  echo "No default variant wheels found, quitting..."
+  exit 1
+fi
+
+python3 -m twine check $PYPI_WHEEL_FILES
+python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
+echo "Wheels uploaded to PyPI"
--- a/.buildkite/scripts/upload-rocm-wheels.sh
+++ b/.buildkite/scripts/upload-rocm-wheels.sh
@@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Upload ROCm wheels to S3 with proper index generation
+#
+# Required environment variables:
+#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
+#   S3_BUCKET (default: vllm-wheels)
+#
+# S3 path structure:
+#   s3://vllm-wheels/rocm/{commit}/     - All wheels for this commit
+#   s3://vllm-wheels/rocm/nightly/      - Index pointing to latest nightly
+#   s3://vllm-wheels/rocm/{version}/    - Index for release versions
+
+set -ex
+
+# ======== Configuration ========
+BUCKET="${S3_BUCKET:-vllm-wheels}"
+ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
+S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
+INDICES_OUTPUT_DIR="rocm-indices"
+PYTHON="${PYTHON_PROG:-python3}"
+
+# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
+MANYLINUX_VERSION="manylinux_2_35"
+
+echo "========================================"
+echo "ROCm Wheel Upload Configuration"
+echo "========================================"
+echo "S3 Bucket: $BUCKET"
+echo "S3 Path: $ROCM_SUBPATH"
+echo "Commit: $BUILDKITE_COMMIT"
+echo "Branch: $BUILDKITE_BRANCH"
+echo "========================================"
+
+# ======== Part 0: Setup Python ========
+
+# Detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
+if [[ "$has_new_python" -eq 0 ]]; then
+    # Use new python from docker
+    # Use --user to ensure files are created with correct ownership (not root)
+    docker pull python:3-slim
+    PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ======== Part 1: Collect and prepare wheels ========
+
+# Collect all wheels
+mkdir -p all-rocm-wheels
+cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
+cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
+
+WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
+echo "Total wheels to upload: $WHEEL_COUNT"
+
+if [ "$WHEEL_COUNT" -eq 0 ]; then
+    echo "ERROR: No wheels found to upload!"
+    exit 1
+fi
+
+# Rename linux to manylinux in wheel filenames
+for wheel in all-rocm-wheels/*.whl; do
+    if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
+        new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
+        mv -- "$wheel" "$new_wheel"
+        echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
+    fi
+done
+
+echo ""
+echo "Wheels to upload:"
+ls -lh all-rocm-wheels/
+
+# ======== Part 2: Upload wheels to S3 ========
+
+echo ""
+echo "Uploading wheels to $S3_COMMIT_PREFIX"
+for wheel in all-rocm-wheels/*.whl; do
+    aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+done
+
+# ======== Part 3: Generate and upload indices ========
+
+# List existing wheels in commit directory
+echo ""
+echo "Generating indices..."
+obj_json="rocm-objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
+
+mkdir -p "$INDICES_OUTPUT_DIR"
+
+# Use the existing generate-nightly-index.py
+# HACK: Replace regex module with stdlib re (same as CUDA script)
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+
+$PYTHON .buildkite/scripts/generate-nightly-index.py \
+    --version "$ROCM_SUBPATH" \
+    --current-objects "$obj_json" \
+    --output-dir "$INDICES_OUTPUT_DIR" \
+    --comment "ROCm commit $BUILDKITE_COMMIT"
+
+# Upload indices to commit directory
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# Update rocm/nightly/ if on main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
+    echo "Updating rocm/nightly/ index..."
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
+fi
+
+# Extract version from vLLM wheel and update version-specific index
+VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
+if [ -n "$VLLM_WHEEL" ]; then
+    VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+    echo "Version in wheel: $VERSION"
+    PURE_VERSION="${VERSION%%+*}"
+    PURE_VERSION="${PURE_VERSION%%.rocm}"
+    echo "Pure version: $PURE_VERSION"
+
+    if [[ "$VERSION" != *"dev"* ]]; then
+        echo "Updating rocm/$PURE_VERSION/ index..."
+        aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
+    fi
+fi
+
+# ======== Part 4: Summary ========
+
+echo ""
+echo "========================================"
+echo "ROCm Wheel Upload Complete!"
+echo "========================================"
+echo ""
+echo "Wheels available at:"
+echo "  s3://$BUCKET/$ROCM_SUBPATH/"
+echo ""
+echo "Install command (by commit):"
+echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
+echo ""
+if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
+    echo "Install command (nightly):"
+    echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
+fi
+echo ""
+echo "Wheel count: $WHEEL_COUNT"
+echo "========================================"
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -1,91 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-# Assume wheels are in artifacts/dist/*.whl
-wheel_files=(artifacts/dist/*.whl)
-
-# Check that exactly one wheel is found
-if [[ ${#wheel_files[@]} -ne 1 ]]; then
-  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
-  exit 1
-fi
-
-# Get the single wheel file
-wheel="${wheel_files[0]}"
-
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
-    manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
-    manylinux_version="manylinux2014"
-else
-    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
-    manylinux_version="manylinux1"
-fi
-
-# Rename 'linux' to the appropriate manylinux version in the wheel filename
-new_wheel="${wheel/linux/$manylinux_version}"
-mv -- "$wheel" "$new_wheel"
-wheel="$new_wheel"
-
-# Extract the version from the wheel
-version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
-
-normal_wheel="$wheel" # Save the original wheel filename
-
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-    suffix="${version##*.}"
-    if [[ $suffix == cu* ]]; then
-        new_version="1.0.0.dev+${suffix}"
-    else
-        new_version="1.0.0.dev"
-    fi
-    new_wheel="${wheel/$version/$new_version}"
-    # use cp to keep both files in the artifacts directory
-    cp -- "$wheel" "$new_wheel"
-    wheel="$new_wheel"
-    version="$new_version"
-fi
-
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
-
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
-    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
-fi
-
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-fi
-
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -25,6 +25,7 @@
 #     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
 # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
 # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch.

 # When adding a test
 # - If the test belongs to an existing group, add it there
@@ -38,7 +39,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@@ -56,22 +57,30 @@ steps:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
+  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/tool_parsers
  - tests/transformers_utils
+  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s renderers
+  - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@@ -107,7 +116,7 @@ steps:
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

 - label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
@@ -125,7 +134,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
@@ -137,10 +146,26 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
  - pytest -v -s entrypoints/test_chat_utils.py

+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc
+  - pytest -v -s tool_use
+
 - label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@@ -154,6 +179,18 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling

+- label: Entrypoints Integration Test (Responses API)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai/responses
+  commands:
+  - pytest -v -s entrypoints/openai/responses
+
 - label: Distributed Tests (4 GPUs) # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@@ -164,7 +201,7 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
@@ -172,6 +209,8 @@ steps:
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
@@ -187,12 +226,13 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -203,6 +243,24 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
@@ -212,8 +270,8 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

- label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Execution Test # 10min
+  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -221,6 +279,7 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py

 - label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
@@ -251,21 +310,18 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/engine
-  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
@@ -277,7 +333,10 @@ steps:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
+    # Run this test standalone for now;
+    # need to untangle use (implicit) use of spawn/fork across the tests.
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
@@ -295,6 +354,7 @@ steps:
    - vllm/
    - tests/v1
  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
@@ -302,15 +362,51 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not slow_test' v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: Batch Invariance Tests (H100) # 10min
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - vllm/model_executor/layers
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
 - label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
    - vllm/
@@ -331,25 +427,31 @@ steps:
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
+  - vllm/multimodal
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
+    # for basic
+    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/audio_language.py --seed 0
-    - python3 offline_inference/vision_language.py --seed 0
-    - python3 offline_inference/vision_language_pooling.py --seed 0
-    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@@ -384,7 +486,12 @@ steps:
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss_tp.py \
+      --ignore=lora/test_qwen3moe_tp.py
+
  parallelism: 4

 - label: PyTorch Compilation Unit Tests # 15min
@@ -395,15 +502,14 @@ steps:
    - vllm/
    - tests/compile
  commands:
-    - pytest -v -s compile/test_pass_manager.py
-    - pytest -v -s compile/test_fusion.py
-    - pytest -v -s compile/test_fusion_attn.py
-    - pytest -v -s compile/test_functionalization.py
-    - pytest -v -s compile/test_silu_mul_quant_fusion.py
-    - pytest -v -s compile/test_fusion_all_reduce.py
-    - pytest -v -s compile/test_decorator.py
-    - pytest -v -s compile/test_noop_elimination.py
-    - pytest -v -s compile/test_aot_compile.py
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -413,18 +519,41 @@ steps:
  - vllm/
  - tests/compile
  commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/piecewise/
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
-  - pytest -v -s compile/test_full_graph.py
+    # fp8 kv scales not supported on sm89, tested on Blackwell instead
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -441,8 +570,9 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/attention/
-  - vllm/attention
  - vllm/v1/attention
+    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
+  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -468,6 +598,8 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@@ -482,10 +614,82 @@ steps:
  commands:
    - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels Helion Test
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/utils/import_utils.py
+  - tests/kernels/helion/
+  commands:
+    - pip install helion
+    - pytest -v -s kernels/helion/
+
+  
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutlass_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer.py
+    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+    - pytest -v -s kernels/moe/test_moe.py
+    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
+    - pytest -v -s kernels/moe/test_block_int8.py
+    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 2
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+    - pytest -v -s kernels/moe/test_deepep_moe.py
+    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+  
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  num_gpus: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
+
+
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
+  torch_nightly: true
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -527,8 +731,10 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@@ -536,8 +742,9 @@ steps:
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

 - label: OpenAI API correctness # 22min
  timeout_in_minutes: 30
@@ -549,25 +756,6 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s -m 'not cpu_test' tool_use
-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  no_gpu: true
-  commands:
-    - pytest -v -s -m 'cpu_test' tool_use
-
 #####  models test  #####

 - label: Basic Models Tests (Initialization)
@@ -577,6 +765,7 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
+  - tests/models/registry.py
  commands:
    # Run a subset of model initialization tests
    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -587,7 +776,9 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
+  - vllm/transformers_utils/
  - tests/models/test_initialization.py
+  - tests/models/registry.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
    # subset of supported models (the complement of the small subset in the above
@@ -677,8 +868,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (PPL)
@@ -711,14 +904,24 @@ steps:
  commands:
    - pytest -v -s models/language/pooling_mteb_test

- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor Test
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py

 - label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 80
@@ -733,6 +936,16 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@@ -785,6 +998,7 @@ steps:
 - label: Transformers Nightly Models Test
  working_dir: "/vllm-workspace/"
  optional: true
+  soft_fail: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
@@ -796,27 +1010,29 @@ steps:
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 23 min
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
-  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
+  - vllm/platforms/cuda.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -828,15 +1044,48 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-    # Fusion
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
-    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    # e2e
+    - pytest -v -s tests/models/quantization/test_nvfp4.py
+
+- label: Blackwell Fusion and Compile Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    #  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    #  # Wrap with quotes to escape yaml
+    #  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@@ -877,7 +1126,7 @@ steps:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

 #####  1 GPU test  #####
 #####  multi gpus test  #####
@@ -909,17 +1158,18 @@ steps:
  - vllm/model_executor/models/
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
+  - .buildkite/scripts/run-multi-node-test.sh
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 68min
  timeout_in_minutes: 90
@@ -934,7 +1184,7 @@ steps:
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
@@ -943,13 +1193,17 @@ steps:
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -993,13 +1247,18 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+  - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins

 - label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60
@@ -1027,11 +1286,15 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # Alot of these tests are on the edge of OOMing
+    - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
@@ -1058,6 +1321,28 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

+- label: NixlConnector PD accuracy tests (Distributed) # 40min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+

 ##### multi gpus test #####
 ##### A100 test #####
@@ -1076,6 +1361,20 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

+- label: Acceptance Length Test (Large Models) # optional
+  timeout_in_minutes: 120
+  gpu: h100
+  optional: true
+  num_gpus: 1
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/model_executor/models/mlp_speculator.py
+  - tests/v1/spec_decode/test_acceptance_length.py
+  commands:
+    - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
+    - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
+
 - label: LM Eval Large Models # optional
  gpu: a100
  optional: true
@@ -1088,17 +1387,51 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

-##### H200 test #####
- label: Distrubted Tests (H200) # optional
-  gpu: h200
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: Sequence Parallel Tests (H100) # 60 min
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: h100
+  optional: true
+  num_gpus: 2
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run sequence parallel tests
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+
+- label: Distributed Tests (H100) # optional
+  gpu: h100
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - pytest -v -s tests/compile/test_async_tp.py
-    - pytest -v -s tests/compile/test_sequence_parallelism.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### H200 test #####
+
+- label: LM Eval Large Models (H200) # optional
+  timeout_in_minutes: 60
+  gpu: h200
+  optional: true
+  num_gpus: 8
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt

 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1109,15 +1442,68 @@ steps:
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
  timeout_in_minutes: 30
  optional: true
+  soft_fail: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
+    - nvidia-smi
    - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+##### MoE Refactor (Temporary) Tests #####
+
+- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
+  
+- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
+
+- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,25 @@
+group: Attention
+depends_on: 
+  - image-build
+steps:
+- label: V1 attention (H100)
+  timeout_in_minutes: 30
+  device: h100
+  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+  timeout_in_minutes: 30
+  device: b200
+  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on: 
+  - image-build
+steps:
+- label: Basic Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml`