[Bugfix] Fix LoRA extra vocab size (#15047)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
[Bugfix] Fix broken CPU quantization due to triton import (#15038)
2025-03-18 10:51:10 -07:00 · 2025-03-18 10:51:10 -07:00 · 2025-03-18 10:51:10 -07:00 · 2025-03-18 10:51:10 -07:00 · 2025-03-18 10:51:10 -07:00 · 2025-03-18 10:51:10 -07:00
4340 changed files with 272917 additions and 803011 deletions
--- a/.buildkite/.pipeline_gen_v2
+++ b/.buildkite/.pipeline_gen_v2
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -1,20 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
-# Note that we have 800 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


 def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, "r") as z:
+    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
@@ -29,18 +28,14 @@ def check_wheel_size(directory):
                wheel_path = os.path.join(root, file_name)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(
-                        f"Not allowed: Wheel {wheel_path} is larger "
-                        f"({wheel_size_mb:.2f} MB) than the limit "
-                        f"({VLLM_MAX_SIZE_MB} MB)."
-                    )
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
-                    print(
-                        f"Wheel {wheel_path} is within the allowed size "
-                        f"({wheel_size_mb:.2f} MB)."
-                    )
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb:.2f} MB).")
    return 0


@@ -50,4 +45,4 @@ if __name__ == "__main__":
        sys.exit(1)

    directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    sys.exit(check_wheel_size(directory))
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -1,25 +0,0 @@
-name: vllm_ci
-job_dirs:
-  - ".buildkite/image_build"
-  - ".buildkite/test_areas"
-  - ".buildkite/hardware_tests"
-run_all_patterns:
-  - "docker/Dockerfile"
-  - "CMakeLists.txt"
-  - "requirements/common.txt"
-  - "requirements/cuda.txt"
-  - "requirements/build.txt"
-  - "requirements/test.txt"
-  - "setup.py"
-  - "csrc/"
-  - "cmake/"
-run_all_exclude_patterns:
-  - "docker/Dockerfile."
-  - "csrc/cpu/"
-  - "csrc/rocm/"
-  - "cmake/hipify.py"
-  - "cmake/cpu_extension.cmake"
-registries: public.ecr.aws/q9t5s3a7
-repositories:
-  main: "vllm-ci-postmerge-repo"
-  premerge: "vllm-ci-test-repo"
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+
+"""Write an index.html in the current directory that links to one wheel."""
+
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+    <h1>Links for vLLM</h1>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+# cloudfront requires escaping the '+' character, so the href uses %2B
+escaped = filename.replace("+", "%2B")
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    f.write(template.format(wheel=filename, wheel_html_escaped=escaped))
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -1,29 +0,0 @@
-group: Hardware
-steps:
-  - label: "AMD: :docker: build image"
-    depends_on: []
-    device: amd_cpu
-    no_plugin: true
-    commands:
-    - >
-      docker build
-      --build-arg max_jobs=16
-      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
-      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
-      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-      -f docker/Dockerfile.rocm
-      --target test
-      --no-cache
-      --progress plain .
-    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 1
-        - exit_status: -10  # Agent was lost
-          limit: 1
-        - exit_status: 1  # Machine occasionally fail
-          limit: 1
--- a/.buildkite/hardware_tests/ascend_npu.yaml
+++ b/.buildkite/hardware_tests/ascend_npu.yaml
@@ -1,10 +0,0 @@
-group: Hardware
-depends_on: ~
-steps:
-  - label: "Ascend NPU Test"
-    soft_fail: true
-    timeout_in_minutes: 20
-    no_plugin: true
-    device: ascend_npu
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -1,100 +0,0 @@
-group: CPU
-depends_on: []
-steps:
- label: CPU-Kernel Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - cmake/cpu_extension.cmake
-  - CMakeLists.txt
-  - vllm/_custom_ops.py
-  - tests/kernels/attention/test_cpu_attn.py
-  - tests/kernels/moe/test_cpu_fused_moe.py
-  - tests/kernels/test_onednn.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-      pytest -x -v -s tests/kernels/test_onednn.py"
-
- label: CPU-Language Generation and Pooling Model Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - vllm/
-  - tests/models/language/generation/
-  - tests/models/language/pooling/
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
-      pytest -x -v -s tests/models/language/generation -m cpu_model
-      pytest -x -v -s tests/models/language/pooling -m cpu_model"
-
- label: CPU-Quantization Model Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/
-  - vllm/model_executor/layers/quantization/cpu_wna16.py
-  - vllm/model_executor/layers/quantization/gptq_marlin.py
-  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
-  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
-  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
-  - tests/quantization/test_compressed_tensors.py
-  - tests/quantization/test_cpu_wna16.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
-      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
-      
- label: CPU-Distributed Tests
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  - csrc/cpu/shm.cpp
-  - vllm/v1/worker/cpu_worker.py
-  - vllm/v1/worker/gpu_worker.py
-  - vllm/v1/worker/cpu_model_runner.py
-  - vllm/v1/worker/gpu_model_runner.py
-  - vllm/platforms/cpu.py
-  - vllm/distributed/parallel_state.py
-  - vllm/distributed/device_communicators/cpu_communicator.py
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
-
- label: CPU-Multi-Modal Model Tests %N
-  depends_on: []
-  soft_fail: true
-  device: intel_cpu
-  no_plugin: true
-  source_file_dependencies:
-  # - vllm/
-  - vllm/model_executor/layers/rotary_embedding
-  - tests/models/multimodal/generation/
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
-      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
-  parallelism: 2
-
- label: "Arm CPU Test"
-  depends_on: []
-  soft_fail: true
-  device: arm_cpu
-  no_plugin: true
-  commands: 
-  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/gh200.yaml
+++ b/.buildkite/hardware_tests/gh200.yaml
@@ -1,10 +0,0 @@
-group: Hardware
-steps:
-  - label: "GH200 Test"
-    soft_fail: true
-    device: gh200
-    no_plugin: true
-    optional: true
-    commands: 
-    - nvidia-smi 
-    - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -1,17 +0,0 @@
-group: Hardware
-depends_on: ~
-steps:
-  - label: "Intel HPU Test"
-    soft_fail: true
-    device: intel_hpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
-
-  - label: "Intel GPU Test"
-    depends_on: []
-    soft_fail: true
-    device: intel_gpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -1,256 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-# replace invalid characters in Docker image tags and truncate to 128 chars
-clean_docker_tag() {
-    local input="$1"
-    echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
-}
-
-print_usage_and_exit() {
-    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
-    exit 1
-}
-
-print_instance_info() {
-    echo ""
-    echo "=== Debug: Instance Information ==="
-    # Get IMDSv2 token
-    if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
-        AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
-        INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
-        INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
-        AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
-            http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
-        echo "AMI ID:        ${AMI_ID}"
-        echo "Instance Type: ${INSTANCE_TYPE}"
-        echo "Instance ID:   ${INSTANCE_ID}"
-        echo "AZ:            ${AZ}"
-    else
-        echo "Not running on EC2 or IMDS not available"
-    fi
-    # Check for warm cache AMI (marker file baked into custom AMI)
-    if [[ -f /etc/vllm-ami-info ]]; then
-        echo "Cache:         warm (custom vLLM AMI)"
-        cat /etc/vllm-ami-info
-    else
-        echo "Cache:         cold (standard AMI)"
-    fi
-    echo "==================================="
-    echo ""
-}
-
-setup_buildx_builder() {
-    echo "--- :buildkite: Setting up buildx builder"
-    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
-        # Custom AMI with standalone buildkitd - use remote driver for warm cache
-        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
-        echo "Using remote driver to connect to buildkitd (warm cache available)"
-        if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
-            echo "Using existing baked-vllm-builder"
-            docker buildx use baked-vllm-builder
-        else
-            echo "Creating baked-vllm-builder with remote driver"
-            docker buildx create \
-                --name baked-vllm-builder \
-                --driver remote \
-                --use \
-                "unix://${BUILDKIT_SOCKET}"
-        fi
-        docker buildx inspect --bootstrap
-    elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
-        # Existing builder available
-        echo "Using existing builder: ${BUILDER_NAME}"
-        docker buildx use "${BUILDER_NAME}"
-        docker buildx inspect --bootstrap
-    else
-        # No local buildkitd, no existing builder - create new docker-container builder
-        echo "No local buildkitd found, using docker-container driver"
-        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
-        docker buildx inspect --bootstrap
-    fi
-
-    # builder info
-    echo "Active builder:"
-    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
-}
-
-check_and_skip_if_image_exists() {
-    if [[ -n "${IMAGE_TAG:-}" ]]; then
-        echo "--- :mag: Checking if image exists"
-        if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
-            echo "Image already exists: ${IMAGE_TAG}"
-            echo "Skipping build"
-            exit 0
-        fi
-        echo "Image not found, proceeding with build"
-    fi
-}
-
-ecr_login() {
-    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
-}
-
-prepare_cache_tags() {
-    # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
-    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
-    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
-
-    if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-            cache="${MAIN_CACHE_ECR}:latest"
-        else
-            clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
-            cache="${TEST_CACHE_ECR}:${clean_branch}"
-        fi
-        CACHE_TO="$cache"
-        CACHE_FROM="$cache"
-        CACHE_FROM_BASE_BRANCH="$cache"
-    else
-        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
-        CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
-        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
-            CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
-        else
-            clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
-            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
-        fi
-    fi
-
-    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
-    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
-}
-
-resolve_parent_commit() {
-    if [[ -z "${PARENT_COMMIT:-}" ]]; then
-        PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
-        if [[ -n "${PARENT_COMMIT}" ]]; then
-            echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
-            export PARENT_COMMIT
-        else
-            echo "Could not determine parent commit (may be first commit in repo)"
-        fi
-    else
-        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
-    fi
-}
-
-print_bake_config() {
-    echo "--- :page_facing_up: Resolved bake configuration"
-    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
-    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
-    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
-    echo "--- :arrow_down: Uploading bake config to Buildkite"
-    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
-}
-
-#################################
-#         Main Script           #
-#################################
-print_instance_info
-
-if [[ $# -lt 7 ]]; then
-    print_usage_and_exit
-fi
-
-# input args
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-BRANCH=$4
-VLLM_USE_PRECOMPILED=$5
-VLLM_MERGE_BASE_COMMIT=$6
-IMAGE_TAG=$7
-IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
-
-# build config
-TARGET="test-ci"
-VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
-BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
-CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
-CI_HCL_PATH="/tmp/ci.hcl"
-BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
-
-prepare_cache_tags
-ecr_login
-
-# Environment info (for docs and human readers)
-#   VLLM_CI_BRANCH      - ci-infra branch to use (default: main)
-#   VLLM_BAKE_FILE_PATH      - Path to vLLM's bake file (default: docker/docker-bake.hcl)
-#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
-#
-# Build configuration (exported as environment variables for bake):
-export BUILDKITE_COMMIT
-export PARENT_COMMIT
-export IMAGE_TAG
-export IMAGE_TAG_LATEST
-export CACHE_FROM
-export CACHE_FROM_BASE_BRANCH
-export CACHE_FROM_MAIN
-export CACHE_TO
-export VLLM_USE_PRECOMPILED
-export VLLM_MERGE_BASE_COMMIT
-
-# print args
-echo "--- :mag: Arguments"
-echo "REGISTRY: ${REGISTRY}"
-echo "REPO: ${REPO}"
-echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
-echo "BRANCH: ${BRANCH}"
-echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
-echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
-echo "IMAGE_TAG: ${IMAGE_TAG}"
-echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
-
-# print build configuration
-echo "--- :mag: Build configuration"
-echo "TARGET: ${TARGET}"
-echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
-echo "BUILDER_NAME: ${BUILDER_NAME}"
-echo "CI_HCL_URL: ${CI_HCL_URL}"
-echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
-
-echo "--- :mag: Cache tags"
-echo "CACHE_TO: ${CACHE_TO}"
-echo "CACHE_FROM: ${CACHE_FROM}"
-echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
-echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
-
-check_and_skip_if_image_exists
-
-echo "--- :docker: Setting up Docker buildx bake"
-echo "Target: ${TARGET}"
-echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
-echo "CI HCL path: ${CI_HCL_PATH}"
-
-if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
-    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
-    echo "Make sure you're running from the vLLM repository root"
-    exit 1
-fi
-
-echo "--- :arrow_down: Downloading ci.hcl"
-curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
-echo "Downloaded to ${CI_HCL_PATH}"
-
-if [[ ! -f "${CI_HCL_PATH}" ]]; then
-    echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
-    exit 1
-fi
-
-setup_buildx_builder
-
-resolve_parent_commit
-export PARENT_COMMIT
-
-print_bake_config
-
-echo "--- :docker: Building ${TARGET}"
-docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
-
-echo "--- :white_check_mark: Build complete"
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -1,58 +0,0 @@
-group: Abuild
-steps:
-  - label: ":docker: Build image"
-    key: image-build
-    depends_on: []
-    commands:
-    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
-    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 2
-        - exit_status: -10  # Agent was lost
-          limit: 2
-
-  - label: ":docker: Build CPU image"
-    key: image-build-cpu
-    depends_on: []
-    commands:
-    - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 2
-        - exit_status: -10  # Agent was lost
-          limit: 2
-
-  - label: ":docker: Build HPU image"
-    soft_fail: true
-    depends_on: []
-    key: image-build-hpu
-    commands:
-    - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 2
-        - exit_status: -10  # Agent was lost
-          limit: 2
-  
-  - label: ":docker: Build CPU arm64 image"
-    key: cpu-arm64-image-build
-    depends_on: []
-    optional: true
-    commands:
-    - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 2
-        - exit_status: -10  # Agent was lost
-          limit: 2
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -1,36 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit>"
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build --file docker/Dockerfile.cpu \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg VLLM_CPU_AVX512BF16=true \
-  --build-arg VLLM_CPU_AVX512VNNI=true \
-  --build-arg VLLM_CPU_AMXBF16=true \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
-  --target vllm-test \
-  --progress plain .
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -1,33 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit>"
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build --file docker/Dockerfile.cpu \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
-  --target vllm-test \
-  --progress plain .
-
-# push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -1,34 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit>"
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build \
-  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
-  --progress plain \
-  https://github.com/vllm-project/vllm-gaudi.git
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.335
-  - name: "exact_match,flexible-extract"
-    value: 0.323
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -1,12 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-backend: "vllm-vlm"
-tasks:
- name: "chartqa"
-  metrics:
-  - name: "relaxed_accuracy,none"
-    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
-    value: 0.80
-limit: 100
-num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -1,11 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-tasks:
- name: "mmlu_pro"
-  metrics:
-  - name: "exact_match,custom-extract"
-    value: 0.80
-limit: 250 # will run on 250 * 14 subjects = 3500 samples
-num_fewshot: 5
-rtol: 0.05
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -1,15 +0,0 @@
-model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.695
-  - name: "exact_match,flexible-extract"
-    value: 0.447
-limit: 1319
-num_fewshot: 5
-max_model_len: 262144
-enforce_eager: false
-apply_chat_template: true
-fewshot_as_multiturn: true
-trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
@@ -1,19 +0,0 @@
-model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.7142
-  - name: "exact_match,flexible-extract"
-    value: 0.4579
-env_vars:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
-limit: 1319
-num_fewshot: 5
-max_model_len: 262144
-kv_cache_dtype: fp8
-enforce_eager: false
-apply_chat_template: true
-fewshot_as_multiturn: true
-trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
-model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.30
-  - name: "exact_match,flexible-extract"
-    value: 0.465
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
+model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.595
+  - name: "exact_match,flexible-extract"
+    value: 0.582
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
-model_name: "Qwen/Qwen2.5-1.5B-Instruct"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.54
-  - name: "exact_match,flexible-extract"
-    value: 0.59
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size)
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
-model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.47
-  - name: "exact_match,flexible-extract"
-    value: 0.64
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
-
-model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
-backend: "vllm-vlm"
-tasks:
- name: "chartqa"
-  metrics:
-  - name: "relaxed_accuracy,none"
-    value: 0.855
-limit: 2500
-num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -1,14 +0,0 @@
-model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
-tasks:
-  - name: "mmlu_pro"
-    metrics:
-      - name: "exact_match,custom-extract"
-        value: 0.82
-limit: 250 # will run on 250 * 14 subjects = 3500 samples
-num_fewshot: 5
-enforce_eager: false # we use false to speed up the eval process
-kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
-max_model_len: 40960
-apply_chat_template: true
-fewshot_as_multiturn: true
-gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -1,2 +0,0 @@
-Qwen3-235B-A22B-Instruct-2507-FP8.yaml
-NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@@ -1 +0,0 @@
-Qwen2.5-VL-7B-Instruct.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
@@ -1,5 +0,0 @@
-Qwen2.5-1.5B-Instruct.yaml
-Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@@ -1,44 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from pathlib import Path
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--config-list-file",
-        action="store",
-        help="Path to the file listing model config YAMLs (one per line)",
-    )
-    parser.addoption(
-        "--tp-size",
-        action="store",
-        default="1",
-        help="Tensor parallel size to use for evaluation",
-    )
-
-
-@pytest.fixture(scope="session")
-def config_list_file(pytestconfig, config_dir):
-    rel_path = pytestconfig.getoption("--config-list-file")
-    return config_dir / rel_path
-
-
-@pytest.fixture(scope="session")
-def tp_size(pytestconfig):
-    return pytestconfig.getoption("--tp-size")
-
-
-def pytest_generate_tests(metafunc):
-    if "config_filename" in metafunc.fixturenames:
-        rel_path = metafunc.config.getoption("--config-list-file")
-        config_list_file = Path(rel_path).resolve()
-        config_dir = config_list_file.parent
-        with open(config_list_file, encoding="utf-8") as f:
-            configs = [
-                config_dir / line.strip()
-                for line in f
-                if line.strip() and not line.startswith("#")
-            ]
-        metafunc.parametrize("config_filename", configs)
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -1,44 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on chartqa for vllm.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on ChartQA using multimodal vllm."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our correctness tests in vllm's CI."
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -l    - limit number of samples to run"
-    echo "  -t    - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:l:t:" OPT; do
-  case ${OPT} in
-    m ) 
-        MODEL="$OPTARG"
-        ;;
-    l ) 
-        LIMIT="$OPTARG"
-        ;;
-    t ) 
-        TP_SIZE="$OPTARG"
-        ;;
-    \? ) 
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model vllm-vlm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
-  --tasks chartqa \
-  --batch_size auto \
-  --apply_chat_template \
-  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install lm-eval==0.4.4

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install lm-eval==0.4.4

 usage() {
    echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done

 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -1,50 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
-# We use this for fp8, which HF does not support.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our automated nm-test-accuracy workflow"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -l    - limit number of samples to run"
-    echo "  -f    - number of fewshot samples to use"
-    echo "  -t    - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:b:l:f:t:" OPT; do
-  case ${OPT} in
-    m )
-        MODEL="$OPTARG"
-        ;;
-    b )
-        BATCH_SIZE="$OPTARG"
-        ;;
-    l )
-        LIMIT="$OPTARG"
-        ;;
-    f )
-        FEWSHOT="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
-  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
-  --batch_size auto
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c ) 
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+    
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,107 +1,69 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

-pytest -s -v test_lm_eval_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4 
+* pytest -s test_lm_eval_correctness.py
 """

 import os
-from contextlib import contextmanager
+from pathlib import Path

 import lm_eval
-import numpy as np
+import numpy
+import pytest
 import yaml

-DEFAULT_RTOL = 0.08
+RTOL = 0.05
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


-@contextmanager
-def scoped_env_vars(new_env: dict[str, str]):
-    if not new_env:
-        # Fast path: nothing to do
-        yield
-        return
+def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)

-    old_values = {}
-    new_keys = []
+    model_args = f"pretrained={eval_config['model_name']}," \
+                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"

-    try:
-        for key, value in new_env.items():
-            if key in os.environ:
-                old_values[key] = os.environ[key]
-            else:
-                new_keys.append(key)
-            os.environ[key] = str(value)
-        yield
-    finally:
-        # Restore / clean up
-        for key, value in old_values.items():
-            os.environ[key] = value
-        for key in new_keys:
-            os.environ.pop(key, None)
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        batch_size="auto")

-
-def launch_lm_eval(eval_config, tp_size):
-    trust_remote_code = eval_config.get("trust_remote_code", False)
-    max_model_len = eval_config.get("max_model_len", 4096)
-    batch_size = eval_config.get("batch_size", "auto")
-    backend = eval_config.get("backend", "vllm")
-    enforce_eager = eval_config.get("enforce_eager", "true")
-    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
-    model_args = (
-        f"pretrained={eval_config['model_name']},"
-        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager={enforce_eager},"
-        f"kv_cache_dtype={kv_cache_dtype},"
-        f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
-        "allow_deprecated_quantization=True,"
-    )
-
-    env_vars = eval_config.get("env_vars", None)
-    with scoped_env_vars(env_vars):
-        results = lm_eval.simple_evaluate(
-            model=backend,
-            model_args=model_args,
-            tasks=[task["name"] for task in eval_config["tasks"]],
-            num_fewshot=eval_config["num_fewshot"],
-            limit=eval_config["limit"],
-            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-            # text models. however, this is regressing measured strict-match for
-            # existing text models in CI, so only apply it for mm, or explicitly set
-            apply_chat_template=eval_config.get(
-                "apply_chat_template", backend == "vllm-vlm"
-            ),
-            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-            gen_kwargs=eval_config.get("gen_kwargs"),
-            batch_size=batch_size,
-        )
    return results


-def test_lm_eval_correctness_param(config_filename, tp_size):
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

-    results = launch_lm_eval(eval_config, tp_size)
+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")

-    rtol = eval_config.get("rtol", DEFAULT_RTOL)
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)

+    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
-            print(
-                f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth:.3f} | "
-                f"measured={measured_value:.3f} | rtol={rtol}"
-            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+            print(f'{task["name"]} | {metric["name"]}: '
+                  f'ground_truth={ground_truth} | measured={measured_value}')
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)

+    # Assert at the end, print all scores even on failure for debugging.
    assert success
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -0,0 +1,143 @@
+# vLLM benchmark suite
+
+## Introduction
+
+This directory contains two sets of benchmarks for vllm.
+
+- Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
+- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
+
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+
+## Performance benchmark quick overview
+
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
+
+**Benchmarking Duration**: about 1hr.
+
+**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
+
+## Nightly benchmark quick overview
+
+**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
+
+**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
+
+**Benchmarking Duration**: about 3.5hrs.
+
+## Trigger the benchmark
+
+Performance benchmark will be triggered when:
+- A PR being merged into vllm.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
+
+Nightly benchmark will be triggered when:
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
+
+## Performance benchmark details
+
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+
+### Latency test
+
+Here is an example of one test inside `latency-tests.json`:
+
+```json
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
+```
+
+In this example:
+
+- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
+- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+
+Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
+
+### Throughput test
+
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `benchmark_throughput.py`.
+
+The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
+
+### Serving test
+
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+
+```json
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
+```
+
+Inside this example:
+
+- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
+- The `server_parameters` attribute includes the command line arguments for the vLLM server.
+- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+
+The numbers from this test are less stable than those from the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in these numbers (e.g. a 5% change) is still significant.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+
+### Visualizing the results
+
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait until the benchmark finishes running.
+The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
+
+## Nightly test details
+
+See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
+
+### Workflow
+
+- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
+- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
+- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
+- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to buildkite.
+
+### Nightly tests
+
+In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmark.
+
+### Docker containers
+
+The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
+
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
+
+WARNING: updating `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -0,0 +1,184 @@
+steps:
+  - label: "Wait for container to be ready"
+    key: wait-for-container-image
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: badouralix/curl-jq
+            command:
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+  - label: "Cleanup H100"
+    agents:
+      queue: H100
+    depends_on: ~
+    command: docker system prune -a --volumes --force
+  
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  # Premerge benchmark
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@@ -0,0 +1,27 @@
+
+## Description
+
+This file contains the download links for the benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+## Results reproduction
+
+- Find the docker we use in `benchmarking pipeline`
+- Deploy the docker, and inside the docker:
+  - Download `nightly-benchmarks.zip`.
+  - In the same folder, run the following code:
+
+  ```console
+  export HF_TOKEN=<your HF token>
+  apt update
+  apt install -y git
+  unzip nightly-benchmarks.zip
+  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  ```
+
+And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -0,0 +1,39 @@
+
+# Nightly benchmark
+
+This benchmark aims to:
+
+- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance for which workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
+## Setup
+
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
+    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+## Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -0,0 +1,196 @@
+common_pod_spec: &common_pod_spec
+  priorityClassName: perf-benchmark
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  volumes:
+    - name: devshm
+      emptyDir:
+        medium: Memory
+    - name: hf-cache
+      hostPath:
+        path: /root/.cache/huggingface
+        type: Directory
+
+common_container_settings: &common_container_settings
+  command:
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  resources:
+    limits:
+      nvidia.com/gpu: 8
+  volumeMounts:
+    - name: devshm
+      mountPath: /dev/shm
+    - name: hf-cache
+      mountPath: /root/.cache/huggingface
+  env:
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+
+steps:
+  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+
+
+
+  - label: "A100 vllm step 10"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: vllm/vllm-openai:v0.6.2
+                <<: *common_container_settings
+
+
+
+  - label: "A100 sglang benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: lmsysorg/sglang:v0.3.2-cu121
+                <<: *common_container_settings
+
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: openmmlab/lmdeploy:v0.6.1-cu12
+                <<: *common_container_settings
+
+
+
+
+  - label: "A100 trt llama-8B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama8B"
+
+
+  - label: "A100 trt llama-70B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama70B"
+
+
+  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+  #               <<: *common_container_settings
+
+
+  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+  #               <<: *common_container_settings
+        
+  - wait
+
+  - label: "Collect the results"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+            - image: vllm/vllm-openai:v0.5.0.post1
+              command:
+              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+              resources:
+                limits:
+                  nvidia.com/gpu: 8
+              volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: VLLM_SOURCE_CODE_LOC
+                value: /workspace/build/buildkite/vllm/performance-benchmark
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+
+  - block: ":rocket: check the results!"
--- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
@@ -1,12 +1,10 @@
-# Performance benchmarks descriptions

 ## Latency tests

 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).

 {latency_tests_markdown_table}
@@ -16,8 +14,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.

 {throughput_tests_markdown_table}
@@ -28,18 +25,12 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.

 {serving_tests_markdown_table}

-## Platform Information
-
-{platform_markdown_table}
-
 ## json version of the benchmarking tables

 This section contains the data of the markdown tables above in JSON format.
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# latency results and the keys that will be printed into markdown
+latency_results = []
+latency_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "avg_latency": "Mean latency (ms)",
+    # "P10": "P10 (s)",
+    # "P25": "P25 (s)",
+    "P50": "Median latency (ms)",
+    # "P75": "P75 (s)",
+    # "P90": "P90 (s)",
+    "P99": "P99 latency (ms)",
+}
+
+# throughput tests and the keys that will be printed into markdown
+throughput_results = []
+throughput_results_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    # "num_requests": "# of req.",
+    # "total_num_tokens": "Total # of tokens",
+    # "elapsed_time": "Elapsed time (s)",
+    "requests_per_second": "Tput (req/s)",
+    # "tokens_per_second": "Tput (tok/s)",
+}
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    # "completed": "# of req.",
+    "request_throughput": "Tput (req/s)",
+    # "input_throughput": "Input Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "p99_ttft_ms": "P99 TTFT (ms)",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
+    # "median_tpot_ms": "Median",
+    # "p99_tpot_ms": "P99",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "median_itl_ms": "Median ITL (ms)",
+    "p99_itl_ms": "P99 ITL (ms)",
+}
+
+
+def read_markdown(file):
+    """Return the text of ``file`` with a trailing newline appended.
+
+    If ``file`` does not exist, the string "<file> not found." (plus a
+    newline) is returned instead of raising, so the caller always gets text.
+    """
+    if os.path.exists(file):
+        with open(file) as f:
+            return f.read() + "\n"
+    else:
+        return f"{file} not found.\n"
+
+
+def results_to_json(latency, throughput, serving):
+    """Serialize the three result tables into one JSON string.
+
+    Each argument must provide ``to_dict()`` (this script passes pandas
+    DataFrames). The returned JSON object has the keys 'latency',
+    'throughput' and 'serving'.
+    """
+    return json.dumps({
+        'latency': latency.to_dict(),
+        'throughput': throughput.to_dict(),
+        'serving': serving.to_dict()
+    })
+
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file) as f:
+            raw_result = json.loads(f.read())
+
+        if "serving" in str(test_file):
+            # this result is generated via `benchmark_serving.py`
+
+            # attach the benchmarking command to raw_result
+            try:
+                with open(test_file.with_suffix(".commands")) as f:
+                    command = json.loads(f.read())
+            except OSError as e:
+                print(e)
+                continue
+
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            serving_results.append(raw_result)
+            continue
+
+        elif "latency" in f.name:
+            # this result is generated via `benchmark_latency.py`
+
+            # attach the benchmarking command to raw_result
+            try:
+                with open(test_file.with_suffix(".commands")) as f:
+                    command = json.loads(f.read())
+            except OSError as e:
+                print(e)
+                continue
+
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # get different percentiles
+            for perc in [10, 25, 50, 75, 90, 99]:
+                # Multiply 1000 to convert the time unit from s to ms
+                raw_result.update(
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
+
+            # add the result to raw_result
+            latency_results.append(raw_result)
+            continue
+
+        elif "throughput" in f.name:
+            # this result is generated via `benchmark_throughput.py`
+
+            # attach the benchmarking command to raw_result
+            try:
+                with open(test_file.with_suffix(".commands")) as f:
+                    command = json.loads(f.read())
+            except OSError as e:
+                print(e)
+                continue
+
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            throughput_results.append(raw_result)
+            continue
+
+        print(f"Skipping {test_file}")
+
+    latency_results = pd.DataFrame.from_dict(latency_results)
+    serving_results = pd.DataFrame.from_dict(serving_results)
+    throughput_results = pd.DataFrame.from_dict(throughput_results)
+
+    raw_results_json = results_to_json(latency_results, throughput_results,
+                                       serving_results)
+
+    # remapping the key, for visualization purpose
+    if not latency_results.empty:
+        latency_results = latency_results[list(
+            latency_column_mapping.keys())].rename(
+                columns=latency_column_mapping)
+    if not serving_results.empty:
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+    if not throughput_results.empty:
+        throughput_results = throughput_results[list(
+            throughput_results_column_mapping.keys())].rename(
+                columns=throughput_results_column_mapping)
+
+    processed_results_json = results_to_json(latency_results,
+                                             throughput_results,
+                                             serving_results)
+
+    for df in [latency_results, serving_results, throughput_results]:
+        if df.empty:
+            continue
+
+        # Sort all dataframes by their respective "Test name" columns
+        df.sort_values(by="Test name", inplace=True)
+
+        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
+        # we want to turn it into "8xGPUTYPE"
+        df["GPU"] = df["GPU"].apply(
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+
+    # get markdown tables
+    latency_md_table = tabulate(latency_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    throughput_md_table = tabulate(throughput_results,
+                                   headers='keys',
+                                   tablefmt='pipe',
+                                   showindex=False)
+
+    # document the result
+    with open(results_folder / "benchmark_results.md", "w") as f:
+
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
+        results = results.format(
+            latency_tests_markdown_table=latency_md_table,
+            throughput_tests_markdown_table=throughput_md_table,
+            serving_tests_markdown_table=serving_md_table,
+            benchmarking_results_in_json_string=processed_results_json)
+        f.write(results)
+
+    # document benchmarking results in json
+    with open(results_folder / "benchmark_results.json", "w") as f:
+
+        results = latency_results.to_dict(
+            orient='records') + throughput_results.to_dict(
+                orient='records') + serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+
+from transformers import AutoTokenizer
+
+
+def main(model, cachedir):
+    """Fetch the tokenizer for ``model`` and save it under ``cachedir``.
+
+    ``model`` is passed to ``AutoTokenizer.from_pretrained`` and ``cachedir``
+    to ``save_pretrained``; a confirmation line is printed afterwards.
+    """
+    # Load the tokenizer and save it to the specified directory
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    tokenizer.save_pretrained(cachedir)
+    print(f"Tokenizer saved to {cachedir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Download and save Hugging Face tokenizer")
+    parser.add_argument("--model",
+                        type=str,
+                        required=True,
+                        help="Name of the model")
+    parser.add_argument("--cachedir",
+                        type=str,
+                        required=True,
+                        help="Directory to save the tokenizer")
+
+    args = parser.parse_args()
+    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    """Parse the command line and return the argparse namespace.
+
+    Both options are required strings:
+    ``--results-folder`` (folder holding the result files) and
+    ``--description`` (description of the results).
+    """
+    parser = argparse.ArgumentParser(
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
+
+    args = parser.parse_args()
+    return args
+
+
+def get_perf(df, method, model, metric):
+    """Return one ``metric`` value per QPS level for an engine/model pair.
+
+    For each QPS in (2, 4, 8, 16, "inf"), rows are selected whose
+    'Test name' contains ``model`` and "qps_<qps>" and whose 'Engine'
+    contains ``method``. The first matching row supplies the value; when no
+    row matches, 0.0 is used. The result is a numpy array of length 5.
+
+    NOTE: ``Series.str.contains`` treats its pattern as a regular expression
+    by default, so ``model`` and ``method`` are matched as regexes.
+    """
+
+    means = []
+
+    for qps in [2, 4, 8, 16, "inf"]:
+        # Build a boolean row mask from the three substring conditions
+        target = df['Test name'].str.contains(model)
+        target = target & df['Engine'].str.contains(method)
+        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        filtered_df = df[target]
+
+        if filtered_df.empty:
+            # No result for this QPS level: use 0 as a placeholder
+            means.append(0.)
+        else:
+            means.append(filtered_df[metric].values[0])
+
+    return np.array(means)
+
+
+def get_perf_w_std(df, method, model, metric):
+    """Return ``(mean, std)`` per QPS level for ``metric``.
+
+    ``metric`` must be "TTFT", "ITL" or "Tput" (anything else fails the
+    assert below).
+
+    - "TTFT" / "ITL": ``mean`` comes from the "Mean <metric> (ms)" column.
+      ``std`` is the "Std <metric> (ms)" column divided by the square root
+      of "Successful req." (standard error of the mean), or ``None`` when
+      the std values average to zero.
+    - "Tput": ``mean`` is input plus output token throughput and ``std`` is
+      ``None``.
+
+    ``mean`` is a list; ``std`` is a list or ``None``.
+    """
+
+    if metric in ["TTFT", "ITL"]:
+        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
+        mean = mean.tolist()
+        std = get_perf(df, method, model, "Std " + metric + " (ms)")
+        # get_perf fills missing entries with 0, so an all-zero std means
+        # no std data was found
+        if std.mean() == 0:
+            std = None
+        success = get_perf(df, method, model, "Successful req.")
+        if std is not None:
+            std = std / np.sqrt(success)
+            std = std.tolist()
+
+    else:
+        assert metric == "Tput"
+        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
+            df, method, model, "Output Tput (tok/s)")
+        mean = mean.tolist()
+        std = None
+
+    return mean, std
+
+
+def main(args):
+    """Write ``nightly_results.md`` from the collected nightly results.
+
+    Reads every ``*_nightly_results.json`` file in ``args.results_folder``
+    (each is expected to hold a JSON list, since the lists are concatenated),
+    renders the combined records as a pipe-format markdown table, and
+    substitutes it for the ``{nightly_results_benchmarking_table}``
+    placeholder in the template file ``args.description``. The output is
+    written to the current working directory.
+    """
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file) as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
+    with open(args.description) as f:
+        description = f.read()
+
+    description = description.format(
+        nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from lmdeploy.serve.openai.api_client import APIClient
+
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
--- a/.buildkite/performance-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh
@@ -181,14 +181,18 @@ launch_vllm_server() {
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  fi
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+
+main() {
+    # Collect the nightly benchmark results from buildkite, re-upload them
+    # (plus the benchmark scripts and pipeline) as artifacts, and append the
+    # annotation in nightly-annotation.md to the build.
+    # Requires VLLM_SOURCE_CODE_LOC and the agent at /workspace/buildkite-agent.
+
+    # Install missing tools. Always refresh the package index first: under
+    # `set -e`, an `apt-get install` against a missing index aborts the script.
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+    (which zip) || (apt-get update && apt-get install -y zip)
+
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip plotting the results."
+        exit 0
+    fi
+
+    # initial annotation
+    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+
+    # download results
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    mkdir -p results/
+    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
+    ls
+    ls results/
+
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"
+
+    # upload benchmarking scripts
+    cd "$VLLM_SOURCE_CODE_LOC/"
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+
+
+
+    # The figures should be generated by a separate process outside the CI/CD pipeline
+
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    #     --description $description \
+    #     --results-folder results/
+
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sharegpt
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_2048_128
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_128_2048
+
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -0,0 +1,462 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+  # Publish two global variables:
+  #   gpu_count - number of lines printed by `nvidia-smi --list-gpus`
+  #   gpu_type  - second word of each GPU name reported by nvidia-smi
+  # The script is terminated when no GPU is listed.
+  declare -g gpu_count
+  gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if ! [[ $gpu_count -gt 0 ]]; then
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  echo "GPU found."
+
+  declare -g gpu_type
+  gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # Terminate the script unless HF_TOKEN is non-empty and begins with 'hf_'.
+  # Only the prefix is inspected; the token is not verified against the Hub.
+  if [[ -z "$HF_TOKEN" ]]; then
+    echo "Error: HF_TOKEN is not set."
+    exit 1
+  fi
+  if [[ "$HF_TOKEN" != hf_* ]]; then
+    echo "Error: HF_TOKEN does not start with 'hf_'."
+    exit 1
+  fi
+  echo "HF_TOKEN is set and valid."
+}
+
+
+upload_to_buildkite() {
+  # Upload every file under $RESULTS_FOLDER as a Buildkite artifact.
+  # When /workspace/buildkite-agent does not exist the upload is skipped and
+  # the function returns 0.
+  local agent=/workspace/buildkite-agent
+  if [ -f "$agent" ]; then
+    # (annotation currently disabled)
+    # "$agent" annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+    "$agent" artifact upload "$RESULTS_FOLDER/*"
+    return
+  fi
+  echo "buildkite-agent binary not found. Skip uploading the results."
+  return 0
+}
+
+
+get_current_llm_serving_engine() {
+  # Work out which serving-engine container this script is running in by
+  # probing for a marker executable or path, and export the answer as
+  # CURRENT_LLM_SERVING_ENGINE (lmdeploy | tgi | trt | sglang | vllm).
+  # Probes are tried in the order below and the first hit wins. When none
+  # matches, the variable is left as it was.
+  if which lmdeploy >/dev/null; then
+    echo "Container: lmdeploy"
+    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+  elif [ -e /tgi-entrypoint.sh ]; then
+    echo "Container: tgi"
+    export CURRENT_LLM_SERVING_ENGINE=tgi
+  elif which trtllm-build >/dev/null; then
+    echo "Container: tensorrt-llm"
+    export CURRENT_LLM_SERVING_ENGINE=trt
+  elif [ -e /sgl-workspace ]; then
+    echo "Container: sglang"
+    export CURRENT_LLM_SERVING_ENGINE=sglang
+  elif [ -e /vllm-workspace ]; then
+    echo "Container: vllm"
+    export CURRENT_LLM_SERVING_ENGINE=vllm
+  fi
+}
+
+json2args() {
+  # Turn a flat JSON object into "--key value" command line arguments.
+  # Every '_' in a key becomes '-'; values are stringified as-is, so an
+  # empty-string value yields a bare flag ("--key ").
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args
+  args=$(
+    jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    ' <<<"$json_string"
+  )
+  echo "$args"
+}
+
+kill_gpu_processes() {
+  # Signal every process whose command line matches one of the serving
+  # engine process names, then block until the first GPU listed by
+  # nvidia-smi reports memory.used below 1000.
+  local pattern
+  for pattern in python python3 tritonserver pt_main_thread text-generation lmdeploy; do
+    pkill -f "$pattern"
+  done
+
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
+}
+
+wait_for_server() {
+  # Poll localhost:8000/v1/completions once per second until curl succeeds.
+  # Returns 0 as soon as the endpoint answers, 1 when it still has not
+  # answered after 1200 seconds.
+  if timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done'; then
+    return 0
+  fi
+  return 1
+}
+
+ensure_installed() {
+  # Make sure the command $1 is on PATH; when it is not, install the apt
+  # package of the same name.
+  local cmd=$1
+  if which "$cmd" >/dev/null; then
+    return 0
+  fi
+  apt-get update && apt-get install -y "$cmd"
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+  #
+  # Reads the globals CURRENT_LLM_SERVING_ENGINE, TEST_SELECTOR, gpu_count,
+  # gpu_type, RESULTS_FOLDER and VLLM_SOURCE_CODE_LOC.
+  # For every test case a server is launched (or the previous one reused) and
+  # the client is run once per entry of the case's qps_list. Each run leaves
+  # <test>_qps_<qps>.json (written by the client) and a matching .commands
+  # file in $RESULTS_FOLDER.
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  # NOTE: the loop body is the right-hand side of a pipeline and therefore
+  # runs in a subshell: variables set inside are not visible after `done`,
+  # and `exit` only leaves the loop, not the script.
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    # (TEST_SELECTOR is used as a regular expression)
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    # (the parameter blocks are looked up under "<engine>_server_parameters"
+    # and "<engine>_client_parameters")
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    # @sh shell-quotes each entry: numbers stay bare, strings such as "inf"
+    # come out wrapped in single quotes
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    # NOTE(review): wait_for_server polls a fixed port (8000) while the client
+    # below uses $port from common_parameters - confirm they always agree.
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      # gives up on all remaining test cases, not only on this one
+      break
+    fi
+
+    # prepare tokenizer
+    # this is required for lmdeploy.
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+
+
+    # change model name for lmdeploy (it will not follow standard hf name)
+    # NOTE(review): this is the only call that uses `python` instead of
+    # `python3` - confirm that `python` exists in the lmdeploy container.
+    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
+      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      # map the engine name to the backend name passed to benchmark_serving.py
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ $backend = "trt" ]]; then
+        backend="tensorrt-llm"
+      fi
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+
+      if [[ "$dataset_name" = "sharegpt" ]]; then
+
+        client_command="python3 benchmark_serving.py \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      elif [[ "$dataset_name" = "sonnet" ]]; then
+
+        # sonnet needs three extra length parameters from common_parameters
+        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
+        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
+        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
+
+        client_command="python3 benchmark_serving.py \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --sonnet-input-len $sonnet_input_len \
+          --sonnet-output-len $sonnet_output_len \
+          --sonnet-prefix-len $sonnet_prefix_len \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      else
+  
+        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
+        # ends the pipeline subshell only; see the note at the top of the loop
+        exit 1
+
+      fi
+
+        
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # the server is started by launch-server.sh, so its command line is not
+      # known here; a placeholder is recorded instead
+      server_command="None"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+  done
+
+  kill_gpu_processes
+}
+
+run_genai_perf_tests() {
+  # run genai-perf tests
+  # For every test case a server is launched (or the previous one reused) and
+  # `genai-perf profile` is run once per entry of the case's qps_list.
+  # Reads the globals CURRENT_LLM_SERVING_ENGINE, TEST_SELECTOR, gpu_count
+  # and VLLM_SOURCE_CODE_LOC.
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  # NOTE: the loop body is the right-hand side of a pipeline and therefore
+  # runs in a subshell; variables set inside are not visible after `done`.
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')    
+    
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+    
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    # NOTE(review): dataset_name and dataset_path are read but not used by
+    # the client command below.
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    # @sh shell-quotes each entry: numbers stay bare, strings such as "inf"
+    # come out wrapped in single quotes
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      # gives up on all remaining test cases, not only on this one
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # "inf" is replaced by num_prompts, i.e. the request rate is set to the
+      # total number of prompts
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+    
+      new_test_name=$test_name"_qps_"$qps
+      # NOTE(review): new_test_name and backend are computed but the command
+      # below hard-codes "--backend vllm" and does not use either of them.
+      backend=$CURRENT_LLM_SERVING_ENGINE
+      
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend vllm \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+      "
+
+    # (still inside the per-qps loop despite the shallower indentation)
+    echo "Client command: $client_command"
+
+    eval "$client_command"
+
+    #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}
+
+prepare_dataset() {
+  # Prepare the benchmark datasets inside $VLLM_SOURCE_CODE_LOC/benchmarks:
+  #   - ShareGPT_V3_unfiltered_cleaned_split.json, downloaded when missing
+  #   - sonnet_4x.txt, i.e. sonnet.txt repeated 4 times, to allow
+  #     benchmarking with input length 2048
+  local sharegpt_file=ShareGPT_V3_unfiltered_cleaned_split.json
+
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+
+  # download sharegpt dataset
+  # Reuse an existing copy: an unconditional wget would download the file
+  # again on every run and store the duplicate under a ".1" suffix.
+  if [ ! -f "$sharegpt_file" ]; then
+    wget "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$sharegpt_file"
+  else
+    echo "$sharegpt_file already exists."
+  fi
+
+  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
+  # The file deliberately keeps starting with one empty line so that the
+  # generated dataset stays identical to the one used by earlier runs.
+  echo "" > sonnet_4x.txt
+  for _ in {1..4}
+  do
+    cat sonnet.txt >> sonnet_4x.txt
+  done
+
+}
+
+main() {
+  # Entry point of the nightly benchmark: validate the environment, install
+  # dependencies, prepare the datasets, run the serving and genai-perf test
+  # suites, then summarize and upload the results.
+  # Requires VLLM_SOURCE_CODE_LOC and HF_TOKEN in the environment.
+
+  # sanity-check the environment (GPUs, HF_TOKEN) and detect which serving
+  # engine container we are running in
+
+  check_gpus
+  check_hf_token
+  get_current_llm_serving_engine
+
+  pip install -U transformers
+
+  # NOTE(review): relative path - resolved against the directory the script
+  # is started from; confirm that this is the repository root.
+  pip install -r requirements/dev.txt
+  # print the location of genai-perf (diagnostic only, result is not checked)
+  which genai-perf
+
+  # check storage
+  df -h
+
+  ensure_installed wget
+  ensure_installed curl
+  ensure_installed jq
+  # genai-perf dependency
+  # NOTE(review): libb64-0d looks like a library package rather than a
+  # command, in which case the `which` probe inside ensure_installed never
+  # succeeds and apt-get runs on every invocation.
+  ensure_installed libb64-0d
+
+  prepare_dataset
+
+  # everything below runs from the benchmarks directory; RESULTS_FOLDER is
+  # relative to it
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+
+  # run the test
+  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
+
+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  # presumably artifacts/ is where genai-perf writes its output - TODO confirm
+  mv artifacts/ $RESULTS_FOLDER/
+
+  # upload benchmark results to buildkite
+  python3 -m pip install tabulate pandas
+  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -0,0 +1,385 @@
+#!/bin/bash
+
+# This script should be run inside the CI process
+# This script assumes that we are already inside the vllm/ directory
+# Benchmarking results will be available inside vllm/benchmarks/results/
+
+# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
+# and we still want to see other benchmarking results even when mixtral crashes.
+set -x
+set -o pipefail
+
+check_gpus() {
+  # Publish two global variables:
+  #   gpu_count - number of lines printed by `nvidia-smi --list-gpus`
+  #   gpu_type  - second word of each GPU name reported by nvidia-smi
+  # The script is terminated when no GPU is listed.
+  declare -g gpu_count
+  gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if ! [[ $gpu_count -gt 0 ]]; then
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  echo "GPU found."
+
+  declare -g gpu_type
+  gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # Terminate the script unless HF_TOKEN is non-empty and begins with 'hf_'.
+  # Only the prefix is inspected; the token is not verified against the Hub.
+  if [[ -z "$HF_TOKEN" ]]; then
+    echo "Error: HF_TOKEN is not set."
+    exit 1
+  fi
+  if [[ "$HF_TOKEN" != hf_* ]]; then
+    echo "Error: HF_TOKEN does not start with 'hf_'."
+    exit 1
+  fi
+  echo "HF_TOKEN is set and valid."
+}
+
+ensure_sharegpt_downloaded() {
+  # Fetch the ShareGPT dataset into the current directory unless a file with
+  # that name is already there.
+  local dataset_file=ShareGPT_V3_unfiltered_cleaned_split.json
+  if [ -f "$dataset_file" ]; then
+    echo "$dataset_file already exists."
+    return
+  fi
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$dataset_file
+}
+
+json2args() {
+  # Turn a flat JSON object into "--key value" command line arguments.
+  # Every '_' in a key becomes '-'; values are stringified as-is, so an
+  # empty-string value yields a bare flag ("--key ").
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args
+  args=$(
+    jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    ' <<<"$json_string"
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # POST to localhost:8000/v1/completions once per second until curl
+  # succeeds. Returns 0 as soon as the endpoint answers, 1 when it still has
+  # not answered after 1200 seconds.
+  if timeout 1200 bash -c '
+    until curl -X POST localhost:8000/v1/completions; do
+      sleep 1
+    done'; then
+    return 0
+  fi
+  return 1
+}
+
+kill_processes_launched_by_current_bash() {
+  # SIGKILL the direct children of this script's shell whose command (first
+  # word of the command line) matches the regular expression $1.
+  current_shell_pid=$$
+  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+  if [ -z "$processes" ]; then
+    echo "No processes found matching '$1'."
+    return
+  fi
+
+  echo "Killing the following processes matching '$1':"
+  echo "$processes"
+  echo "$processes" | xargs kill -9
+}
+
+kill_gpu_processes() {
+  # Stop whatever may still hold the GPU or the server port, wait for the GPU
+  # memory to be released, and drop the cached vllm config.
+
+  # log the process table (diagnostic only)
+  ps -aux
+  # kill whatever is listening on port 8000, then every python3 process;
+  # `xargs -r` does nothing when no PID is found
+  lsof -t -i:8000 | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
+
+
+  # wait until the first GPU listed by nvidia-smi reports memory.used below 1000
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+}
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+  # Annotates the build with $RESULTS_FOLDER/benchmark_results.md and uploads
+  # everything under $RESULTS_FOLDER as artifacts.
+
+  # if the agent binary is not found, skip uploading the results and return 0
+  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    BUILDKITE_AGENT_COMMAND="buildkite-agent"
+  elif [ -f /workspace/buildkite-agent ]; then
+    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+  else
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+
+  # Use the determined command to annotate and upload artifacts
+  # (the annotation context is keyed on $BUILDKITE_LABEL)
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
+}
+
+run_latency_tests() {
+  # run latency tests using `benchmark_latency.py`
+  # $1: a json file specifying latency test cases
+  #
+  # Reads the globals TEST_SELECTOR, gpu_count, gpu_type and RESULTS_FOLDER.
+  # Each test case leaves <test_name>.commands in $RESULTS_FOLDER and passes
+  # $RESULTS_FOLDER/<test_name>.json to the benchmark as --output-json.
+
+  local latency_test_file
+  latency_test_file=$1
+
+  # Iterate over latency tests
+  # NOTE: the loop body is the right-hand side of a pipeline and therefore
+  # runs in a subshell, so `exit 1` below ends this loop but not the script.
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
+    # get the test name and enforce the "latency_" prefix
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In latency-test.json, test_name must start with \"latency_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    latency_command="python3 benchmark_latency.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $latency_args"
+
+    echo "Running test case $test_name"
+    echo "Latency command: $latency_command"
+
+    # record the benchmarking command and the GPU type
+    jq_output=$(jq -n \
+      --arg latency "$latency_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        latency_command: $latency,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$latency_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_throughput_tests() {
+  # run throughput tests using `benchmark_throughput.py`
+  # $1: a json file specifying throughput test cases
+  #
+  # Reads the globals TEST_SELECTOR, gpu_count, gpu_type and RESULTS_FOLDER.
+  # Each test case leaves <test_name>.commands in $RESULTS_FOLDER and passes
+  # $RESULTS_FOLDER/<test_name>.json to the benchmark as --output-json.
+
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  # NOTE: the loop body is the right-hand side of a pipeline and therefore
+  # runs in a subshell, so `exit 1` below ends this loop but not the script.
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name and enforce the "throughput_" prefix
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In throughput-test.json, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    throughput_command="python3 benchmark_throughput.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+    # record the benchmarking command and the GPU type
+    jq_output=$(jq -n \
+      --arg command "$throughput_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        throughput_command: $command,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+  #
+  # For every test case: start a vLLM OpenAI-compatible server with the
+  # case's server_parameters, run the client once per entry of qps_list,
+  # store <test>_qps_<qps>.json / .commands under $RESULTS_FOLDER, then stop
+  # the server. A case whose server does not come up is skipped.
+  # Reads the globals TEST_SELECTOR, gpu_count, gpu_type and RESULTS_FOLDER.
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  # NOTE: the loop body is the right-hand side of a pipeline and therefore
+  # runs in a subshell, so `exit 1` below ends this loop but not the script.
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name and enforce the "serving_" prefix
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^serving_ ]]; then
+      echo "In serving-test.json, test_name must start with \"serving_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.server_parameters')
+    client_params=$(echo "$params" | jq -r '.client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    # @sh shell-quotes each entry: numbers stay bare, strings such as "inf"
+    # come out wrapped in single quotes
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    # check if server model and client model is aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
+    client_model=$(echo "$client_params" | jq -r '.model')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip testcase $test_name."
+      continue
+    fi
+
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
+      $server_args"
+
+    # run the server in the background
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+    server_pid=$!
+
+    # wait until the server is alive
+    if wait_for_server; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+      # Benchmarking against a server that never came up can only produce
+      # failures: clean up and move on to the next test case.
+      kill -9 $server_pid
+      kill_gpu_processes
+      continue
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      # pass the tensor parallel size to the client so that it can be displayed
+      # on the benchmark dashboard
+      # (no nested double quotes around the metadata value: they would end the
+      # surrounding string instead of quoting the argument)
+      client_command="python3 benchmark_serving.py \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        --metadata tensor_parallel_size=$tp \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      bash -c "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill -9 $server_pid
+    kill_gpu_processes
+  done
+}
+
+main() {
+  # Entry point of the performance benchmark: validate the environment,
+  # install missing tools, run the serving / latency / throughput suites from
+  # the benchmarks/ directory, convert the results to markdown and upload
+  # them.
+  check_gpus
+  check_hf_token
+
+  # Set to v1 to run v1 benchmark
+  # (ENGINE_VERSION defaults to v0 when unset)
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
+  # dependencies
+  # (apt-get is only invoked when one of the tools is missing)
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
+
+  # get the current IP address, required by benchmark_serving.py
+  # (first address printed by `hostname -I`)
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+  # turn off the reporting of the status of each request, to clean up the terminal output
+  export VLLM_LOG_LEVEL="WARNING"
+
+  # prepare for benchmarking
+  # (all paths below are relative to benchmarks/)
+  cd benchmarks || exit 1
+  ensure_sharegpt_downloaded
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  # benchmarking
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
+
+  # postprocess benchmarking results
+  pip install tabulate pandas
+  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+
+  upload_to_buildkite
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import datetime
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+# Folder (relative to the current working directory) that holds the
+# per-test "<name>.json" results and their "<name>.commands" companions.
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+# (result key -> column header)
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "Successful req.",
+    "request_throughput": "Tput (req/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "std_ttft_ms": "Std TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "std_itl_ms": "Std ITL (ms)",
+    "median_itl_ms": "Median ITL (ms)",
+    "mean_tpot_ms": "Mean TPOT (ms)",
+    "std_tpot_ms": "Std TPOT (ms)",
+    "median_tpot_ms": "Median TPOT (ms)",
+    "total_token_throughput": "Total Token Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
+    "engine": "Engine",
+}
+
+if __name__ == "__main__":
+
+    # collect results (sorted, so that the row order is reproducible)
+    for test_file in sorted(results_folder.glob("*.json")):
+
+        # Only benchmark outputs have a sibling ".commands" file. Any other
+        # JSON in the folder (e.g. a "*_nightly_results.json" summary written
+        # by an earlier run of this script) is skipped instead of aborting
+        # the whole summary.
+        commands_file = test_file.with_suffix(".commands")
+        if not commands_file.is_file():
+            continue
+
+        with open(test_file) as f:
+            raw_result = json.load(f)
+
+        # attach the benchmarking command to raw_result
+        with open(commands_file) as f:
+            raw_result.update(json.load(f))
+
+        # update the test name of this result
+        raw_result["test_name"] = test_file.stem
+
+        # add the result to serving_results
+        serving_results.append(raw_result)
+
+    serving_results = pd.DataFrame.from_dict(serving_results)
+
+    # keep only the documented columns, under their display names
+    if not serving_results.empty:
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+
+    serving_md_table_with_headers = tabulate(serving_results,
+                                             headers='keys',
+                                             tablefmt='pipe',
+                                             showindex=False)
+
+    # Output files are named "<timestamp>_<engine>_nightly_results.*".
+    # Fall back to "unknown" when the engine is not exported: concatenating
+    # the None returned by a plain .get() would raise TypeError.
+    engine = os.environ.get("CURRENT_LLM_SERVING_ENGINE", "unknown")
+    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    prefix = prefix + "_" + engine
+
+    # document benchmarking results in markdown
+    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+        # document results with header.
+        # for those who wants to reproduce our benchmark.
+        f.write(serving_md_table_with_headers)
+        f.write('\n')
+
+    # document benchmarking results in json
+    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+
+        results = serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Wait until the CI image for $BUILDKITE_COMMIT is available on public ECR.
+# Exits 0 as soon as the image manifest URL answers HTTP 200, and 1 when it
+# still does not after 1000 attempts (5 seconds apart).
+
+# NOTE(review): the token is requested with the postmerge-repo scope only,
+# yet it is also sent to the test repo below - confirm that this is enough.
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+# POSIX `[ ... = ... ]` on purpose: this script runs under /bin/sh, where the
+# bash-only `[[ ... == ... ]]` may be missing, which would make the test fail
+# and always select the test-repo URL.
+if [ "$BUILDKITE_BRANCH" = "main" ]; then
+    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+else
+    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+fi
+
+# per-request timeout passed to curl
+TIMEOUT_SECONDS=10
+
+retries=0
+while [ $retries -lt 1000 ]; do
+    # only the HTTP status code is inspected; the response body is discarded
+    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
+        exit 0
+    fi
+
+    echo "Waiting for image to be available..."
+
+    retries=$((retries + 1))
+    sleep 5
+done
+
+exit 1
--- a/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
@@ -11,7 +11,9 @@
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
--- a/.buildkite/performance-benchmarks/tests/latency-tests.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests.json
--- a/.buildkite/performance-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/performance-benchmarks/tests/nightly-tests.json
@@ -35,7 +35,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@@ -88,7 +90,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@@ -141,7 +145,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@@ -191,7 +197,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@@ -243,7 +251,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@@ -295,7 +305,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -7,6 +7,7 @@
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@@ -25,6 +26,7 @@
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@@ -43,6 +45,7 @@
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
+            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@@ -57,14 +60,13 @@
        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
        "qps_list": [2],
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "speculative_config": {
-                "model": "turboderp/Qwama-0.5B-Instruct",
-                "num_speculative_tokens": 4,
-                "draft_tensor_parallel_size": 1
-            }
+            "swap_space": 16, 
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
--- a/.buildkite/performance-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests.json
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -1,181 +0,0 @@
-# vLLM benchmark suite
-
-## Introduction
-
-This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vLLM's performance.
-vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
-
-## Performance benchmark quick overview
-
-**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
-
-**Benchmarking Duration**: about 1hr.
-
-**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
-
-## Trigger the benchmark
-
-The benchmark needs to be triggered manually:
-
-```bash
-bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
-```
-
-Runtime environment variables:
-
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
-
-## Performance benchmark details
-
-See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
-> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
-> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
-
-### Latency test
-
-Here is an example of one test inside `latency-tests.json`:
-
-```json
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
-```
-
-In this example:
-
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command line arguments to be used for `vllm bench latency`. Note: please use underscores `_` instead of dashes `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
-
-Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
-
-WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
-
-### Throughput test
-
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `vllm bench throughput`.
-
-The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
-
-### Serving test
-
-We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
-
-```json
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    }
-]
-```
-
-Inside this example:
-
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server_parameters` attribute includes the command line arguments for the vLLM server.
- The `client_parameters` attribute includes the command line arguments for `vllm bench serve`.
- The `qps_list` attribute controls the list of QPS values to test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`.
-
-The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
-
-WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
-
-#### Default Parameters Field
-
-We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
-
-<details>
-<summary> An Example of default parameters field </summary>
-
-```json
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "server_environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
-    },
-    "server_parameters": {
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "block_size": 128,
-      "disable_log_stats": "",
-      "load_format": "dummy"
-    },
-    "client_parameters": {
-      "backend": "vllm",
-      "dataset_name": "random",
-      "random-input-len": 128,
-      "random-output-len": 128,
-      "num_prompts": 200,
-      "ignore-eos": ""
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama3B_tp2_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct"
-      }
-    },
-    {
-      "test_name": "serving_qwen3_tp4_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-14B",
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-14B"
-      }
-    }
-  ]
-}
-```
-
-</details>
-
-### Visualizing the results
-
-The `convert-results-json-to-markdown.py` script helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
-You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
-If you do not see the table, please wait until the benchmark finishes running.
-The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
-The raw benchmarking results (in the format of JSON files) are in the `Artifacts` tab of the benchmarking job.
-
-#### Performance Results Comparison  
-
-Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -1,825 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
-import argparse
-import html as _html
-import json
-import os
-from dataclasses import dataclass
-from importlib import util
-
-import pandas as pd
-
-pd.options.display.float_format = "{:.2f}".format
-plotly_found = util.find_spec("plotly.express") is not None
-
-DEFAULT_INFO_COLS = [
-    "Model",
-    "Dataset Name",
-    "Input Len",
-    "Output Len",
-    #    "TP Size",
-    #    "PP Size",
-    "# of max concurrency.",
-    "qps",
-]
-
-# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
-pd.set_option("display.precision", 2)
-pd.set_option("display.float_format", lambda x: f"{x:.2f}")
-
-
-# -----------------------------
-# Core data compare
-# -----------------------------
-def compare_data_columns(
-    files: list[str],
-    name_column: str,
-    data_column: str,
-    info_cols: list[str],
-    drop_column: str,
-    debug: bool = False,
-):
-    """
-    Align concatenation by keys derived from info_cols instead of row order.
-    - Pick one canonical key list: subset of info_cols present in ALL files.
-    - For each file: set index to those keys, aggregate duplicates
-      (mean for metric, first for names).
-    - Concat along axis=1 (indexes align), then reset_index so callers can
-      group by columns.
-    - If --debug, add a <file_label>_name column per file.
-    """
-    print("\ncompare_data_column:", data_column)
-
-    frames = []
-    raw_data_cols: list[str] = []
-    compare_frames = []
-
-    cols_per_file: list[set] = []
-    for f in files:
-        try:
-            df_tmp = pd.read_json(f, orient="records")
-        except Exception as err:
-            raise ValueError(f"Failed to read {f}") from err
-        cols_per_file.append(set(df_tmp.columns))
-
-    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
-    if not key_cols:
-        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
-    if not key_cols:
-        raise ValueError(
-            "No common key columns found from info_cols across the input files."
-        )
-
-    meta_added = False
-
-    for file in files:
-        df = pd.read_json(file, orient="records")
-
-        if drop_column in df.columns:
-            df = df.dropna(subset=[drop_column], ignore_index=True)
-
-        for c in (
-            "Input Len",
-            "Output Len",
-            "TP Size",
-            "PP Size",
-            "# of max concurrency.",
-            "qps",
-        ):
-            if c in df.columns:
-                df[c] = pd.to_numeric(df[c], errors="coerce")
-
-        for c in key_cols:
-            if c not in df.columns:
-                df[c] = pd.NA
-
-        df_idx = df.set_index(key_cols, drop=False)
-
-        meta = df_idx[key_cols]
-        if not meta.index.is_unique:
-            meta = meta.groupby(level=key_cols, dropna=False).first()
-
-        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
-        if not s.index.is_unique:
-            s = s.groupby(level=key_cols, dropna=False).mean()
-        s.name = file_label
-
-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
-
-        if debug and name_column in df_idx.columns:
-            name_s = df_idx[name_column]
-            if not name_s.index.is_unique:
-                name_s = name_s.groupby(level=key_cols, dropna=False).first()
-            name_s.name = f"{file_label}_name"
-            frames.append(name_s)
-
-        frames.append(s)
-        raw_data_cols.append(file_label)
-        compare_frames.append(s)
-
-        if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            if "P99" in data_column or "Median" in data_column:
-                ratio = base / current
-            else:
-                ratio = current / base
-            ratio = ratio.mask(base == 0)
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
-            frames.append(ratio)
-
-    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
-
-    front = [c for c in info_cols if c in concat_df.columns]
-    rest = [c for c in concat_df.columns if c not in front]
-    concat_df = concat_df[front + rest]
-
-    print(raw_data_cols)
-    return concat_df, raw_data_cols
-
-
-# -----------------------------
-# Split helper
-# -----------------------------
-def split_json_by_tp_pp(
-    input_file: str = "benchmark_results.json", output_root: str = "."
-) -> list[str]:
-    with open(input_file, encoding="utf-8") as f:
-        data = json.load(f)
-
-    if isinstance(data, dict):
-        for key in ("results", "serving_results", "benchmarks", "data"):
-            if isinstance(data.get(key), list):
-                data = data[key]
-                break
-
-    df = pd.DataFrame(data)
-
-    name_col = next(
-        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
-    )
-    if name_col:
-        df = df[
-            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
-        ].copy()
-
-    rename_map = {
-        "tp_size": "TP Size",
-        "tensor_parallel_size": "TP Size",
-        "pp_size": "PP Size",
-        "pipeline_parallel_size": "PP Size",
-    }
-    df.rename(
-        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
-    )
-
-    if "TP Size" not in df.columns:
-        df["TP Size"] = 1
-    if "PP Size" not in df.columns:
-        df["PP Size"] = 1
-
-    df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
-    df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
-
-    saved_paths: list[str] = []
-    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
-        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
-        os.makedirs(folder_name, exist_ok=True)
-        filepath = os.path.join(folder_name, "benchmark_results.json")
-        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
-        print(f"Saved: {filepath}")
-        saved_paths.append(filepath)
-
-    return saved_paths
-
-
-# -----------------------------
-# Styling helpers
-# -----------------------------
-def _find_concurrency_col(df: pd.DataFrame) -> str:
-    for c in [
-        "# of max concurrency.",
-        "# of max concurrency",
-        "Max Concurrency",
-        "max_concurrency",
-        "Concurrency",
-    ]:
-        if c in df.columns:
-            return c
-    for c in df.columns:
-        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
-            return c
-    return "# of max concurrency."
-
-
-def _highlight_threshold(
-    df: pd.DataFrame, threshold: float
-) -> pd.io.formats.style.Styler:
-    conc_col = _find_concurrency_col(df)
-    key_cols = [
-        c
-        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
-        if c in df.columns
-    ]
-    conf_cols = [
-        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
-    ]
-    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
-
-    return df.style.map(
-        lambda v: "background-color:#e6ffe6;font-weight:bold;"
-        if pd.notna(v) and v <= threshold
-        else "",
-        subset=conf_cols,
-    )
-
-
-def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
-    ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
-    if not ratio_cols:
-        return styler
-
-    styler = styler.apply(
-        lambda _: ["background-color: #fff3b0"] * len(styler.data),
-        subset=ratio_cols,
-        axis=0,
-    )
-
-    styler = styler.set_table_styles(
-        [
-            {
-                "selector": f"th.col_heading.level0.col{i}",
-                "props": [("background-color", "#fff3b0")],
-            }
-            for i, col in enumerate(styler.data.columns)
-            if col in ratio_cols
-        ],
-        overwrite=False,
-    )
-    return styler
-
-
-def _apply_two_decimals(
-    styler: pd.io.formats.style.Styler,
-) -> pd.io.formats.style.Styler:
-    df = styler.data
-    num_cols = df.select_dtypes("number").columns
-    if len(num_cols) == 0:
-        return styler
-    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
-
-
-# -----------------------------
-# Valid max concurrency summary helpers
-# -----------------------------
-def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
-    key_cols = [
-        c
-        for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
-        if c in df.columns
-    ]
-    exclude = set(key_cols + [conc_col, "qps", "QPS"])
-
-    cols: list[str] = []
-    for c in df.columns:
-        if c in exclude:
-            continue
-        lc = str(c).lower()
-        if lc.startswith("ratio"):
-            continue
-        if lc.endswith("_name") or lc == "test name" or lc == "test_name":
-            continue
-        if pd.api.types.is_numeric_dtype(df[c]):
-            cols.append(c)
-    return cols
-
-
-def _max_concurrency_ok(
-    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
-):
-    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
-        return pd.NA
-
-    d = df[[conc_col, cfg_col]].copy()
-    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
-    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
-    d = d.dropna(subset=[conc_col, cfg_col])
-
-    if d.empty:
-        return pd.NA
-
-    ok = d[d[cfg_col] <= threshold]
-    if ok.empty:
-        return pd.NA
-
-    return ok[conc_col].max()
-
-
-def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
-    if (
-        df is None
-        or conc_col not in df.columns
-        or cfg_col not in df.columns
-        or pd.isna(conc_value)
-    ):
-        return pd.NA
-
-    d = df[[conc_col, cfg_col]].copy()
-    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
-    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
-
-    conc_value = pd.to_numeric(conc_value, errors="coerce")
-    if pd.isna(conc_value):
-        return pd.NA
-
-    hit = d[d[conc_col] == conc_value]
-    if hit.empty:
-        return pd.NA
-    return hit[cfg_col].iloc[0]
-
-
-def build_valid_max_concurrency_summary_html(
-    tput_group_df: pd.DataFrame | None,
-    ttft_group_df: pd.DataFrame | None,
-    tpot_group_df: pd.DataFrame | None,
-    conc_col: str,
-    args,
-) -> str:
-    if ttft_group_df is None and tpot_group_df is None:
-        return ""
-
-    ttft_cols = (
-        _config_value_columns(ttft_group_df, conc_col)
-        if ttft_group_df is not None
-        else []
-    )
-    tpot_cols = (
-        _config_value_columns(tpot_group_df, conc_col)
-        if tpot_group_df is not None
-        else []
-    )
-    tput_cols = (
-        _config_value_columns(tput_group_df, conc_col)
-        if tput_group_df is not None
-        else []
-    )
-
-    if ttft_group_df is not None and tpot_group_df is not None:
-        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
-        if tput_group_df is not None:
-            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
-    else:
-        cfg_cols = ttft_cols or tpot_cols
-
-    if not cfg_cols:
-        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
-
-    rows = []
-    for cfg in cfg_cols:
-        ttft_max = (
-            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
-            if ttft_group_df is not None
-            else pd.NA
-        )
-        tpot_max = (
-            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
-            if tpot_group_df is not None
-            else pd.NA
-        )
-        both = (
-            pd.NA
-            if (pd.isna(ttft_max) or pd.isna(tpot_max))
-            else min(ttft_max, tpot_max)
-        )
-
-        tput_at_both = (
-            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
-            if tput_group_df is not None
-            else pd.NA
-        )
-        ttft_at_both = (
-            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
-            if ttft_group_df is not None
-            else pd.NA
-        )
-        tpot_at_both = (
-            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
-            if tpot_group_df is not None
-            else pd.NA
-        )
-
-        rows.append(
-            {
-                "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
-                f"Max {conc_col} (Both)": both,
-                "Output Tput @ Both (tok/s)": tput_at_both,
-                "TTFT @ Both (ms)": ttft_at_both,
-                "TPOT @ Both (ms)": tpot_at_both,
-            }
-        )
-
-    summary_df = pd.DataFrame(rows)
-
-    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
-    for c in summary_df.columns:
-        if c == "Configuration":
-            continue
-        summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
-
-    both_col = f"Max {conc_col} (Both)"
-
-    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
-    formatters = {}
-    for c in summary_df.columns:
-        if c == "Configuration":
-            continue
-        # default argument binds per-column formatter correctly
-        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
-
-    styler = summary_df.style.format(formatters)
-
-    def _green(v):
-        return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
-
-    if both_col in summary_df.columns:
-        styler = styler.map(_green, subset=[both_col])
-
-    title = (
-        '<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
-        "Valid Max Concurrency Summary"
-        "</div>\n"
-    )
-    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
-
-
-# -----------------------------
-# Plot helper
-# -----------------------------
-def _add_limit_line(fig, y_value: float, label: str):
-    fig.add_hline(
-        y=y_value,
-        line_dash="dash",
-        line_color="red" if "ttft" in label.lower() else "blue",
-        annotation_text=f"{label}: {y_value} ms",
-        annotation_position="top left",
-    )
-    if plotly_found:
-        import plotly.graph_objects as go
-
-        fig.add_trace(
-            go.Scatter(
-                x=[None],
-                y=[None],
-                mode="lines",
-                line=dict(
-                    dash="dash",
-                    color="red" if "ttft" in label.lower() else "blue",
-                ),
-                name=label,
-            )
-        )
-
-
-# -----------------------------
-# Refactored main + group-first report
-# -----------------------------
-@dataclass(frozen=True)
-class MetricPlan:
-    data_cols: list[str]
-    drop_column: str
-
-
-def build_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-f", "--file", action="append", type=str, help="input file name"
-    )
-    parser.add_argument(
-        "--debug", action="store_true", help="show all information for debugging"
-    )
-    parser.add_argument(
-        "--plot",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help="plot perf diagrams or not --no-plot --plot",
-    )
-    parser.add_argument(
-        "-x",
-        "--xaxis",
-        type=str,
-        default="# of max concurrency.",
-        help="column name to use as X Axis in comparison graph",
-    )
-    parser.add_argument(
-        "-l",
-        "--latency",
-        type=str,
-        default="p99",
-        help="take median|p99 for latency like TTFT/TPOT",
-    )
-    parser.add_argument(
-        "--ttft-max-ms",
-        type=float,
-        default=3000.0,
-        help="Reference limit for TTFT plots (ms)",
-    )
-    parser.add_argument(
-        "--tpot-max-ms",
-        type=float,
-        default=100.0,
-        help="Reference limit for TPOT plots (ms)",
-    )
-    return parser
-
-
-def choose_metrics(latency: str) -> MetricPlan:
-    latency = (latency or "").lower()
-    drop_column = "P99"
-
-    if "median" in latency:
-        return MetricPlan(
-            data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
-            drop_column=drop_column,
-        )
-
-    return MetricPlan(
-        data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
-        drop_column=drop_column,
-    )
-
-
-def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
-    if not args.file:
-        raise ValueError("No input files provided. Use -f/--file.")
-
-    if len(args.file) == 1:
-        files = split_json_by_tp_pp(args.file[0], output_root="splits")
-        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
-    else:
-        files = args.file
-
-    return files, info_cols
-
-
-def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
-    y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
-    return info_cols[y_axis_index]
-
-
-def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
-    filtered_info_cols = info_cols[:4]
-    group_cols = [c for c in filtered_info_cols if c in output_df.columns]
-    if not group_cols:
-        raise ValueError(
-            f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
-            f"but DataFrame has: {list(output_df.columns)}"
-        )
-    return group_cols
-
-
-def normalize_group_key(name):
-    return name if isinstance(name, tuple) else (name,)
-
-
-def group_filename(name, prefix: str = "perf_comparison_") -> str:
-    name_vals = normalize_group_key(name)
-    safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
-    return f"{prefix}{safe}.html"
-
-
-def build_group_suffix(group_cols: list[str], name) -> str:
-    name_vals = normalize_group_key(name)
-    return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
-
-
-def render_metric_table_html(
-    display_group: pd.DataFrame,
-    metric_label: str,
-    group_suffix: str,
-    args,
-) -> str:
-    title = (
-        f'<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
-        f"{_html.escape(metric_label)}"
-        f" — {_html.escape(group_suffix)}"
-        f"</div>\n"
-    )
-
-    metric_name = metric_label.lower()
-    if "ttft" in metric_name:
-        styler = _highlight_threshold(display_group, args.ttft_max_ms)
-    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
-        styler = _highlight_threshold(display_group, args.tpot_max_ms)
-    else:
-        styler = display_group.style
-
-    styler = _apply_two_decimals(styler)
-    styler = highlight_ratio_columns(styler)
-
-    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
-
-
-def maybe_write_plot(
-    main_fh,
-    sub_fh,
-    group_df: pd.DataFrame,
-    raw_data_cols: list[str],
-    metric_label: str,
-    y_axis_col: str,
-    args,
-):
-    if not (args.plot and plotly_found):
-        return
-
-    import plotly.express as px
-
-    df = group_df[raw_data_cols].sort_values(by=y_axis_col)
-    df_melted = df.melt(
-        id_vars=y_axis_col,
-        var_name="Configuration",
-        value_name=metric_label,
-    )
-
-    fig = px.line(
-        df_melted,
-        x=y_axis_col,
-        y=metric_label,
-        color="Configuration",
-        title=f"{metric_label} vs {y_axis_col}",
-        markers=True,
-    )
-
-    # Ensure plot hover + y tick labels are also 2 decimals.
-    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
-    fig.update_yaxes(tickformat=".2f")
-
-    metric_name = metric_label.lower()
-    if "ttft" in metric_name:
-        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
-    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
-        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
-
-    html = fig.to_html(full_html=True, include_plotlyjs="cdn")
-    main_fh.write(html)
-    sub_fh.write(html)
-
-
-def build_group_keys(
-    df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
-):
-    if sort_cols:
-        df = df.sort_values(by=sort_cols)
-    gb = df.groupby(group_cols, dropna=False)
-    return [k for k, _ in gb]
-
-
-def write_report_group_first(
-    files: list[str], info_cols: list[str], plan: MetricPlan, args
-):
-    name_column = "Test name"
-    y_axis_col = get_y_axis_col(info_cols, args.xaxis)
-
-    print("comparing : " + ", ".join(files))
-
-    metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
-    group_cols_canonical: list[str] | None = None
-
-    for metric_label in plan.data_cols:
-        output_df, raw_data_cols = compare_data_columns(
-            files,
-            name_column,
-            metric_label,
-            info_cols,
-            plan.drop_column,
-            debug=args.debug,
-        )
-
-        raw_data_cols = list(raw_data_cols)
-        raw_data_cols.insert(0, y_axis_col)
-
-        group_cols = get_group_cols(output_df, info_cols)
-        if group_cols_canonical is None:
-            group_cols_canonical = group_cols
-        else:
-            group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
-
-        metric_cache[metric_label] = (
-            output_df.sort_values(by=args.xaxis),
-            raw_data_cols,
-        )
-
-    if not group_cols_canonical:
-        raise ValueError("No canonical group columns found across metrics.")
-
-    first_metric = plan.data_cols[0]
-    first_df_sorted, _ = metric_cache[first_metric]
-    group_keys = build_group_keys(
-        first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
-    )
-
-    metric_groupbys = {
-        metric_label: df.groupby(group_cols_canonical, dropna=False)
-        for metric_label, (df, _) in metric_cache.items()
-    }
-
-    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
-        main_fh.write('<meta charset="utf-8">\n')
-        for gkey in group_keys:
-            gkey_tuple = normalize_group_key(gkey)
-            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
-            sub_path = group_filename(gkey_tuple)
-            group_header = (
-                '<div style="font-size: 1.4em; font-weight: 700; '
-                'margin: 18px 0 10px 0;">'
-                f"{_html.escape(suffix)}"
-                "</div>\n"
-            )
-
-            main_fh.write(group_header)
-            with open(sub_path, "w", encoding="utf-8") as sub_fh:
-                sub_fh.write('<meta charset="utf-8">\n')
-                sub_fh.write(group_header)
-                tput_group_df = None
-                ttft_group_df = None
-                tpot_group_df = None
-                conc_col = args.xaxis
-
-                for metric_label in plan.data_cols:
-                    gb = metric_groupbys[metric_label]
-                    df_sorted, raw_data_cols = metric_cache[metric_label]
-
-                    try:
-                        group_df = gb.get_group(gkey)
-                    except KeyError:
-                        missing = (
-                            '<div style="font-size: 1.1em; font-weight: 600; '
-                            'margin: 10px 0;">'
-                            f"{_html.escape(metric_label)} — missing for this group"
-                            "</div>\n"
-                        )
-
-                        main_fh.write(missing)
-                        sub_fh.write(missing)
-                        continue
-
-                    if conc_col not in group_df.columns:
-                        conc_col = _find_concurrency_col(group_df)
-
-                    mn = metric_label.lower().strip()
-                    if "tok/s" in mn:
-                        tput_group_df = group_df
-                    elif "ttft" in mn:
-                        ttft_group_df = group_df
-                    elif mn in ("p99", "median") or "tpot" in mn:
-                        tpot_group_df = group_df
-
-                    display_group = group_df.drop(
-                        columns=group_cols_canonical, errors="ignore"
-                    )
-
-                    html = render_metric_table_html(
-                        display_group, metric_label, suffix, args
-                    )
-                    main_fh.write(html)
-                    sub_fh.write(html)
-
-                    maybe_write_plot(
-                        main_fh,
-                        sub_fh,
-                        group_df=group_df,
-                        raw_data_cols=raw_data_cols,
-                        metric_label=metric_label,
-                        y_axis_col=y_axis_col,
-                        args=args,
-                    )
-
-                summary_html = build_valid_max_concurrency_summary_html(
-                    tput_group_df=tput_group_df,
-                    ttft_group_df=ttft_group_df,
-                    tpot_group_df=tpot_group_df,
-                    conc_col=conc_col,
-                    args=args,
-                )
-                if summary_html:
-                    main_fh.write(summary_html)
-                    sub_fh.write(summary_html)
-
-
-def main():
-    args = build_parser().parse_args()
-    info_cols = list(DEFAULT_INFO_COLS)
-    plan = choose_metrics(args.latency)
-    files, info_cols = prepare_input_files(args, info_cols)
-    write_report_group_first(files, info_cols, plan, args)
-
-
-if __name__ == "__main__":
-    main()
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,414 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import json
-import os
-import shlex
-from importlib import util
-from pathlib import Path
-from typing import Any
-
-import pandas as pd
-import psutil
-import regex as re
-from tabulate import tabulate
-
-# latency results and the keys that will be printed into markdown
-latency_results = []
-latency_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    "avg_latency": "Mean latency (ms)",
-    # "P10": "P10 (s)",
-    # "P25": "P25 (s)",
-    "P50": "Median latency (ms)",
-    # "P75": "P75 (s)",
-    # "P90": "P90 (s)",
-    "P99": "P99 latency (ms)",
-}
-
-# throughput tests and the keys that will be printed into markdown
-throughput_results = []
-throughput_results_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    "num_requests": "# of req.",
-    "total_num_tokens": "Total # of tokens",
-    "elapsed_time": "Elapsed time (s)",
-    "requests_per_second": "Tput (req/s)",
-    "tokens_per_second": "Tput (tok/s)",
-}
-
-# serving results and the keys that will be printed into markdown
-serving_results = []
-serving_column_mapping = {
-    "test_name": "Test name",
-    "model_id": "Model",
-    "dataset_name": "Dataset Name",
-    "input_len": "Input Len",
-    "output_len": "Output Len",
-    "tp_size": "TP Size",
-    "pp_size": "PP Size",
-    "dtype": "dtype",
-    "gpu_type": "GPU",
-    "completed": "# of req.",
-    "qps": "qps",
-    "max_concurrency": "# of max concurrency.",
-    "request_throughput": "Tput (req/s)",
-    "total_token_throughput": "Total Token Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
-    # "total_input_tokens": "Total input tokens",
-    # "total_output_tokens": "Total output tokens",
-    "mean_ttft_ms": "Mean TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
-    "p99_ttft_ms": "P99 TTFT (ms)",
-    "std_ttft_ms": "STD TTFT (ms)",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "median_tpot_ms": "Median",
-    "p99_tpot_ms": "P99",
-    "std_tpot_ms": "STD TPOT (ms)",
-    "mean_itl_ms": "Mean ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "p99_itl_ms": "P99 ITL (ms)",
-}
-
-
-def read_markdown(file):
-    if os.path.exists(file):
-        with open(file) as f:
-            return f.read() + "\n"
-    else:
-        return f"{file} not found.\n"
-
-
-def results_to_json(latency, throughput, serving):
-    return json.dumps(
-        {
-            "latency": latency.to_dict(),
-            "throughput": throughput.to_dict(),
-            "serving": serving.to_dict(),
-        }
-    )
-
-
-def get_size_with_unit(bytes, suffix="B"):
-    """
-    Scale bytes to its proper format
-    e.g:
-        1253656 => '1.20MB'
-        1253656678 => '1.17GB'
-    """
-    factor = 1024
-    for unit in ["", "K", "M", "G", "T", "P"]:
-        if bytes < factor:
-            return f"{bytes:.2f}{unit}{suffix}"
-        bytes /= factor
-
-
-def _coerce(val: str) -> Any:
-    """Best-effort type coercion from string to Python types."""
-    low = val.lower()
-    if low == "null":
-        return None
-    if low == "true":
-        return True
-    if low == "false":
-        return False
-    # integers
-    if re.fullmatch(r"[+-]?\d+", val):
-        try:
-            return int(val)
-        except ValueError:
-            pass
-    # floats (keep 'inf'/'-inf'/'nan' as strings)
-    if re.fullmatch(r"[+-]?\d*\.\d+", val):
-        try:
-            return float(val)
-        except ValueError:
-            pass
-    return val
-
-
-def parse_client_command(cmd: str) -> dict[str, Any]:
-    """Parse the client_command shell string into {executable, script, args}."""
-    toks = shlex.split(cmd)
-    if len(toks) < 2:
-        raise ValueError("client_command must include an executable and a script")
-    executable, script = toks[0], toks[1]
-    args: dict[str, Any] = {}
-
-    i = 2
-    while i < len(toks):
-        t = toks[i]
-        if t.startswith("--"):
-            # --key=value or --key (value) or boolean flag
-            if "=" in t:
-                key, val = t.split("=", 1)
-                if key == "--metadata":
-                    md = {}
-                    if val:
-                        if "=" in val:
-                            k, v = val.split("=", 1)
-                            md[k] = _coerce(v)
-                        else:
-                            md[val] = True
-                    args[key] = md
-                else:
-                    args[key] = _coerce(val)
-                i += 1
-                continue
-
-            key = t
-
-            # Special: consume metadata k=v pairs until next --flag
-            if key == "--metadata":
-                i += 1
-                md = {}
-                while i < len(toks) and not toks[i].startswith("--"):
-                    pair = toks[i]
-                    if "=" in pair:
-                        k, v = pair.split("=", 1)
-                        md[k] = _coerce(v)
-                    else:
-                        md[pair] = True
-                    i += 1
-                args[key] = md
-                continue
-
-            # Standard: check if next token is a value (not a flag)
-            if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
-                args[key] = _coerce(toks[i + 1])
-                i += 2
-            else:
-                # lone flag -> True
-                args[key] = True
-                i += 1
-        else:
-            # unexpected positional; skip
-            i += 1
-
-    return {"executable": executable, "script": script, "args": args}
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-r",
-        "--result",
-        type=str,
-        default="results",
-        help="Folder name for benchmark output results.",
-    )
-    args = parser.parse_args()
-    results_folder = Path(args.result)
-    if not results_folder.exists():
-        raise FileNotFoundError(f"results folder does not exist: {results_folder}")
-    # collect results
-    for test_file in results_folder.glob("*.json"):
-        with open(test_file) as f:
-            raw_result = json.loads(f.read())
-
-        if "serving" in str(test_file):
-            # this result is generated via `vllm bench serve` command
-            # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-            # Parse Server Command Arg
-            out: dict[str, Any] = {
-                "server_command": parse_client_command(command["server_command"])
-            }
-            parse_args = [
-                "--tensor-parallel-size",
-                "--pipeline-parallel-size",
-                "--dtype",
-            ]
-            col_mapping = ["tp_size", "pp_size", "dtype"]
-            for index, arg in enumerate(parse_args):
-                if arg in out["server_command"]["args"]:
-                    raw_result.update(
-                        {col_mapping[index]: out["server_command"]["args"][arg]}
-                    )
-
-            # Parse Client Command Arg
-            out: dict[str, Any] = {
-                "client_command": parse_client_command(command["client_command"])
-            }
-            parse_args = [
-                "--dataset-name",
-                "--random-input-len",
-                "--random-output-len",
-                "--request-rate",
-            ]
-            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
-
-            for index, arg in enumerate(parse_args):
-                if arg in out["client_command"]["args"]:
-                    raw_result.update(
-                        {col_mapping[index]: out["client_command"]["args"][arg]}
-                    )
-            # Add Server, Client command
-            raw_result.update(command)
-
-            # update the test name of this result
-            raw_result.update({"test_name": test_file.stem})
-            # add the result to raw_result
-            serving_results.append(raw_result)
-            continue
-
-        elif "latency" in f.name:
-            # this result is generated via `vllm bench latency` command
-
-            # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
-            raw_result.update(command)
-
-            # update the test name of this result
-            raw_result.update({"test_name": test_file.stem})
-
-            # get different percentiles
-            for perc in [10, 25, 50, 75, 90, 99]:
-                # Multiply 1000 to convert the time unit from s to ms
-                raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
-                )
-            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
-
-            # add the result to raw_result
-            latency_results.append(raw_result)
-            continue
-
-        elif "throughput" in f.name:
-            # this result is generated via `vllm bench throughput` command
-
-            # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
-            raw_result.update(command)
-
-            # update the test name of this result
-            raw_result.update({"test_name": test_file.stem})
-
-            # add the result to raw_result
-            throughput_results.append(raw_result)
-            continue
-
-        print(f"Skipping {test_file}")
-
-    latency_results = pd.DataFrame.from_dict(latency_results)
-    serving_results = pd.DataFrame.from_dict(serving_results)
-    throughput_results = pd.DataFrame.from_dict(throughput_results)
-
-    svmem = psutil.virtual_memory()
-    platform_data = {
-        "Physical cores": [psutil.cpu_count(logical=False)],
-        "Total cores": [psutil.cpu_count(logical=True)],
-        "Total Memory": [get_size_with_unit(svmem.total)],
-    }
-
-    if util.find_spec("numa") is not None:
-        from numa import info
-
-        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]
-
-    if util.find_spec("cpuinfo") is not None:
-        from cpuinfo import get_cpu_info
-
-        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]
-
-    platform_results = pd.DataFrame.from_dict(
-        platform_data, orient="index", columns=["Platform Info"]
-    )
-
-    raw_results_json = results_to_json(
-        latency_results, throughput_results, serving_results
-    )
-
-    # remapping the key, for visualization purpose
-    if not latency_results.empty:
-        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
-            columns=latency_column_mapping
-        )
-    if not serving_results.empty:
-        valid_columns = [
-            col for col in serving_column_mapping if col in serving_results.columns
-        ]
-        serving_results = serving_results[valid_columns].rename(
-            columns=serving_column_mapping
-        )
-    if not throughput_results.empty:
-        throughput_results = throughput_results[
-            list(throughput_results_column_mapping.keys())
-        ].rename(columns=throughput_results_column_mapping)
-
-    processed_results_json = results_to_json(
-        latency_results, throughput_results, serving_results
-    )
-
-    for df in [latency_results, serving_results, throughput_results]:
-        if df.empty:
-            continue
-
-        # Sort all dataframes by their respective "Test name" columns
-        df.sort_values(by="Test name", inplace=True)
-
-        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
-        # we want to turn it into "8xGPUTYPE"
-        df["GPU"] = df["GPU"].apply(
-            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
-        )
-
-    # get markdown tables
-    latency_md_table = tabulate(
-        latency_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    serving_md_table = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    throughput_md_table = tabulate(
-        throughput_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    platform_md_table = tabulate(
-        platform_results, headers="keys", tablefmt="pipe", showindex=True
-    )
-
-    # document the result
-    md_file = "benchmark_results.md"
-    json_file = "benchmark_results.json"
-    with open(results_folder / md_file, "w") as f:
-        results = read_markdown(
-            "../.buildkite/performance-benchmarks/"
-            "performance-benchmarks-descriptions.md"
-        )
-        results = results.format(
-            latency_tests_markdown_table=latency_md_table,
-            throughput_tests_markdown_table=throughput_md_table,
-            serving_tests_markdown_table=serving_md_table,
-            platform_markdown_table=platform_md_table,
-            benchmarking_results_in_json_string=processed_results_json,
-        )
-        f.write(results)
-
-    # document benchmarking results in json
-    with open(results_folder / json_file, "w") as f:
-        results = (
-            latency_results.to_dict(orient="records")
-            + throughput_results.to_dict(orient="records")
-            + serving_results.to_dict(orient="records")
-        )
-        f.write(json.dumps(results))
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,498 +0,0 @@
-#!/bin/bash
-
-# This script should be run inside the CI process
-# This script assumes that we are already inside the vllm/ directory
-# Benchmarking results will be available inside vllm/benchmarks/results/
-
-# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
-# and we still want to see other benchmarking results even when mixtral crashes.
-set -x
-set -o pipefail
-
-check_gpus() {
-  if command -v nvidia-smi; then
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-  elif command -v hl-smi; then
-    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
-  fi
-
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-
-  declare -g arch_suffix=''
-
-  if command -v nvidia-smi; then
-    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
-  elif command -v amd-smi; then
-    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-  elif command -v hl-smi; then
-    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
-    arch_suffix='-hpu'
-  fi
-  echo "GPU type is $gpu_type"
-}
-
-check_cpus() {
-  # check the number of CPUs and NUMA Node and GPU type.
-  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
-  if [[ $numa_count -gt 0 ]]; then
-    echo "NUMA found."
-    echo $numa_count
-  else
-    echo "Need at least 1 NUMA to run benchmarking."
-    exit 1
-  fi
-  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
-    declare -g gpu_type="arm64-cpu"
-  else
-    declare -g gpu_type="cpu"
-  fi
-  echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-  # check if HF_TOKEN is available and valid
-  if [[ -z "$HF_TOKEN" ]]; then
-    echo "Error: HF_TOKEN is not set."
-    exit 1
-  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-    echo "Error: HF_TOKEN does not start with 'hf_'."
-    exit 1
-  else
-    echo "HF_TOKEN is set and valid."
-  fi
-}
-
-ensure_sharegpt_downloaded() {
-  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
-  if [ ! -f "$FILE" ]; then
-    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
-  else
-    echo "$FILE already exists."
-  fi
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-json2envs() {
-  # transforms the JSON string to environment variables.
-  # example:
-  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
-  # output: VLLM_CPU_KVCACHE_SPACE=5
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map((.key ) + "=" + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  local timeout_val="1200"
-  timeout "$timeout_val" bash -c '
-    until curl -X POST localhost:8000/v1/completions; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-kill_processes_launched_by_current_bash() {
-  # Kill all python processes launched from current bash script
-  current_shell_pid=$$
-  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
-  if [ -n "$processes" ]; then
-    echo "Killing the following processes matching '$1':"
-    echo "$processes"
-    echo "$processes" | xargs kill -9
-  else
-    echo "No processes found matching '$1'."
-  fi
-}
-
-kill_gpu_processes() {
-
-  ps -aux
-  lsof -t -i:8000 | xargs -r kill -9
-  pgrep python3 | xargs -r kill -9
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pgrep VLLM | xargs -r kill -9
-
-  # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v amd-smi; then
-    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v hl-smi; then
-    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
-      sleep 1
-    done
-  fi
-
-  # remove vllm config file
-  rm -rf ~/.config/vllm
-
-}
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
-  if command -v buildkite-agent >/dev/null 2>&1; then
-    BUILDKITE_AGENT_COMMAND="buildkite-agent"
-  elif [ -f /workspace/buildkite-agent ]; then
-    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
-  else
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-
-  # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
-  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
-}
-
-run_benchmark_tests() {
-  # run benchmark tests using `vllm bench <test_type>` command
-  # $1: test type (latency or throughput)
-  # $2: a json file specifying test cases
-
-  local test_type=$1
-  local test_file=$2
-
-  # Iterate over tests
-  jq -c '.[]' "$test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
-      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # get arguments
-    bench_params=$(echo "$params" | jq -r '.parameters')
-    bench_args=$(json2args "$bench_params")
-    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    bench_envs=$(json2envs "$bench_environment_variables")
-
-    # check if there is enough GPU to run the test
-    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
-    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
-        continue
-      fi
-    else
-      if [[ $gpu_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-        continue
-      fi
-    fi
-
-    bench_command=" $bench_envs vllm bench $test_type \
-      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $bench_args"
-
-    echo "Running test case $test_name"
-    echo "${test_type^} command: $bench_command"
-
-    # recording benchmarking command and GPU command
-    jq_output=$(jq -n \
-      --arg command "$bench_command" \
-      --arg gpu "$gpu_type" \
-      --arg test_type "$test_type" \
-      '{
-        ($test_type + "_command"): $command,
-        gpu_type: $gpu
-      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
-
-    # run the benchmark
-    eval "$bench_command"
-
-    kill_gpu_processes
-
-  done
-}
-
-run_latency_tests() {
-  run_benchmark_tests "latency" "$1"
-}
-
-run_startup_tests() {
-  run_benchmark_tests "startup" "$1"
-}
-
-run_throughput_tests() {
-  run_benchmark_tests "throughput" "$1"
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-  #
-  # Supported JSON formats:
-  # 1) Plain format: top-level array
-  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #
-  # 2) Default parameters field + plain format tests
-  #    {
-  #      "defaults": { ... },
-  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #    }
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '
-    if type == "array" then
-      # Plain format: test cases array
-      .[]
-    elif (type == "object" and has("tests")) then
-      # merge the default parameters into each test cases
-      . as $root
-      | ($root.defaults // {}) as $d
-      | ($root.tests // [])[]
-      # default qps / max_concurrency from defaults if missing
-      | .qps_list = (.qps_list // $d.qps_list)
-      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
-      # merge envs / params: test overrides defaults
-      | .server_environment_variables =
-          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
-      | .server_parameters =
-          (($d.server_parameters // {}) + (.server_parameters // {}))
-      | .client_parameters =
-          (($d.client_parameters // {}) + (.client_parameters // {}))
-    else
-      error("Unsupported serving test file format: must be array or object with .tests")
-    end
-  ' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^serving_ ]]; then
-      echo "In serving-test.json, test_name must start with \"serving_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # get client and server arguments (after merged the default parameters)
-    server_params=$(echo "$params" | jq -r '.server_parameters')
-    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
-    client_params=$(echo "$params" | jq -r '.client_parameters')
-
-    server_args=$(json2args "$server_params")
-    server_envs=$(json2envs "$server_envs")
-    client_args=$(json2args "$client_params")
-
-    # qps_list
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # max_concurrency_list (fallback to num_prompts if missing)
-    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
-    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-      max_concurrency_list="[$num_prompts]"
-    fi
-    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
-    echo "Running over max concurrency list $max_concurrency_list"
-
-    # check if there is enough resources to run the test
-    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
-        continue
-      fi
-    else
-      if [[ $gpu_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-        continue
-      fi
-    fi
-
-    # check if server model and client model is aligned
-    server_model=$(echo "$server_params" | jq -r '.model')
-    client_model=$(echo "$client_params" | jq -r '.model')
-    if [[ $server_model != "$client_model" ]]; then
-      echo "Server model and client model must be the same. Skip testcase $test_name."
-      continue
-    fi
-
-    server_command="$server_envs vllm serve \
-      $server_args"
-
-    # run the server
-    echo "Running test case $test_name"
-    echo "Server command: $server_command"
-    # support remote vllm server
-    client_remote_args=""
-    if [[ -z "${REMOTE_HOST}" ]]; then
-      bash -c "$server_command" &
-      server_pid=$!
-      # wait until the server is alive
-      if wait_for_server; then
-        echo ""
-        echo "vLLM server is up and running."
-      else
-        echo ""
-        echo "vLLM failed to start within the timeout period."
-      fi
-    else
-      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
-      if [[ ${REMOTE_PORT} ]]; then
-        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
-      else
-        client_remote_args=" --host=$REMOTE_HOST "
-      fi
-    fi
-
-    # save the compilation mode and optimization level on the serving results
-    # whenever they are set
-    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
-    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      # iterate over different max_concurrency
-      for max_concurrency in $max_concurrency_list; do
-        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
-        echo " new test name $new_test_name"
-        # pass the tensor parallel size, the compilation mode, and the optimization
-        # level to the client so that they can be used on the benchmark dashboard
-        client_command="vllm bench serve \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --max-concurrency $max_concurrency \
-          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
-          $client_args $client_remote_args "
-
-        echo "Running test case $test_name with qps $qps"
-        echo "Client command: $client_command"
-
-        bash -c "$client_command"
-
-        # record the benchmarking commands
-        jq_output=$(jq -n \
-          --arg server "$server_command" \
-          --arg client "$client_command" \
-          --arg gpu "$gpu_type" \
-          '{
-            server_command: $server,
-            client_command: $client,
-            gpu_type: $gpu
-          }')
-        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-      done
-    done
-
-    # clean up
-    kill -9 $server_pid
-    kill_gpu_processes
-  done
-}
-
-main() {
-  local ARCH
-  ARCH=''
-  if [[ "$ON_CPU" == "1" ]]; then
-    check_cpus
-    ARCH="-$gpu_type"
-  else
-     check_gpus
-     ARCH="$arch_suffix"
-  fi
-  check_hf_token
-
-  # dependencies
-  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-  (which jq) || (apt-get update && apt-get -y install jq)
-  (which lsof) || (apt-get update && apt-get install -y lsof)
-
-  # get the current IP address, required by `vllm bench serve` command
-  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
-  # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOGGING_LEVEL="WARNING"
-
-  # prepare for benchmarking
-  cd benchmarks || exit 1
-  ensure_sharegpt_downloaded
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
-
-  # dump vllm info via vllm collect-env
-  env_output=$(vllm collect-env)
-
-  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
-
-  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
-  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
-  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
-  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
-
-  # postprocess benchmarking results
-  pip install tabulate pandas
-  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
-
-  upload_to_buildkite
-}
-
-main "$@"
--- a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
@@ -1,26 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
@@ -1,26 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp2",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -1,55 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15,
-            "max-model-len": 256,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "latency_llama70B_tp4",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15,
-            "max-model-len": 256,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "latency_mixtral8x7B_tp2",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15,
-            "max-model-len": 256,
-            "async-scheduling": ""
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
@@ -1,130 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [
-      12,
-      16,
-      24,
-      32,
-      64,
-      128,
-      200
-    ],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "distributed_executor_backend": "mp",
-      "block_size": 128,
-      "trust_remote_code": "",
-      "disable_log_stats": "",
-      "enforce_eager": "",
-      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256,
-      "load_format": "dummy"
-    },
-    "client_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "backend": "vllm",
-      "ignore-eos": "",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama8B_tp1_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -1,283 +0,0 @@
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "distributed_executor_backend": "mp",
-      "block_size": 128,
-      "trust_remote_code": "",
-      "disable_log_stats": "",
-      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256
-    },
-    "client_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "backend": "vllm",
-      "ignore-eos": "",
-      "num_prompts": 200
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama8B_tp1_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_sharegpt",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "sharegpt",
-        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp1_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp2_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_tp4_random_2048_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp1_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp2_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp4_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama3B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_granite2B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen1.7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen4B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen8B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_glm9B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_gemma7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "google/gemma-7b",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "google/gemma-7b",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    }
-  ]
-}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -1,82 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy",
-            "max-model-len": 2048,
-            "max-num-seqs": 256,
-            "async-scheduling": ""
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama70B_tp4_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy",
-            "max-model-len": 2048,
-            "max-num-seqs": 256,
-            "async-scheduling": ""
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "server_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy",
-            "max-model-len": 2048,
-            "max-num-seqs": 256,
-            "async-scheduling": ""
-        },
-        "client_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
@@ -1,27 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
@@ -1,27 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp2",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -1,61 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 1000,
-            "backend": "vllm",
-            "max-model-len": 2048,
-            "max-num-seqs": 512,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "throughput_llama70B_tp4",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 1000,
-            "backend": "vllm",
-            "max-model-len": 2048,
-            "max-num-seqs": 512,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "throughput_mixtral8x7B_tp2",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 1000,
-            "backend": "vllm",
-            "max-model-len": 2048,
-            "max-num-seqs": 512,
-            "async-scheduling": ""
-        }
-    }
-]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,734 +1,88 @@
 steps:
+  - label: "Build wheel - CUDA 12.4"
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build wheel - CUDA 12.1"
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
+  # However, this block can be uncommented to save some compute hours.
+  # - block: "Build CUDA 11.8 wheel"
+  #   key: block-build-cu118-wheel
+
+  - label: "Build wheel - CUDA 11.8"
+    # depends_on: block-build-cu118-wheel
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - block: "Build release image"
+    depends_on: ~
+    key: block-release-image-build
+
+  - label: "Build release image"
+    depends_on: block-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Build and publish TPU release image"
+    depends_on: ~
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: tpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "docker push vllm/vllm-tpu:nightly"
+      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllm
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+
  - input: "Provide Release version here"
-    id: input-release-version
    fields:
      - text: "What is the release version?"
-        key: release-version
+        key: "release-version"

-  - group: "Build Python wheels"
-    key: "build-wheels"
-    steps:
-      - label: "Build wheel - aarch64 - CUDA 12.9"
-        depends_on: ~
-        id: build-wheel-arm64-cuda-12-9
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - aarch64 - CUDA 13.0"
-        depends_on: ~
-        id: build-wheel-arm64-cuda-13-0
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - aarch64 - CPU"
-        depends_on: ~
-        id: build-wheel-arm64-cpu
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - x86_64 - CUDA 12.9"
-        depends_on: ~
-        id: build-wheel-x86-cuda-12-9
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - x86_64 - CUDA 13.0"
-        depends_on: ~
-        id: build-wheel-x86-cuda-13-0
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - label: "Build wheel - x86_64 - CPU"
-        depends_on: ~
-        id: build-wheel-x86-cpu
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-          - "mkdir artifacts"
-          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-  - group: "Build release Docker images"
-    key: "build-release-images"
-    steps:
-      - label: "Build release image - x86_64 - CUDA 12.9"
-        depends_on: ~
-        id: build-release-image-x86
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-          # re-tag to default image tag and push, just in case arm64 build fails
-          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-      - label: "Build release image - aarch64 - CUDA 12.9"
-        depends_on: ~
-        id: build-release-image-arm64
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-
-      - label: "Build release image - x86_64 - CUDA 13.0"
-        depends_on: ~
-        id: build-release-image-x86-cuda-13-0
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
-          # re-tag to default image tag and push, just in case arm64 build fails
-          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
-      - label: "Build release image - aarch64 - CUDA 13.0"
-        depends_on: ~
-        id: build-release-image-arm64-cuda-13-0
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
-
-      - block: "Build release image for x86_64 CPU"
-        key: block-cpu-release-image-build
-        depends_on: ~
-
-      - label: "Build release image - x86_64 - CPU"
-        depends_on:
-          - block-cpu-release-image-build
-          - input-release-version
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-      - block: "Build release image for arm64 CPU"
-        key: block-arm64-cpu-release-image-build
-        depends_on: ~
-
-      - label: "Build release image - arm64 - CPU"
-        depends_on: 
-          - block-arm64-cpu-release-image-build
-          - input-release-version
-        agents:
-          queue: arm64_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-        env:
-          DOCKER_BUILDKIT: "1"
-
-  - group: "Publish release images"
-    key: "publish-release-images"
-    steps:
-      - label: "Create multi-arch manifest - CUDA 12.9"
-        depends_on:
-          - build-release-image-x86
-          - build-release-image-arm64
-        id: create-multi-arch-manifest
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
-          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-      - label: "Annotate release workflow - CUDA 12.9"
-        depends_on:
-          - create-multi-arch-manifest
-        id: annotate-release-workflow
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/annotate-release.sh"
-
-      - label: "Create multi-arch manifest - CUDA 13.0"
-        depends_on:
-          - build-release-image-x86-cuda-13-0
-          - build-release-image-arm64-cuda-13-0
-        id: create-multi-arch-manifest-cuda-13-0
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
-          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
-      - label: "Publish nightly multi-arch image to DockerHub"
-        depends_on:
-          - create-multi-arch-manifest
-        if: build.env("NIGHTLY") == "1"
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/push-nightly-builds.sh"
-          # Clean up old nightly builds (keep only last 14)
-          - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
-        plugins:
-          - docker-login#v3.0.0:
-              username: vllmbot
-              password-env: DOCKERHUB_TOKEN
-        env:
-          DOCKER_BUILDKIT: "1"
-          DOCKERHUB_USERNAME: "vllmbot"
-
-      - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
-        depends_on:
-          - create-multi-arch-manifest-cuda-13-0
-        if: build.env("NIGHTLY") == "1"
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
-          # Clean up old nightly builds (keep only last 14)
-          - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
-        plugins:
-          - docker-login#v3.0.0:
-              username: vllmbot
-              password-env: DOCKERHUB_TOKEN
-        env:
-          DOCKER_BUILDKIT: "1"
-          DOCKERHUB_USERNAME: "vllmbot"
-
-  - group: "Publish release artifacts"
-    key: "publish-release-artifacts"
-    steps:
-      - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
-        key: block-upload-release-wheels
-        depends_on:
-          - input-release-version
-          - build-wheels
-
-      - label: "Upload release wheels to PyPI"
-        depends_on:
-          - block-upload-release-wheels
-        id: upload-release-wheels
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
-      
-      - block: "Confirm update release images to DockerHub"
-        key: block-update-release-images-dockerhub
-        depends_on:
-          - input-release-version
-          - annotate-release-workflow
-      
-      - label: "Publish release images to DockerHub"
-        depends_on:
-          - block-update-release-images-dockerhub
-        agents:
-          queue: small_cpu_queue_postmerge
-        commands:
-          - "bash .buildkite/scripts/push-release-images-dockerhub.sh"
-        plugins:
-          - docker-login#v3.0.0:
-              username: vllmbot
-              password-env: DOCKERHUB_TOKEN
-        env:
-          DOCKER_BUILDKIT: "1"
-          DOCKERHUB_USERNAME: "vllmbot"
-
-  # =============================================================================
-  # ROCm Release Pipeline (x86_64 only)
-  # =============================================================================
-  #
-  # vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
-  # To build a specific version, trigger the build from that branch/tag.
-  #
-  # Environment variables for ROCm builds (set via Buildkite UI or schedule):
-  #   ROCM_PYTHON_VERSION: Python version (default: 3.12)
-  #   PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
-  #   ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
-  #   ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
-  #
-  # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
-  #       (currently rocm/dev-ubuntu-22.04:7.1-complete)
-  #
-  # =============================================================================
-
-  # ROCm Input Step - Collect build configuration (manual trigger only)
-  - input: "ROCm Wheel Release Build Configuration"
-    key: input-rocm-config
+  - block: "Build CPU release image"
+    key: block-cpu-release-image-build
    depends_on: ~
-    if: build.source == "ui"
-    fields:
-      - text: "Python Version"
-        key: "rocm-python-version"
-        default: "3.12"
-        hint: "Python version (e.g., 3.12)"
-      - text: "GPU Architectures"
-        key: "rocm-pytorch-rocm-arch"
-        default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
-        hint: "Semicolon-separated GPU architectures"
-      - select: "Upload Wheels to S3"
-        key: "rocm-upload-wheels"
-        default: "true"
-        options:
-          - label: "No - Build only (nightly/dev)"
-            value: "false"
-          - label: "Yes - Upload to S3 (release)"
-            value: "true"
-      - select: "Force Rebuild Base Wheels"
-        key: "rocm-force-rebuild"
-        default: "false"
-        hint: "Ignore S3 cache and rebuild base wheels from scratch"
-        options:
-          - label: "No - Use cached wheels if available"
-            value: "false"
-          - label: "Yes - Rebuild even if cache exists"
-            value: "true"

-  # ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
-  - label: ":rocm: Build ROCm Base Wheels"
-    id: build-rocm-base-wheels
-    depends_on:
-      - step: input-rocm-config
-        allow_failure: true  # Allow failure so non-UI builds can proceed (input step is skipped)
+  - label: "Build and publish CPU release image"
+    depends_on: block-cpu-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
-      # Set configuration and check cache
-      - |
-        set -euo pipefail
-
-        # Get values from meta-data (set by input step) or use defaults
-        PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
-        export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
-
-        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
-        export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
-
-        # Check for force rebuild flag
-        ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
-        if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
-          ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
-        fi
-
-        echo "========================================"
-        echo "ROCm Base Wheels Build Configuration"
-        echo "========================================"
-        echo "  PYTHON_VERSION: $${PYTHON_VERSION}"
-        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
-        echo "  ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
-        echo "========================================"
-
-        # Save resolved config for later jobs
-        buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
-        buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
-
-        # Check S3 cache for pre-built wheels
-        CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
-        CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
-        echo ""
-        echo "Cache key: $${CACHE_KEY}"
-        echo "Cache path: $${CACHE_PATH}"
-
-        # Save cache key for downstream jobs
-        buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
-
-        CACHE_STATUS="miss"
-        if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
-          CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
-        else
-          echo "Force rebuild requested, skipping cache check"
-        fi
-
-        if [ "$${CACHE_STATUS}" = "hit" ]; then
-          echo ""
-          echo "CACHE HIT! Downloading pre-built wheels..."
-          echo ""
-          .buildkite/scripts/cache-rocm-base-wheels.sh download
-
-          # Set the S3 path for the cached Docker image (for Job 2 to download)
-          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
-          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-
-          # Mark that we used cache (for Docker image handling)
-          buildkite-agent meta-data set "rocm-used-cache" "true"
-
-          echo ""
-          echo "Cache download complete. Skipping Docker build."
-          echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-        else
-          echo ""
-          echo "CACHE MISS. Building from scratch..."
-          echo ""
-
-          # Build full base image (for later vLLM build)
-          DOCKER_BUILDKIT=1 docker buildx build \
-            --file docker/Dockerfile.rocm_base \
-            --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
-            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
-            --build-arg USE_SCCACHE=1 \
-            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-            --build-arg SCCACHE_REGION_NAME=us-west-2 \
-            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-            --load \
-            .
-
-          # Build debs_wheel_release stage for wheel extraction
-          DOCKER_BUILDKIT=1 docker buildx build \
-            --file docker/Dockerfile.rocm_base \
-            --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
-            --target debs_wheel_release \
-            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
-            --build-arg USE_SCCACHE=1 \
-            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-            --build-arg SCCACHE_REGION_NAME=us-west-2 \
-            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-            --load \
-            .
-
-          # Extract wheels from Docker image
-          mkdir -p artifacts/rocm-base-wheels
-          container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
-          docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
-          docker rm $${container_id}
-          echo "Extracted base wheels:"
-          ls -lh artifacts/rocm-base-wheels/
-
-          # Upload wheels to S3 cache for future builds
-          echo ""
-          echo "Uploading wheels to S3 cache..."
-          .buildkite/scripts/cache-rocm-base-wheels.sh upload
-
-          # Export base Docker image for reuse in vLLM build
-          mkdir -p artifacts/rocm-docker-image
-          docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
-          echo "Docker image size:"
-          ls -lh artifacts/rocm-docker-image/
-
-          # Upload large Docker image to S3 (also cached by cache key)
-          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
-          echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
-          aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-
-          # Save the S3 path for downstream jobs
-          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
-
-          # Mark that we did NOT use cache
-          buildkite-agent meta-data set "rocm-used-cache" "false"
-
-          echo ""
-          echo "Build complete. Wheels cached for future builds."
-        fi
-    artifact_paths:
-      - "artifacts/rocm-base-wheels/*.whl"
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"
-
-  # ROCm Job 2: Build vLLM ROCm Wheel
-  - label: ":python: Build vLLM ROCm Wheel - x86_64"
-    id: build-rocm-vllm-wheel
-    depends_on:
-      - step: build-rocm-base-wheels
-        allow_failure: false
-    agents:
-      queue: cpu_queue_postmerge
-    timeout_in_minutes: 180
-    commands:
-      # Download artifacts and prepare Docker image
-      - |
-        set -euo pipefail
-
-        # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
-        # This fixes version detection when tags are moved/force-pushed
-        echo "Fetching latest tags from origin..."
-        git fetch --tags --force origin
-        
-        # Log tag information for debugging version detection
-        echo "========================================"
-        echo "Git Tag Verification"
-        echo "========================================"
-        echo "Current HEAD: $(git rev-parse HEAD)"
-        echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
-        echo ""
-        echo "Recent tags (pointing to commits near HEAD):"
-        git tag -l --sort=-creatordate | head -5
-        echo "setuptools_scm version detection:"
-        pip install -q setuptools_scm 2>/dev/null || true
-        python3 -c "import setuptools_scm; print('  Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo "  (setuptools_scm not available in this environment)"
-        echo "========================================"
-
-        # Download wheel artifacts from current build
-        echo "Downloading wheel artifacts from current build"
-        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
-
-        # Download Docker image from S3 (too large for Buildkite artifacts)
-        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
-        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
-          echo "ERROR: rocm-docker-image-s3-path metadata not found"
-          echo "This should have been set by the build-rocm-base-wheels job"
-          exit 1
-        fi
-        echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
-        mkdir -p artifacts/rocm-docker-image
-        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
-
-        # Load base Docker image and capture the tag
-        echo "Loading base Docker image..."
-        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
-        echo "$${LOAD_OUTPUT}"
-        # Extract the actual loaded image tag from "Loaded image: <tag>" output
-        # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
-        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
-        if [ -z "$${BASE_IMAGE_TAG}" ]; then
-          echo "ERROR: Failed to extract image tag from docker load output"
-          echo "Load output was: $${LOAD_OUTPUT}"
-          exit 1
-        fi
-        echo "Loaded base image: $${BASE_IMAGE_TAG}"
-
-        # Prepare base wheels for Docker build context
-        mkdir -p docker/context/base-wheels
-        touch docker/context/base-wheels/.keep
-        cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
-        echo "Base wheels for vLLM build:"
-        ls -lh docker/context/base-wheels/
-
-        # Get GPU architectures from meta-data
-        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
-        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
-
-        echo "========================================"
-        echo "Building vLLM wheel with:"
-        echo "  BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
-        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
-        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
-        echo "  BASE_IMAGE: $${BASE_IMAGE_TAG}"
-        echo "========================================"
-
-        # Build vLLM wheel using local checkout (REMOTE_VLLM=0)
-        DOCKER_BUILDKIT=1 docker build \
-          --file docker/Dockerfile.rocm \
-          --target export_vllm_wheel_release \
-          --output type=local,dest=rocm-dist \
-          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
-          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-          --build-arg REMOTE_VLLM=0 \
-          --build-arg GIT_REPO_CHECK=1 \
-          --build-arg USE_SCCACHE=1 \
-          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-          --build-arg SCCACHE_REGION_NAME=us-west-2 \
-          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-          .
-
-        echo "Built vLLM wheel:"
-        ls -lh rocm-dist/*.whl
-
-        # Copy wheel to artifacts directory
-        mkdir -p artifacts/rocm-vllm-wheel
-        cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
-        echo "Final vLLM wheel:"
-        ls -lh artifacts/rocm-vllm-wheel/
-    artifact_paths:
-      - "artifacts/rocm-vllm-wheel/*.whl"
-    env:
-      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"
-
-  # ROCm Job 3: Upload Wheels to S3
-  - label: ":s3: Upload ROCm Wheels to S3"
-    id: upload-rocm-wheels
-    depends_on:
-      - step: build-rocm-vllm-wheel
-        allow_failure: false
-    agents:
-      queue: cpu_queue_postmerge
-    timeout_in_minutes: 60
-    commands:
-      # Download all wheel artifacts and run upload
-      - |
-        set -euo pipefail
-
-        # Check if upload is enabled (from env var, meta-data, or release branch)
-        ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
-        if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
-          # Try to get from meta-data (input form)
-          ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
-        fi
-
-        echo "========================================"
-        echo "Upload check:"
-        echo "  ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
-        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
-        echo "========================================"
-
-        # Skip upload if not enabled
-        if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
-          echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
-          echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
-          exit 0
-        fi
-
-        echo "Upload enabled, proceeding..."
-
-        # Download artifacts from current build
-        echo "Downloading artifacts from current build"
-        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
-        buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
-
-        # Run upload script
-        bash .buildkite/scripts/upload-rocm-wheels.sh
-    env:
-      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"
-
-  # ROCm Job 4: Annotate ROCm Wheel Release
-  - label: ":memo: Annotate ROCm wheel release"
-    id: annotate-rocm-release
-    depends_on:
-      - step: upload-rocm-wheels
-        allow_failure: true
-      - step: input-release-version
-        allow_failure: true
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-rocm-release.sh"
-    env:
-      S3_BUCKET: "vllm-wheels"
-
-  # ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
-  # This is the job to create https://wheels.vllm.ai/rocm/ index allowing
-  # users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
-  - block: "Generate Root Index for ROCm Wheels for Release"
-    key: block-generate-root-index-rocm-wheels
-    depends_on: upload-rocm-wheels
-
-  - label: ":package: Generate Root Index for ROCm Wheels for Release"
-    depends_on: block-generate-root-index-rocm-wheels
-    id: generate-root-index-rocm-wheels
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
-    env:
-      S3_BUCKET: "vllm-wheels"
-      VARIANT: "rocm700"
-
-  # ROCm Job 6: Build ROCm Release Docker Image
-  - label: ":docker: Build release image - x86_64 - ROCm"
-    id: build-rocm-release-image
-    depends_on:
-      - step: build-rocm-base-wheels
-        allow_failure: false
-    agents:
-      queue: cpu_queue_postmerge
-    timeout_in_minutes: 60
-    commands:
-      - |
-        set -euo pipefail
-
-        # Login to ECR
-        aws ecr-public get-login-password --region us-east-1 | \
-          docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
-
-        # Download Docker image from S3 (set by build-rocm-base-wheels)
-        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
-        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
-          echo "ERROR: rocm-docker-image-s3-path metadata not found"
-          exit 1
-        fi
-
-        echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
-        mkdir -p artifacts/rocm-docker-image
-        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
-
-        # Load base Docker image
-        echo "Loading base Docker image..."
-        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
-        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
-        echo "Loaded base image: $${BASE_IMAGE_TAG}"
-
-        # Tag and push the base image to ECR
-        docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
-        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
-        echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
-
-        # Get GPU architectures from meta-data
-        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
-        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
-
-        # Build vLLM ROCm release image using cached base
-        DOCKER_BUILDKIT=1 docker build \
-          --build-arg max_jobs=16 \
-          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
-          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
-          --build-arg USE_SCCACHE=1 \
-          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
-          --build-arg SCCACHE_REGION_NAME=us-west-2 \
-          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
-          --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
-          --target vllm-openai \
-          --progress plain \
-          -f docker/Dockerfile.rocm .
-
-        # Push to ECR
-        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
-        echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
-    env:
-      DOCKER_BUILDKIT: "1"
-      S3_BUCKET: "vllm-wheels"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# This script runs tests inside the corresponding ROCm docker container.
+set -o pipefail
+
+# Wait for the GPUs to report a "clean" state before proceeding
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- ROCm info"
+rocminfo
+
+# cleanup older docker images
+cleanup_docker() {
+  # Ask Docker for its data root directory (DockerRootDir); exit with status 1 if it comes back empty
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Take field 5 (Use%) of the last df line for the filesystem holding Docker's root, with the trailing % stripped
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Disk usage percentage above which cleanup is triggered
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes; only if that succeeds, also system-prune all unused objects (--all) matching the until=72h filter
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+
+# Call the cleanup docker function
+cleanup_docker
+
+echo "--- Resetting GPUs"
+
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- Pulling container" 
+image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull "${image_name}"
+
+remove_docker_container() {
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+}
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+commands=$@
+echo "Commands:$commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels "* ]]; then
+  commands="${commands} \
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py \
+  --ignore=kernels/test_cascade_flash_attn.py \
+  --ignore=kernels/test_mamba_mixer2.py \
+  --ignore=kernels/test_aqlm.py \
+  --ignore=kernels/test_machete_mm.py \
+  --ignore=kernels/test_mha_attn.py \
+  --ignore=kernels/test_block_fp8.py \
+  --ignore=kernels/test_permute_cols.py"
+fi
+
+#ignore certain Entrypoints/openai tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_chat.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_sleep.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
+fi
+
+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
+PARALLEL_JOB_COUNT=8
+# Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
+if [[ $commands == *"--shard-id="* ]]; then
+  # assign job count as the number of shards used   
+  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # assign shard-id for each shard
+    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    echo "Shard ${GPU} commands:$commands_gpu"
+    docker run \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
+        --shm-size=16gb \
+        --rm \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
+        -e HF_TOKEN \
+        -e AWS_ACCESS_KEY_ID \
+        -e AWS_SECRET_ACCESS_KEY \
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)
+  done
+  #wait for all processes to finish and collect exit codes
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
+    STATUS+=($?)
+  done
+  for st in "${STATUS[@]}"; do
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit "${st}"
+    fi
+  done
+else
+  docker run \
+          --device /dev/kfd --device /dev/dri \
+          --network host \
+          --shm-size=16gb \
+          --rm \
+          -e HIP_VISIBLE_DEVICES=0 \
+          -e HF_TOKEN \
+          -e AWS_ACCESS_KEY_ID \
+          -e AWS_SECRET_ACCESS_KEY \
+          -v "${HF_CACHE}:${HF_MOUNT}" \
+          -e "HF_HOME=${HF_MOUNT}" \
+          --name "${container_name}" \
+          "${image_name}" \
+          /bin/bash -c "${commands}"
+fi
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -5,26 +5,26 @@
 set -ex
 set -o pipefail

-# cd 2 levels into the working directory
-cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+# cd into parent directory of this file
+cd "$(dirname "${BASH_SOURCE[0]}")/.."

 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

 # run python-based benchmarks and upload the result to buildkite
-vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?

-vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-vllm bench serve \
+python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-48-95}
+NUMA_NODE=${NUMA_NODE:-1}
+
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
+
+# Setup cleanup
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3
+
+  # offline inference
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run basic model test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pip install -r vllm/requirements/test.txt
+    pip install -r vllm/requirements/cpu.txt
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+
+  # Run compressed-tensor test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+
+  # Run AWQ test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/quantization/test_ipex_quant.py"
+
+  # Run chunked-prefill and prefix-cache test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v -k cpu_model \
+    tests/basic_correctness/test_chunked_prefill.py"  
+
+  # online serving
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    export VLLM_CPU_KVCACHE_SPACE=10 
+    export VLLM_CPU_OMP_THREADS_BIND=$1
+    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
+    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+    python3 benchmarks/benchmark_serving.py \
+      --backend vllm \
+      --dataset-name random \
+      --model facebook/opt-125m \
+      --num-prompts 20 \
+      --endpoint /v1/completions \
+      --tokenizer facebook/opt-125m"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/lora/test_qwen2vl.py"
+}
+
+# All CPU tests are expected to finish in less than 40 mins.
+export -f cpu_tests
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# This script builds the GH200 docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
+python3 use_existing_torch.py
+
+# Try building the docker image
+DOCKER_BUILDKIT=1 docker build . \
+  --target vllm-openai \
+  --platform "linux/arm64" \
+  -t gh200-test \
+  --build-arg max_jobs=66 \
+  --build-arg nvcc_threads=2 \
+  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg vllm_fa_cmake_gpu_arches="90-real"
+
+# Setup cleanup
+remove_docker_container() { docker rm -f gh200-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+'
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jee Jee Li	966f933ee1	[Bugfix] Fix LoRA extra vocab size (#15047 ) Some checks failed Create Release / Create Release (push) Has been cancelled Details Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>	2025-03-18 10:51:10 -07:00
Isotr0py	1a504aff6c	[Bugfix] Fix broken CPU quantization due to triton import (#15038 ) Signed-off-by: Isotr0py <2037008807@qq.com>	2025-03-18 10:51:10 -07:00
yury-tokpanov	01ca85bbd8	[MODEL] Add support for Zamba2 models (#13185 ) Signed-off-by: Yury Tokpanov <yury@zyphra.com> Signed-off-by: Quentin Anthony <qganthony@yahoo.com> Co-authored-by: Quentin Anthony <qganthony@yahoo.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>	2025-03-18 10:51:10 -07:00
Simon Mo	d82b9487ea	[Bugfix] Register serializers for V0 MQ Engine (#15009 ) Signed-off-by: simon-mo <simon.mo@hey.com>	2025-03-18 10:51:10 -07:00
Cyrus Leung	be13281d4b	[Bugfix] Loosen type check to avoid errors in V1 (#15021 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-18 10:51:10 -07:00
hoshi-hiyouga	54e084f7fb	[Bugfix] torchrun compatibility (#14899 ) Signed-off-by: hiyouga <hiyouga@buaa.edu.cn> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: youkaichao <youkaichao@gmail.com>	2025-03-18 10:51:10 -07:00
Varun Sundar Rabindranath	9e8f089d08	[Kernels] LoRA - Retire SGMV and BGMV Kernels (#14685 ) Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com> Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>	2025-03-18 10:51:10 -07:00
Robert Shaw	16e9064f84	[V1] Guard Against Main Thread Usage (#14972 ) Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>	2025-03-17 13:23:17 -07:00
Roger Wang	5ac1a8e6e4	[Bugfix] Fix interface for Olmo2 on V1 (#14976 ) Signed-off-by: Roger Wang <ywang@roblox.com>	2025-03-17 11:41:43 -07:00
				`@@ -1 +0,0 @@`
				`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml`