Compare commits

..

4 Commits

Author SHA1 Message Date
Lucas Wilkinson
275de34170 [BugFix] Fix false assertion with spec-decode=[2,4,..] and TP>2 (#29036)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
(cherry picked from commit 8f4f77a727)
2025-11-19 14:11:21 -08:00
Julien Denize
fa3ffb4365 [BugFix] Ray with multiple nodes (#28873)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
(cherry picked from commit cdeec2e606)
2025-11-19 14:11:08 -08:00
Lucas Wilkinson
6d5974369c [BugFix] Fix async-scheduling + FlashAttn MLA (#28990)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
(cherry picked from commit 48fc8b1e59)
2025-11-19 14:10:50 -08:00
Johnny
0ce9990d2c [NVIDIA] Guard SM100 CUTLASS MoE macro to SM100 builds v2 (#28938)
Signed-off-by: johnnynunez <johnnynuca14@gmail.com>
Signed-off-by: Johnny <johnnynuca14@gmail.com>
(cherry picked from commit 49ef847aa8)
2025-11-19 14:10:37 -08:00
3339 changed files with 144946 additions and 465970 deletions

View File

@@ -1,25 +0,0 @@
# Settings for the "vllm_ci" pipeline.
name: vllm_ci
# Directories holding job definitions (all under .buildkite/).
job_dirs:
  - ".buildkite/image_build"
  - ".buildkite/test_areas"
  - ".buildkite/hardware_tests"
# NOTE(review): presumably a change matching any of these paths runs every
# job - confirm against the pipeline generator.
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/cuda.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"
# NOTE(review): presumably exceptions to run_all_patterns - confirm.
run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
# Image repository names, keyed by build kind.
repositories:
  main: "vllm-ci-postmerge-repo"
  premerge: "vllm-ci-test-repo"

View File

@@ -1,23 +0,0 @@
# Settings for the "vllm_intel_ci" pipeline.
name: vllm_intel_ci
# Directory holding the Intel job definitions.
job_dirs:
  - ".buildkite/intel_jobs"
# NOTE(review): presumably a change matching any of these paths runs every
# job - confirm against the pipeline generator.
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/xpu.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"
# NOTE(review): presumably exceptions to run_all_patterns - confirm.
run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
# Both build kinds use the same repository in this pipeline.
repositories:
  main: "vllm-ci-test-repo"
  premerge: "vllm-ci-test-repo"

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
# HTML page listing the x86_64 and aarch64 wheels; href values are the
# escaped file names, link texts are the plain file names.
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""


def render_index(filename):
    """Return the index page HTML for a wheel file name.

    The name of the wheel for the other architecture is derived from
    ``filename`` by swapping the architecture and manylinux tags.

    Args:
        filename: Base name of an x86_64 or aarch64 wheel.

    Returns:
        The rendered HTML as a string.

    Raises:
        ValueError: If ``filename`` contains neither "x86_64" nor "aarch64".
    """
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    return template.format(
        x86_wheel=x86_wheel,
        x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
        arm_wheel=arm_wheel,
        arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
    )


def main():
    """Parse ``--wheel`` and write ``index.html`` in the current directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--wheel", help="The wheel path.", required=True)
    args = parser.parse_args()
    # Render before opening the file so an unsupported wheel name raises
    # without creating or truncating index.html.
    html = render_index(os.path.basename(args.wheel))
    with open("index.html", "w") as f:
        f.write(html)
    print(f"Generated index.html for {args.wheel}")


if __name__ == "__main__":
    main()

View File

@@ -1,30 +0,0 @@
# Step group that builds the ROCm test image and pushes it as
# rocm/vllm-ci:<commit>.
group: Hardware - AMD Build
steps:
  - label: "AMD: :docker: build image"
    key: image-build-amd
    depends_on: []
    device: amd_cpu
    no_plugin: true
    commands:
      # Folded scalar: the lines below are joined with spaces into one
      # `docker build` command.
      - >
        docker build
        --build-arg max_jobs=16
        --build-arg REMOTE_VLLM=1
        --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
        --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
        --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
        -f docker/Dockerfile.rocm
        --target test
        --no-cache
        --progress plain .
      - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 1
        - exit_status: -10  # Agent was lost
          limit: 1
        - exit_status: 1  # Machine occasionally fail
          limit: 1

View File

@@ -1,10 +0,0 @@
# Step group with the Ascend NPU hardware test.
group: Hardware
# Explicit null (same value the original `~` parsed to).
depends_on: null
steps:
  - label: "Ascend NPU Test"
    # A failure of this step does not fail the build.
    soft_fail: true
    timeout_in_minutes: 20
    no_plugin: true
    device: ascend_npu
    commands:
      - bash .buildkite/scripts/hardware_ci/run-npu-test.sh

View File

@@ -1,108 +0,0 @@
# Step group with the CPU test jobs. Every Intel step passes a time budget
# and a quoted list of commands to run-cpu-test.sh.
group: CPU
depends_on: []
steps:
  - label: CPU-Kernel Tests
    depends_on: []
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/
      - cmake/cpu_extension.cmake
      - CMakeLists.txt
      - vllm/_custom_ops.py
      - tests/kernels/attention/test_cpu_attn.py
      - tests/kernels/moe/test_cpu_fused_moe.py
      - tests/kernels/test_onednn.py
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
        pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
        pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
        pytest -x -v -s tests/kernels/test_onednn.py"

  - label: CPU-Compatibility Tests
    depends_on: []
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - cmake/cpu_extension.cmake
      - setup.py
      - vllm/platforms/cpu.py
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
        bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"

  - label: CPU-Language Generation and Pooling Model Tests
    depends_on: []
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/
      - vllm/
      - tests/models/language/generation/
      - tests/models/language/pooling/
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
        pytest -x -v -s tests/models/language/generation -m cpu_model
        pytest -x -v -s tests/models/language/pooling -m cpu_model"

  - label: CPU-Quantization Model Tests
    depends_on: []
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/
      - vllm/model_executor/layers/quantization/cpu_wna16.py
      - vllm/model_executor/layers/quantization/gptq_marlin.py
      - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
      - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
      - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
      - tests/quantization/test_compressed_tensors.py
      - tests/quantization/test_cpu_wna16.py
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
        pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
        pytest -x -v -s tests/quantization/test_cpu_wna16.py"

  - label: CPU-Distributed Tests
    depends_on: []
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/shm.cpp
      - vllm/v1/worker/cpu_worker.py
      - vllm/v1/worker/gpu_worker.py
      - vllm/v1/worker/cpu_model_runner.py
      - vllm/v1/worker/gpu_model_runner.py
      - vllm/platforms/cpu.py
      - vllm/distributed/parallel_state.py
      - vllm/distributed/device_communicators/cpu_communicator.py
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
        bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"

  # Sharded step: each parallel job passes its own index and the job count
  # to pytest through --shard-id / --num-shards.
  - label: CPU-Multi-Modal Model Tests %N
    depends_on: []
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      # - vllm/
      - vllm/model_executor/layers/rotary_embedding
      - tests/models/multimodal/generation/
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
        pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
    parallelism: 2

  - label: "Arm CPU Test"
    depends_on: []
    soft_fail: false
    device: arm_cpu
    no_plugin: true
    commands:
      - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -1,10 +0,0 @@
# Step group with the GH200 hardware test.
group: Hardware
steps:
  - label: "GH200 Test"
    # A failure of this step does not fail the build.
    soft_fail: true
    device: gh200
    no_plugin: true
    optional: true
    commands:
      # Log the visible GPUs before running the test script.
      - nvidia-smi
      - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh

View File

@@ -1,17 +0,0 @@
# Step group with the Intel HPU and Intel GPU hardware tests; both steps
# are soft-fail.
group: Hardware
# Explicit null (same value the original `~` parsed to).
depends_on: null
steps:
  - label: "Intel HPU Test"
    soft_fail: true
    device: intel_hpu
    no_plugin: true
    commands:
      - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh

  - label: "Intel GPU Test"
    depends_on: []
    soft_fail: true
    device: intel_gpu
    no_plugin: true
    commands:
      - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh

View File

@@ -1,255 +0,0 @@
#!/bin/bash
set -euo pipefail
# Replace invalid characters in a Docker image tag and truncate to 128 chars.
# Every character outside [a-zA-Z0-9._-] becomes "_".
# Arguments:
#   $1 - raw tag text (for example a branch name)
# Outputs:
#   the sanitized tag on stdout
clean_docker_tag() {
  local input="$1"
  # printf, unlike echo, never interprets the value as an option (-n, -e, -E),
  # so such inputs are passed through instead of being swallowed.
  printf '%s\n' "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
# Print the expected command-line arguments and terminate with status 1.
print_usage_and_exit() {
  printf 'Usage: %s <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]\n' "$0"
  exit 1
}
# Print debugging details about the build host: EC2 metadata (AMI, instance
# type, instance id, availability zone) when the metadata service answers,
# and whether the /etc/vllm-ami-info marker file is present.
# Purely informational; a metadata lookup that fails is printed as "unknown".
print_instance_info() {
  local imds="http://169.254.169.254/latest"
  # --fail turns HTTP errors into a non-zero exit so the fallbacks below
  # trigger; the timeouts keep the build from stalling when the metadata
  # address does not answer.
  local -a curl_opts=(-s --fail --connect-timeout 2 --max-time 5)
  local token ami_id instance_type instance_id az

  echo ""
  echo "=== Debug: Instance Information ==="
  # Get IMDSv2 token
  if token=$(curl "${curl_opts[@]}" -X PUT "${imds}/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
    ami_id=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
      "${imds}/meta-data/ami-id" 2>/dev/null || echo "unknown")
    instance_type=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
      "${imds}/meta-data/instance-type" 2>/dev/null || echo "unknown")
    instance_id=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
      "${imds}/meta-data/instance-id" 2>/dev/null || echo "unknown")
    az=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
      "${imds}/meta-data/placement/availability-zone" 2>/dev/null || echo "unknown")
    echo "AMI ID: ${ami_id}"
    echo "Instance Type: ${instance_type}"
    echo "Instance ID: ${instance_id}"
    echo "AZ: ${az}"
  else
    echo "Not running on EC2 or IMDS not available"
  fi

  # Check for warm cache AMI (marker file baked into custom AMI)
  if [[ -f /etc/vllm-ami-info ]]; then
    echo "Cache: warm (custom vLLM AMI)"
    cat /etc/vllm-ami-info
  else
    echo "Cache: cold (standard AMI)"
  fi
  echo "==================================="
  echo ""
}
# Select (or create) the buildx builder used for the bake, bootstrap it, and
# print which builder is active.
# Reads: BUILDKIT_SOCKET, BUILDER_NAME.
# Preference order: a local buildkitd socket (remote driver), then an existing
# builder named ${BUILDER_NAME}, then a new docker-container builder.
setup_buildx_builder() {
  echo "--- :buildkite: Setting up buildx builder"

  if [[ -S "${BUILDKIT_SOCKET}" ]]; then
    # Custom AMI with standalone buildkitd - use remote driver for warm cache
    echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
    echo "Using remote driver to connect to buildkitd (warm cache available)"
    if ! docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
      echo "Creating baked-vllm-builder with remote driver"
      docker buildx create \
        --name baked-vllm-builder \
        --driver remote \
        --use \
        "unix://${BUILDKIT_SOCKET}"
    else
      echo "Using existing baked-vllm-builder"
      docker buildx use baked-vllm-builder
    fi
  elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
    # Existing builder available
    echo "Using existing builder: ${BUILDER_NAME}"
    docker buildx use "${BUILDER_NAME}"
  else
    # No local buildkitd, no existing builder - create new docker-container builder
    echo "No local buildkitd found, using docker-container driver"
    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
  fi

  # Every path above ends by bootstrapping whichever builder is now selected.
  docker buildx inspect --bootstrap

  # builder info
  echo "Active builder:"
  docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
# Exit the whole script with status 0 when ${IMAGE_TAG} already has a manifest
# in the registry; otherwise return so the build can proceed.
# Does nothing when IMAGE_TAG is unset or empty.
check_and_skip_if_image_exists() {
  [[ -n "${IMAGE_TAG:-}" ]] || return 0

  echo "--- :mag: Checking if image exists"
  if ! docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
    echo "Image not found, proceeding with build"
    return 0
  fi

  echo "Image already exists: ${IMAGE_TAG}"
  echo "Skipping build"
  exit 0
}
# Log docker in to two registries: the public ECR registry given by $REGISTRY
# and the private ECR registry that hosts the build caches.
ecr_login() {
  local region="us-east-1"
  local private_registry="936637512419.dkr.ecr.${region}.amazonaws.com"

  aws ecr-public get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "$REGISTRY"
  aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${private_registry}"
}
# Resolve and export the cache references used by the bake:
# CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN.
# Reads: BUILDKITE_PULL_REQUEST, BUILDKITE_BRANCH,
#        BUILDKITE_PULL_REQUEST_BASE_BRANCH (pull-request builds only).
#   - branch build on main:   everything points at the post-merge cache
#   - other branch build:     per-branch tag in the test cache
#   - pull-request build:     per-PR tag, plus the base branch's cache
prepare_cache_tags() {
  TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
  MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
  CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"

  if [[ "$BUILDKITE_PULL_REQUEST" != "false" ]]; then
    CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
    CACHE_FROM="${CACHE_TO}"
    case "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" in
      main)
        CACHE_FROM_BASE_BRANCH="${CACHE_FROM_MAIN}"
        ;;
      *)
        CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")"
        ;;
    esac
  else
    case "$BUILDKITE_BRANCH" in
      main)
        CACHE_TO="${CACHE_FROM_MAIN}"
        ;;
      *)
        CACHE_TO="${TEST_CACHE_ECR}:$(clean_docker_tag "$BUILDKITE_BRANCH")"
        ;;
    esac
    # Branch builds read from and write to the same reference.
    CACHE_FROM="${CACHE_TO}"
    CACHE_FROM_BASE_BRANCH="${CACHE_TO}"
  fi

  export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
# Make sure PARENT_COMMIT is set for the cache fallback.
# A value supplied by the caller is kept as is. Otherwise the parent of HEAD
# is looked up with git and exported; when it cannot be resolved,
# PARENT_COMMIT is left empty and a note is printed.
resolve_parent_commit() {
  if [[ -n "${PARENT_COMMIT:-}" ]]; then
    echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
    return 0
  fi

  # --verify --quiet prints nothing when HEAD~1 cannot be resolved. Plain
  # `git rev-parse HEAD~1` echoes the literal "HEAD~1" to stdout on failure,
  # which would then be exported as if it were a commit.
  PARENT_COMMIT=$(git rev-parse --verify --quiet HEAD~1 2>/dev/null || echo "")
  if [[ -z "${PARENT_COMMIT}" ]]; then
    echo "Could not determine parent commit (may be first commit in repo)"
    return 0
  fi

  echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
  export PARENT_COMMIT
}
# Print the resolved bake configuration for ${TARGET}, save it to a JSON file
# and upload that file as a Buildkite artifact.
# Reads: VLLM_BAKE_FILE_PATH, CI_HCL_PATH, TARGET, BUILDKITE_BUILD_NUMBER.
# Sets:  BAKE_CONFIG_FILE (path of the saved JSON).
print_bake_config() {
  echo "--- :page_facing_up: Resolved bake configuration"

  # Write to a temp directory to avoid polluting the repo root (which is the
  # Docker build context). Files left in the repo root get COPY'd into the
  # image and can cause duplicate artifact uploads from downstream steps.
  local bake_tmp bake_name
  bake_tmp="$(mktemp -d)"
  bake_name="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
  BAKE_CONFIG_FILE="${bake_tmp}/${bake_name}"

  # A failure to print the configuration is tolerated (|| true).
  docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
  echo "Saved bake config to ${BAKE_CONFIG_FILE}"

  echo "--- :arrow_down: Uploading bake config to Buildkite"
  # Upload from inside the temp directory so the artifact is stored under its
  # bare file name.
  (cd "${bake_tmp}" && buildkite-agent artifact upload "${bake_name}")
}
#################################
# Main Script #
#################################
# Flow: log host details, validate and read the arguments, resolve cache tags,
# log in to the registries, skip if the image already exists, fetch ci.hcl,
# select a buildx builder, then bake ${TARGET}.
# Called before the argument check, so the host details are logged even when
# the usage message is printed.
print_instance_info
if [[ $# -lt 5 ]]; then
print_usage_and_exit
fi
# input args
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
IMAGE_TAG=$5
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
# build config
TARGET="test-ci"
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
prepare_cache_tags
ecr_login
# Environment info (for docs and human readers)
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
# NOTE(review): VLLM_CI_BRANCH is not read by the statements in this section;
# the ci.hcl source is chosen through CI_HCL_URL above - confirm and update.
#
# Build configuration (exported as environment variables for bake):
export BUILDKITE_COMMIT
# PARENT_COMMIT has a value here only if the caller provided one;
# resolve_parent_commit runs further down and it is exported again after that.
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
# Exits the script with status 0 here when IMAGE_TAG already exists.
check_and_skip_if_image_exists
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
echo "Make sure you're running from the vLLM repository root"
exit 1
fi
echo "--- :arrow_down: Downloading ci.hcl"
# -f makes curl exit non-zero on an HTTP error response.
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
echo "Downloaded to ${CI_HCL_PATH}"
# Second guard in case the download reported success without leaving a file.
if [[ ! -f "${CI_HCL_PATH}" ]]; then
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
exit 1
fi
setup_buildx_builder
resolve_parent_commit
export PARENT_COMMIT
print_bake_config
echo "--- :docker: Building ${TARGET}"
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
echo "--- :white_check_mark: Build complete"

View File

@@ -1,58 +0,0 @@
# Step group with the image-build steps (default, CPU, HPU and CPU arm64).
# Every step retries automatically on exit status -1 and -10.
# NOTE(review): the group name "Abuild" looks deliberate - confirm before
# renaming.
group: Abuild
steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
    timeout_in_minutes: 600
    commands:
      # IMAGE_TAG_LATEST is passed only for builds of the main branch.
      - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build CPU image"
    key: image-build-cpu
    depends_on: []
    commands:
      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build HPU image"
    soft_fail: true
    depends_on: []
    key: image-build-hpu
    commands:
      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
    optional: true
    commands:
      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

View File

@@ -1,34 +0,0 @@
#!/bin/bash
# Build the CPU test image from docker/Dockerfile.cpu (VLLM_CPU_X86=true) and
# push it as <registry>/<repo>:<commit>-cpu, unless that tag already exists.
set -e

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# The exit status of `docker manifest inspect` decides; its output (the
# manifest, or the "not found" error) is discarded instead of being captured
# or written to the build log.
if docker manifest inspect "${IMAGE}" &> /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
  --build-arg VLLM_CPU_X86=true \
  --tag "${IMAGE}" \
  --target vllm-test \
  --progress plain .

# push
docker push "${IMAGE}"

View File

@@ -1,33 +0,0 @@
#!/bin/bash
# Build the CPU test image from docker/Dockerfile.cpu and push it as
# <registry>/<repo>:<commit>-arm64-cpu, unless that tag already exists.
set -e

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-arm64-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# The exit status of `docker manifest inspect` decides; its output (the
# manifest, or the "not found" error) is discarded instead of being captured
# or written to the build log.
if docker manifest inspect "${IMAGE}" &> /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
  --tag "${IMAGE}" \
  --target vllm-test \
  --progress plain .

# push
docker push "${IMAGE}"

View File

@@ -1,34 +0,0 @@
#!/bin/bash
# Build the HPU test image and push it as <registry>/<repo>:<commit>-hpu,
# unless that tag already exists. The build context is the vllm-gaudi git
# repository, not the local checkout.
set -e

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# The exit status of `docker manifest inspect` decides; its output (the
# manifest, or the "not found" error) is discarded instead of being captured
# or written to the build log.
if docker manifest inspect "${IMAGE}" &> /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
  --tag "${IMAGE}" \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git

# push
docker push "${IMAGE}"

View File

@@ -1,34 +0,0 @@
#!/bin/bash
# Build the XPU test image from docker/Dockerfile.xpu and push it as
# <registry>/<repo>:<commit>-xpu, unless that tag already exists.
set -e

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-xpu"

# authenticate with AWS ECR (public registry, then the private one)
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

# skip build if image already exists
if docker manifest inspect "${IMAGE}" &> /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build \
  --file docker/Dockerfile.xpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
  --tag "${IMAGE}" \
  --progress plain .

# push
docker push "${IMAGE}"

View File

@@ -1,64 +0,0 @@
# Step group for Intel XPU: one image-build step and two test steps that
# depend on it.
group: Intel
steps:
  - label: ":docker: Build XPU image"
    soft_fail: true
    depends_on: []
    key: image-build-xpu
    commands:
      - bash -lc '.buildkite/image_build/image_build_xpu.sh "public.ecr.aws/q9t5s3a7" "vllm-ci-test-repo" "$BUILDKITE_COMMIT"'
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: "XPU example Test"
    depends_on:
      - image-build-xpu
    timeout_in_minutes: 30
    device: intel_gpu
    no_plugin: true
    env:
      REGISTRY: "public.ecr.aws/q9t5s3a7"
      REPO: "vllm-ci-test-repo"
    source_file_dependencies:
      - vllm/
      - .buildkite/intel_jobs/test-intel.yaml
    commands:
      # Folded scalar: the lines are joined with spaces, so the quoted
      # '&&' chain reaches run-intel-test.sh as a single argument.
      - >-
        bash .buildkite/scripts/hardware_ci/run-intel-test.sh
        'pip install tblib==3.1.0 &&
        python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager &&
        python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE &&
        python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp &&
        python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN &&
        python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 &&
        python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192 &&
        python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 &&
        python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel'

  - label: "XPU V1 test"
    depends_on:
      - image-build-xpu
    timeout_in_minutes: 30
    device: intel_gpu
    no_plugin: true
    env:
      REGISTRY: "public.ecr.aws/q9t5s3a7"
      REPO: "vllm-ci-test-repo"
    source_file_dependencies:
      - vllm/
      - .buildkite/intel_jobs/test-intel.yaml
    commands:
      # Folded scalar: the lines are joined with spaces, so the quoted
      # '&&' chain reaches run-intel-test.sh as a single argument.
      - >-
        bash .buildkite/scripts/hardware_ci/run-intel-test.sh
        'cd tests &&
        pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py &&
        pytest -v -s v1/engine --ignore=v1/engine/test_output_processor.py &&
        pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py &&
        pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py &&
        pytest -v -s v1/structured_output &&
        pytest -v -s v1/test_serial_utils.py &&
        pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
        pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py'

View File

@@ -8,4 +8,3 @@ tasks:
value: 0.80 value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5 num_fewshot: 5
rtol: 0.05

View File

@@ -1,15 +0,0 @@
# lm-eval correctness config: expected gsm8k scores for the BF16 model.
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tasks:
  - name: "gsm8k"
    # Each metric's value is the ground truth the measured score is
    # compared against.
    metrics:
      - name: "exact_match,strict-match"
        value: 0.695
      - name: "exact_match,flexible-extract"
        value: 0.447
limit: 1319
num_fewshot: 5
max_model_len: 262144
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -1,19 +0,0 @@
# lm-eval correctness config: expected gsm8k scores for the FP8 model.
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
tasks:
  - name: "gsm8k"
    # Each metric's value is the ground truth the measured score is
    # compared against.
    metrics:
      - name: "exact_match,strict-match"
        value: 0.7142
      - name: "exact_match,flexible-extract"
        value: 0.4579
# Environment variables for the evaluation run; values are kept as strings.
env_vars:
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
limit: 1319
num_fewshot: 5
max_model_len: 262144
kv_cache_dtype: fp8
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
  - name: "gsm8k"
    # Each metric's value is the ground truth the measured score is
    # compared against.
    metrics:
      - name: "exact_match,strict-match"
        value: 0.6353
      - name: "exact_match,flexible-extract"
        value: 0.637
# Explicit nulls: no sample limit and no few-shot count are set here.
limit: null
num_fewshot: null

View File

@@ -1,2 +1 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml Qwen3-235B-A22B-Instruct-2507-FP8.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml

View File

@@ -1 +0,0 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml

View File

@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml DeepSeek-V2-Lite-Chat.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml

View File

@@ -1,5 +0,0 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm. # We can use this script to compute baseline accuracy on chartqa for vllm.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11" # pip install lm-eval==0.4.9
usage() { usage() {
echo`` echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \ --tasks chartqa \
--batch_size auto \ --batch_size auto \
--apply_chat_template \ --apply_chat_template \
--limit "$LIMIT" --limit $LIMIT

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11" # pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() { usage() {
echo`` echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11" # pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() { usage() {
echo`` echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11" # pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() { usage() {
echo`` echo``
@@ -20,11 +20,14 @@ usage() {
echo echo
} }
while getopts "m:l:f:t:" OPT; do while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in case ${OPT} in
m ) m )
MODEL="$OPTARG" MODEL="$OPTARG"
;; ;;
b )
BATCH_SIZE="$OPTARG"
;;
l ) l )
LIMIT="$OPTARG" LIMIT="$OPTARG"
;; ;;

View File

@@ -9,41 +9,11 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1 --tp-size=1
""" """
import os
from contextlib import contextmanager
import lm_eval import lm_eval
import numpy as np
import yaml import yaml
from vllm.platforms import current_platform RTOL = 0.08
DEFAULT_RTOL = 0.08
@contextmanager
def scoped_env_vars(new_env: dict[str, str]):
if not new_env:
# Fast path: nothing to do
yield
return
old_values = {}
new_keys = []
try:
for key, value in new_env.items():
if key in os.environ:
old_values[key] = os.environ[key]
else:
new_keys.append(key)
os.environ[key] = str(value)
yield
finally:
# Restore / clean up
for key, value in old_values.items():
os.environ[key] = value
for key in new_keys:
os.environ.pop(key, None)
def launch_lm_eval(eval_config, tp_size): def launch_lm_eval(eval_config, tp_size):
@@ -61,31 +31,24 @@ def launch_lm_eval(eval_config, tp_size):
f"add_bos_token=true," f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code}," f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}," f"max_model_len={max_model_len},"
"allow_deprecated_quantization=True,"
) )
results = lm_eval.simple_evaluate(
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]: model=backend,
model_args += "attention_backend=TRITON_ATTN" model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
env_vars = eval_config.get("env_vars", None) num_fewshot=eval_config["num_fewshot"],
with scoped_env_vars(env_vars): limit=eval_config["limit"],
results = lm_eval.simple_evaluate( # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
model=backend, # text models. however, this is regressing measured strict-match for
model_args=model_args, # existing text models in CI, so only apply it for mm, or explicitly set
tasks=[task["name"] for task in eval_config["tasks"]], apply_chat_template=eval_config.get(
num_fewshot=eval_config["num_fewshot"], "apply_chat_template", backend == "vllm-vlm"
limit=eval_config["limit"], ),
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
# text models. however, this is regressing measured strict-match for # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
# existing text models in CI, so only apply it for mm, or explicitly set gen_kwargs=eval_config.get("gen_kwargs"),
apply_chat_template=eval_config.get( batch_size=batch_size,
"apply_chat_template", backend == "vllm-vlm" )
),
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs=eval_config.get("gen_kwargs"),
batch_size=batch_size,
)
return results return results
@@ -94,8 +57,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size) results = launch_lm_eval(eval_config, tp_size)
rtol = eval_config.get("rtol", DEFAULT_RTOL)
success = True success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
@@ -103,11 +64,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print( print(
f"{task['name']} | {metric['name']}: " f"{task['name']} | {metric['name']}: "
f"ground_truth={ground_truth:.3f} | " f"ground_truth={ground_truth} | measured={measured_value}"
f"measured={measured_value:.3f} | rtol={rtol}"
) )
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
assert success assert success

View File

@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
## Performance benchmark quick overview ## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models. **Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
**Benchmarking Duration**: about 1hr. **Benchmarking Duration**: about 1hr.
@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Runtime environment variables: Runtime environment variables:
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
@@ -34,9 +34,8 @@ Runtime environment variables:
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead. >
### Latency test ### Latency test
Here is an example of one test inside `latency-tests.json`: Here is an example of one test inside `latency-tests.json`:
@@ -83,6 +82,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -108,65 +108,6 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Default Parameters Field
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
<details>
<summary> An Example of default parameters field </summary>
```json
{
"defaults": {
"qps_list": [
"inf"
],
"server_environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
},
"server_parameters": {
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"block_size": 128,
"disable_log_stats": "",
"load_format": "dummy"
},
"client_parameters": {
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 200,
"ignore-eos": ""
}
},
"tests": [
{
"test_name": "serving_llama3B_tp2_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct"
}
},
{
"test_name": "serving_qwen3_tp4_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-14B",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "Qwen/Qwen3-14B"
}
}
]
}
```
</details>
### Visualizing the results ### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
@@ -175,6 +116,19 @@ If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file. The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
#### Performance Results Comparison The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide. Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
A comparison diagram will be generated below the table.
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />

View File

@@ -393,7 +393,7 @@ if __name__ == "__main__":
with open(results_folder / md_file, "w") as f: with open(results_folder / md_file, "w") as f:
results = read_markdown( results = read_markdown(
"../.buildkite/performance-benchmarks/" "../.buildkite/performance-benchmarks/"
"performance-benchmarks-descriptions.md" + "performance-benchmarks-descriptions.md"
) )
results = results.format( results = results.format(
latency_tests_markdown_table=latency_md_table, latency_tests_markdown_table=latency_md_table,

View File

@@ -1,4 +1,6 @@
#!/bin/bash #!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory # This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/ # Benchmarking results will be available inside vllm/benchmarks/results/
@@ -7,26 +9,14 @@
set -x set -x
set -o pipefail set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() { check_gpus() {
if command -v nvidia-smi; then if command -v nvidia-smi; then
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true) declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true) declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true) declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi fi
if [[ $gpu_count -gt 0 ]]; then if [[ $gpu_count -gt 0 ]]; then
@@ -54,16 +44,12 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then if [[ $numa_count -gt 0 ]]; then
echo "NUMA found." echo "NUMA found."
echo "$numa_count" echo $numa_count
else else
echo "Need at least 1 NUMA to run benchmarking." echo "Need at least 1 NUMA to run benchmarking."
exit 1 exit 1
fi fi
if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then declare -g gpu_type="cpu"
declare -g gpu_type="arm64-cpu"
else
declare -g gpu_type="cpu"
fi
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
@@ -122,12 +108,12 @@ json2envs() {
} }
wait_for_server() { wait_for_server() {
local timeout_val="1200" # wait for vllm server to start
timeout "$timeout_val" bash -c ' # return 1 if vllm server crashes
until curl -sf http://localhost:8000/v1/models >/dev/null; do timeout 1200 bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1 sleep 1
done done' && return 0 || return 1
'
} }
kill_processes_launched_by_current_bash() { kill_processes_launched_by_current_bash() {
@@ -190,318 +176,19 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
# ------------------------------- run_latency_tests() {
# Adaptive concurrency helpers # run latency tests using `vllm bench latency` command
# ------------------------------- # $1: a json file specifying latency test cases
# Print the path of the result JSON file for one serving run.
#   $1: test name
#   $2: request rate (qps)
#   $3: max concurrency
# The path is rooted at $RESULTS_FOLDER and is written to stdout.
result_json_path_for_serving() {
  local name=$1 rate=$2 concurrency=$3
  local file_stem="${name}_qps_${rate}_concurrency_${concurrency}"

  echo "$RESULTS_FOLDER/${file_stem}.json"
}
extract_metric_ms() { local latency_test_file
local metric_name=$1 latency_test_file=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0 # Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
# Check one serving result file against the TTFT/TPOT SLA thresholds.
#   $1: path to a result JSON file
# Reads the thresholds from $SLA_TTFT_MS and $SLA_TPOT_MS.
# Exit status:
#   0  both metrics are within their SLA
#   1  at least one metric exceeds its SLA
#   2  no verdict: file missing, a metric missing, or the comparison itself
#      failed (e.g. a non-numeric metric or a malformed SLA value)
evaluate_sla_from_json() {
  local json_file=$1
  local ttft
  local tpot
  local pass

  [[ -f "$json_file" ]] || return 2

  ttft=$(extract_metric_ms ttft "$json_file")
  tpot=$(extract_metric_ms tpot "$json_file")
  [[ -n "$ttft" && -n "$tpot" ]] || return 2

  # --argjson rejects anything that is not valid JSON. Report such a failure
  # as "no verdict" so callers do not mistake it for a real SLA violation.
  pass=$(jq -n \
    --argjson ttft "$ttft" \
    --argjson tpot "$tpot" \
    --argjson sla_ttft "$SLA_TTFT_MS" \
    --argjson sla_tpot "$SLA_TPOT_MS" \
    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)') || return 2

  case "$pass" in
    true) return 0 ;;
    false) return 1 ;;
    *) return 2 ;;
  esac
}
# Write the adaptive-search summary for one (test, qps) pair as a JSON file.
#   $1: output file path
#   $2: test name
#   $3: qps (stored as a string)
#   $4: last passing concurrency among the static points ("" -> null)
#   $5: first failing concurrency among the static points ("" -> null)
#   $6: last passing concurrency after refinement ("" -> null)
#   $7: first failing concurrency after refinement ("" -> null)
# The SLA thresholds are taken from $SLA_TTFT_MS and $SLA_TPOT_MS.
write_adaptive_summary_json() {
  local out_path=$1
  local name=$2
  local rate=$3
  local static_pass=$4
  local static_fail=$5
  local final_pass=$6
  local final_fail=$7

  # Concurrency bounds arrive as strings; an empty string means "unknown" and
  # is emitted as null, anything else is converted to a number.
  local summary_filter='
    def num_or_null: if . == "" then null else tonumber end;
    {
      test_name: $test_name,
      qps: $qps,
      sla_ttft_ms: $sla_ttft,
      sla_tpot_ms: $sla_tpot,
      static_last_pass: ($static_last_pass | num_or_null),
      static_first_fail: ($static_first_fail | num_or_null),
      final_last_pass: ($final_last_pass | num_or_null),
      final_first_fail: ($final_first_fail | num_or_null)
    }'

  jq -n \
    --arg test_name "$name" \
    --arg qps "$rate" \
    --argjson sla_ttft "$SLA_TTFT_MS" \
    --argjson sla_tpot "$SLA_TPOT_MS" \
    --arg static_last_pass "$static_pass" \
    --arg static_first_fail "$static_fail" \
    --arg final_last_pass "$final_pass" \
    --arg final_first_fail "$final_fail" \
    "$summary_filter" > "$out_path"
}
# Run one `vllm bench serve` probe at a given concurrency and evaluate its SLA.
#   $1: test name
#   $2: qps (request rate)
#   $3: max concurrency to probe
#   $4: tensor parallel size (recorded as metadata)
#   $5: compilation_config.mode (recorded as metadata)
#   $6: optimization level (recorded as metadata)
#   $7: client arguments
#   $8: extra client arguments for a remote server
#   $9: server command (recorded in the .commands file)
# If the result file for this point already exists, the benchmark is not
# re-run and only the SLA check is performed.
# Exit status is that of evaluate_sla_from_json for the result file.
run_single_serving_probe() {
  local test_name=$1
  local qps=$2
  local max_concurrency=$3
  local tp=$4
  local compilation_config_mode=$5
  local optimization_level=$6
  local client_args_effective=$7
  local client_remote_args=$8
  local server_command=$9
  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
  local result_json
  local num_prompts_arg=""
  local client_command
  # Keep these local so a probe never clobbers same-named caller variables.
  local num_prompts
  local jq_output
  # Fall back to the defaults used by run_serving_tests when the bounds are
  # unset; an unset bound would otherwise produce an empty --num-prompts value.
  local min_num_prompts="${MIN_NUM_PROMPTS:-1}"
  local max_num_prompts="${MAX_NUM_PROMPTS:-1000000}"

  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")

  # Reuse an existing result instead of benchmarking the same point again.
  if [[ -f "$result_json" ]]; then
    evaluate_sla_from_json "$result_json"
    return $?
  fi

  # Scale the number of prompts with concurrency, clamped to [min, max].
  if [[ -n "${PROMPTS_PER_CONCURRENCY:-}" ]]; then
    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
    if (( num_prompts < min_num_prompts )); then num_prompts=$min_num_prompts; fi
    if (( num_prompts > max_num_prompts )); then num_prompts=$max_num_prompts; fi
    num_prompts_arg="--num-prompts $num_prompts"
  fi

  # Continuation lines stay unindented on purpose: inside double quotes any
  # leading whitespace would become part of the recorded command string.
  client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "

  echo "Adaptive probe: $client_command"
  if [[ "${DRY_RUN:-0}" != "1" ]]; then
    bash -c "$client_command"
  fi

  # Record the exact commands next to the result file.
  jq_output=$(jq -n \
    --arg server "$server_command" \
    --arg client "$client_command" \
    --arg gpu "$gpu_type" \
    '{
      server_command: $server,
      client_command: $client,
      gpu_type: $gpu,
      adaptive_search: true
    }')
  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

  evaluate_sla_from_json "$result_json"
}
# Refine the SLA concurrency boundary for one (test, qps) pair, starting from
# the results of the statically configured max-concurrency points.
#   $1: test name
#   $2: qps
#   $3: raw max-concurrency list (whitespace-separated, entries may be quoted)
#   $4: tensor parallel size
#   $5: compilation_config.mode
#   $6: optimization level
#   $7: client arguments
#   $8: extra client arguments for a remote server
#   $9: server command
# Additional probes are run via run_single_serving_probe, at most
# $ADAPTIVE_MAX_PROBES of them and never above $ADAPTIVE_MAX_CONCURRENCY.
# A summary is written to "<test>_qps_<qps>_sla_summary.json" in $RESULTS_FOLDER.
# Probe/evaluation status codes used below: 0 = SLA met, 1 = SLA violated,
# anything else = no verdict.
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
# Do nothing unless adaptive search is enabled, and never probe in dry-run mode.
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
# Normalize the static list: strip single quotes, keep only all-digit entries,
# then sort numerically and drop duplicates.
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
# Scan the static points in ascending order using their existing result files.
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
# A failure is only recorded once some smaller point has passed; that pair
# brackets the boundary and ends the scan. Earlier failures are skipped.
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
# Remember the spacing between the two most recently scanned points; it is the
# initial step for the upward expansion below (never less than 1).
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
# No static point passed: there is no lower bound to refine from, so only the
# summary is written. static_first_fail is still empty on this path.
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
# Case 1: the static points already bracket the boundary -> bisect between the
# last pass and the first fail until they are adjacent or the budget runs out.
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
# No verdict for this probe: stop refining and keep the current bounds.
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
# Case 2: no failure was seen after a pass -> expand upward from the largest
# static point, doubling the step after every passing probe.
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
# Clamp to the configured ceiling; stop if that leaves nothing new to probe.
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
# First failing point found: it becomes the upper bound for bisection.
high=$point
break
else
break
fi
done
# If the expansion found an upper bound, bisect between low and high. The probe
# budget is shared with the expansion phase above.
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
# high stays empty (null in the summary) when no failing point was found.
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
# $2: a json file specifying test cases
local test_type=$1
local test_file=$2
# Iterate over tests
jq -c '.[]' "$test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"." echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1 exit 1
fi fi
@@ -512,15 +199,15 @@ run_benchmark_tests() {
fi fi
# get arguments # get arguments
bench_params=$(echo "$params" | jq -r '.parameters') latency_params=$(echo "$params" | jq -r '.parameters')
bench_args=$(json2args "$bench_params") latency_args=$(json2args "$latency_params")
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables') latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
bench_envs=$(json2envs "$bench_environment_variables") latency_envs=$(json2envs "$latency_environment_variables")
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size') tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1') pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp)) world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -533,107 +220,108 @@ run_benchmark_tests() {
fi fi
fi fi
bench_command=" $bench_envs vllm bench $test_type \ latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$bench_args" $latency_args"
echo "Running test case $test_name" echo "Running test case $test_name"
echo "${test_type^} command: $bench_command" echo "Latency command: $latency_command"
# recording benchmarking command and GPU command # recoding benchmarking command ang GPU command
jq_output=$(jq -n \ jq_output=$(jq -n \
--arg command "$bench_command" \ --arg latency "$latency_command" \
--arg gpu "$gpu_type" \ --arg gpu "$gpu_type" \
--arg test_type "$test_type" \
'{ '{
($test_type + "_command"): $command, latency_command: $latency,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark # run the benchmark
eval "$bench_command" eval "$latency_command"
kill_gpu_processes kill_gpu_processes
done done
} }
run_latency_tests() { run_benchmark_tests "latency" "$1"; } run_throughput_tests() {
run_startup_tests() { run_benchmark_tests "startup" "$1"; } # run throughput tests using `vllm bench throughput`
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; } # $1: a json file specifying throughput test cases
merge_serving_tests_stream() { local throughput_test_file
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode. throughput_test_file=$1
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1" # Iterate over throughput tests
# shellcheck disable=SC2016 jq -c '.[]' "$throughput_test_file" | while read -r params; do
local merged=' # get the test name, and append the GPU type back to it.
if type == "array" then test_name=$(echo "$params" | jq -r '.test_name')
# Plain format: test cases array if [[ ! "$test_name" =~ ^throughput_ ]]; then
.[] echo "In throughput-test.json, test_name must start with \"throughput_\"."
elif (type == "object" and has("tests")) then exit 1
# merge the default parameters into each test cases fi
. as $root
| ($root.defaults // {}) as $d # if TEST_SELECTOR is set, only run the test cases that match the selector
| ($root.tests // [])[] if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
# default qps / max_concurrency from defaults if missing echo "Skip test case $test_name."
| .qps_list = (.qps_list // $d.qps_list) continue
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list) fi
# merge envs / params: test overrides defaults
| .server_environment_variables = # get arguments
(($d.server_environment_variables // {}) + (.server_environment_variables // {})) throughput_params=$(echo "$params" | jq -r '.parameters')
| .server_parameters = throughput_args=$(json2args "$throughput_params")
(($d.server_parameters // {}) + (.server_parameters // {})) throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
| .client_parameters = throughput_envs=$(json2envs "$throughput_environment_variables")
(($d.client_parameters // {}) + (.client_parameters // {}))
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else else
error("Unsupported serving test file format: must be array or object with .tests") if [[ $gpu_count -lt $tp ]]; then
end echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
' continue
fi
fi
jq -c "$merged" "$serving_test_file" | \ throughput_command=" $throughput_envs vllm bench throughput \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then --output-json $RESULTS_FOLDER/${test_name}.json \
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" ' $throughput_args"
select((($model|length)==0)
or ((.server_parameters.model // "") == $model) echo "Running test case $test_name"
or ((.client_parameters.model // "") == $model)) echo "Throughput command: $throughput_command"
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype)) # recoding benchmarking command ang GPU command
' jq_output=$(jq -n \
else --arg command "$throughput_command" \
cat --arg gpu "$gpu_type" \
fi '{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
} }
run_serving_tests() { run_serving_tests() {
# run serving tests using `vllm bench serve` command # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file local serving_test_file
serving_test_file=$1 serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast. # Iterate over serving tests
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then jq -c '.[]' "$serving_test_file" | while read -r params; do
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -647,71 +335,28 @@ run_serving_tests() {
continue continue
fi fi
# get client and server arguments (after merged the default parameters) # get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters') server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables') server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_envs=$(json2envs "$server_envs") server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list" echo "Running over qps list $qps_list"
# max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts') num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]" max_concurrency_list="[$num_prompts]"
fi fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list" echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough resources to run the test # check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1') pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp)) world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -725,13 +370,14 @@ run_serving_tests() {
fi fi
# check if server model and client model is aligned # check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name." echo "Server model and client model must be the same. Skip testcase $test_name."
continue continue
fi fi
server_command="$server_envs vllm serve $server_model \ server_command="$server_envs vllm serve \
$server_args" $server_args"
# run the server # run the server
@@ -739,7 +385,7 @@ run_serving_tests() {
echo "Server command: $server_command" echo "Server command: $server_command"
# support remote vllm server # support remote vllm server
client_remote_args="" client_remote_args=""
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" & bash -c "$server_command" &
server_pid=$! server_pid=$!
# wait until the server is alive # wait until the server is alive
@@ -750,9 +396,6 @@ run_serving_tests() {
echo "" echo ""
echo "vLLM failed to start within the timeout period." echo "vLLM failed to start within the timeout period."
fi fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then if [[ ${REMOTE_PORT} ]]; then
@@ -762,48 +405,34 @@ run_serving_tests() {
fi fi
fi fi
# save the compilation mode and optimization level on the serving results
# whenever they are set
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
# iterate over different QPS # iterate over different QPS
for qps in $qps_list; do for qps in $qps_list; do
# remove the surrounding single quote from qps # remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf" qps="inf"
echo "now qps is $qps"
fi fi
# iterate over different max_concurrency # iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}" new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name" echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts. # pass the tensor parallel size to the client so that it can be displayed
num_prompts_arg="" # on the benchmark dashboard
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \ client_command="vllm bench serve \
--save-result \ --save-result \
--result-dir $RESULTS_FOLDER \ --result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
--request-rate $qps \ --request-rate $qps \
--max-concurrency $max_concurrency \ --max-concurrency $max_concurrency \
$num_prompts_arg \ --metadata "tensor_parallel_size=$tp" \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \ $client_args $client_remote_args "
$client_args_effective $client_remote_args "
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then bash -c "$client_command"
bash -c "$client_command"
fi
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
@@ -818,39 +447,25 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done done
# clean up # clean up
if [[ "${DRY_RUN:-0}" != "1" ]]; then kill -9 $server_pid
kill -9 "$server_pid" kill_gpu_processes
kill_gpu_processes
fi
done done
} }
main() { main() {
local ARCH local ARCH
ARCH='' ARCH=''
if [[ "$ON_CPU" == "1" ]]; then if [ "$ON_CPU" == "1" ];then
check_cpus check_cpus
ARCH="-$gpu_type" ARCH='-cpu'
else else
check_gpus check_gpus
ARCH="$arch_suffix" ARCH="$arch_suffix"
fi fi
check_hf_token
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -871,24 +486,17 @@ main() {
# dump vllm info via vllm collect-env # dump vllm info via vllm collect-env
env_output=$(vllm collect-env) env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking # benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $? run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
# postprocess benchmarking results # postprocess benchmarking results
pip install tabulate pandas pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite upload_to_buildkite
} }

View File

@@ -1,26 +0,0 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"num_iters_warmup": 5,
"num_iters": 15
}
}
]

View File

@@ -51,56 +51,5 @@
"max-model-len": 256, "max-model-len": 256,
"async-scheduling": "" "async-scheduling": ""
} }
},
{
"test_name": "latency_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"max-model-len": 2048,
"dtype": "bfloat16"
}
},
{
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"max-model-len": 512,
"max-num-seqs": 128,
"async-scheduling": "",
"gpu-memory-utilization": 0.95,
"enable_expert_parallel": ""
}
},
{
"test_name": "latency_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"max-model-len": 2048,
"max-num-seqs": 128,
"dtype": "bfloat16",
"async-scheduling": ""
}
} }
] ]

View File

@@ -1,130 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
12,
16,
24,
32,
64,
128,
200
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
}
]
}

View File

@@ -1,37 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,41 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -0,0 +1,610 @@
[
{
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

File diff suppressed because it is too large Load Diff

View File

@@ -1,355 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -1,142 +1,276 @@
{ [
"defaults": { {
"qps_list": [ "test_name": "serving_llama8B_tp1_sharegpt",
"inf" "qps_list": [1, 4, 16, "inf"],
], "max_concurrency_list": [32],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": {
"server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000,
"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40
"VLLM_CPU_KVCACHE_SPACE": 40 },
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
"server_parameters": { {
"model": "meta-llama/Llama-3.1-8B-Instruct", "test_name": "serving_llama8B_tp2_sharegpt",
"tensor_parallel_size": 1, "qps_list": [1, 4, 16, "inf"],
"dtype": "bfloat16", "max_concurrency_list": [32],
"distributed_executor_backend": "mp", "server_environment_variables": {
"block_size": 128, "VLLM_RPC_TIMEOUT": 100000,
"trust_remote_code": "", "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"disable_log_stats": "", "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"max_num_batched_tokens": 2048, "VLLM_CPU_SGL_KERNEL": 1,
"max_num_seqs": 256 "VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
"client_parameters": { {
"model": "meta-llama/Llama-3.1-8B-Instruct", "test_name": "serving_llama8B_tp1_random_128_128",
"backend": "vllm", "qps_list": [1, 4, 16, "inf"],
"ignore-eos": "", "max_concurrency_list": [32],
"num_prompts": 200 "server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
} }
}, ]
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
}
]
}

View File

@@ -10,6 +10,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -36,6 +37,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -62,6 +64,7 @@
"server_parameters": { "server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2, "tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -75,83 +78,5 @@
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200 "num_prompts": 200
} }
},
{
"test_name": "serving_deepseek_r1",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
"max-num-seqs": 200,
"async-scheduling": "",
"dtype": "bfloat16"
},
"client_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"max-model-len": 2048,
"max-num-seqs": 128,
"async-scheduling": "",
"enable_expert_parallel": "",
"max-num-batched-tokens": 4096
},
"client_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_qwen3_8b",
"qps_list": [1, 4, 10, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"disable_log_stats": "",
"async-scheduling": ""
},
"client_parameters": {
"model": "Qwen/Qwen-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
} }
] ]

View File

@@ -5,6 +5,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -22,6 +23,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -39,6 +41,7 @@
"server_parameters": { "server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2, "tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -56,6 +59,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": { "speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct", "model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4, "num_speculative_tokens": 4,

View File

@@ -1,27 +0,0 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@@ -57,67 +57,5 @@
"max-num-seqs": 512, "max-num-seqs": 512,
"async-scheduling": "" "async-scheduling": ""
} }
},
{
"test_name": "throughput_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 384,
"async-scheduling": ""
}
},
{
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": "",
"enable_expert_parallel": ""
}
},
{
"test_name": "throughput_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"max-num-seqs": 512,
"backend": "vllm",
"async-scheduling": ""
}
} }
] ]

View File

@@ -1,694 +1,179 @@
steps: steps:
# aarch64 + CUDA builds
- label: "Build arm64 wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build arm64 CPU wheel"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# Build release images (12.9)
- label: "Build release image (x86)"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
- label: "Create multi-arch manifest"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- input: "Provide Release version here" - input: "Provide Release version here"
id: input-release-version id: input-release-version
fields: fields:
- text: "What is the release version?" - text: "What is the release version?"
key: release-version key: release-version
- group: "Build Python wheels" - block: "Build CPU release image"
key: "build-wheels" key: block-cpu-release-image-build
steps:
- label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_release
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_release
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CPU"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~
id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~
id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CPU"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Generate and upload wheel indices"
depends_on: "build-wheels"
allow_dependency_failure: true
agents:
queue: cpu_queue_release
commands:
- "bash .buildkite/scripts/generate-and-upload-nightly-index.sh"
- group: "Build release Docker images"
key: "build-release-images"
steps:
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: ~
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Build release image - aarch64 - CUDA 13.0"
depends_on: ~
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-x86-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-arm64-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
- label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-x86-cuda-13-0-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-arm64-cuda-13-0-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
- block: "Build release image for x86_64 CPU"
key: block-cpu-release-image-build
depends_on: ~
- label: "Build release image - x86_64 - CPU"
depends_on:
- block-cpu-release-image-build
- input-release-version
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image for arm64 CPU"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build release image - arm64 - CPU"
depends_on:
- block-arm64-cpu-release-image-build
- input-release-version
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- group: "Publish release images"
key: "publish-release-images"
steps:
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
depends_on:
- build-release-image-x86-ubuntu2404
- build-release-image-arm64-ubuntu2404
id: create-multi-arch-manifest-ubuntu2404
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
depends_on:
- build-release-image-x86-cuda-13-0-ubuntu2404
- build-release-image-arm64-cuda-13-0-ubuntu2404
id: create-multi-arch-manifest-cuda-13-0-ubuntu2404
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- label: "Publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
depends_on:
- create-multi-arch-manifest-cuda-13-0
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels"
key: "publish-wheels"
steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
depends_on:
- input-release-version
- build-wheels
- label: "Upload release wheels to PyPI"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
# =============================================================================
#
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
# To build a specific version, trigger the build from that branch/tag.
#
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
#
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
#
# =============================================================================
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
- label: ":rocm: Build ROCm Base Image & Wheels"
id: build-rocm-base-wheels
depends_on: ~ depends_on: ~
- label: "Build and publish CPU release image"
depends_on: block-cpu-release-image-build
agents: agents:
queue: cpu_queue_release queue: cpu_queue_postmerge
commands: commands:
- | - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
set -euo pipefail - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
# Generate cache key - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
ECR_CACHE_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:$${CACHE_KEY}-rocm-base"
echo "========================================"
echo "ROCm Base Build Configuration"
echo "========================================"
echo " CACHE_KEY: $${CACHE_KEY}"
echo " ECR_CACHE_TAG: $${ECR_CACHE_TAG}"
echo "========================================"
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
IMAGE_EXISTS=false
WHEELS_EXIST=false
# Check ECR for Docker image
if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then
IMAGE_EXISTS=true
echo "ECR image cache HIT"
fi
# Check S3 for wheels
WHEEL_CACHE_STATUS=$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
if [ "$${WHEEL_CACHE_STATUS}" = "hit" ]; then
WHEELS_EXIST=true
echo "S3 wheels cache HIT"
fi
# Scenario 1: Both cached (best case)
if [ "$${IMAGE_EXISTS}" = "true" ] && [ "$${WHEELS_EXIST}" = "true" ]; then
echo ""
echo "FULL CACHE HIT - Reusing both image and wheels"
echo ""
# Download wheels
.buildkite/scripts/cache-rocm-base-wheels.sh download
# Save ECR tag for downstream jobs
buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}"
# Scenario 2: Full rebuild needed
else
echo ""
echo " CACHE MISS - Building from scratch..."
echo ""
# Build full base image and push to ECR
DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag "$${ECR_CACHE_TAG}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--push \
.
# Build wheel extraction stage
DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
--target debs_wheel_release \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--load \
.
# Extract and upload wheels
mkdir -p artifacts/rocm-base-wheels
cid=$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/
docker rm $${cid}
.buildkite/scripts/cache-rocm-base-wheels.sh upload
# Cache base docker image to ECR
docker push "$${ECR_CACHE_TAG}"
buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}"
echo ""
echo " Build complete - Image and wheels cached"
fi
artifact_paths:
- "artifacts/rocm-base-wheels/*.whl"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 2: Build vLLM ROCm Wheel - block: "Build arm64 CPU release image"
- label: ":python: Build vLLM ROCm Wheel - x86_64" key: block-arm64-cpu-release-image-build
id: build-rocm-vllm-wheel depends_on: ~
depends_on:
- step: build-rocm-base-wheels - label: "Build and publish arm64 CPU release image"
allow_failure: false depends_on: block-arm64-cpu-release-image-build
agents: agents:
queue: cpu_queue_release queue: arm64_cpu_queue_postmerge
timeout_in_minutes: 180
commands: commands:
# Download artifacts and prepare Docker image - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- | - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
set -euo pipefail - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
# This fixes version detection when tags are moved/force-pushed
echo "Fetching latest tags from origin..."
git fetch --tags --force origin
# Log tag information for debugging version detection
echo "========================================"
echo "Git Tag Verification"
echo "========================================"
echo "Current HEAD: $(git rev-parse HEAD)"
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
echo ""
echo "Recent tags (pointing to commits near HEAD):"
git tag -l --sort=-creatordate | head -5
echo "setuptools_scm version detection:"
pip install -q setuptools_scm 2>/dev/null || true
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
echo "========================================"
# Download wheel artifacts from current build
echo "Downloading wheel artifacts from current build"
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
# Get ECR image tag from metadata (set by build-rocm-base-wheels)
ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')"
if [ -z "$${ECR_IMAGE_TAG}" ]; then
echo "ERROR: rocm-base-image-tag metadata not found"
echo "This should have been set by the build-rocm-base-wheels job"
exit 1
fi
echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}"
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Pull base Docker image from ECR
docker pull "$${ECR_IMAGE_TAG}"
echo "Loaded base image: $${ECR_IMAGE_TAG}"
# Prepare base wheels for Docker build context
mkdir -p docker/context/base-wheels
touch docker/context/base-wheels/.keep
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
echo "Base wheels for vLLM build:"
ls -lh docker/context/base-wheels/
echo "========================================"
echo "Building vLLM wheel with:"
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
echo " BASE_IMAGE: $${ECR_IMAGE_TAG}"
echo "========================================"
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
DOCKER_BUILDKIT=1 docker build \
--file docker/Dockerfile.rocm \
--target export_vllm_wheel_release \
--output type=local,dest=rocm-dist \
--build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \
--build-arg REMOTE_VLLM=0 \
--build-arg GIT_REPO_CHECK=1 \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
.
echo "Built vLLM wheel:"
ls -lh rocm-dist/*.whl
# Copy wheel to artifacts directory
mkdir -p artifacts/rocm-vllm-wheel
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
echo "Final vLLM wheel:"
ls -lh artifacts/rocm-vllm-wheel/
artifact_paths:
- "artifacts/rocm-vllm-wheel/*.whl"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 3: Upload Wheels to S3 - label: "Build and publish nightly multi-arch image to DockerHub"
- label: ":s3: Upload ROCm Wheels to S3"
id: upload-rocm-wheels
depends_on: depends_on:
- step: build-rocm-vllm-wheel - create-multi-arch-manifest
allow_failure: false
agents:
queue: cpu_queue_release
timeout_in_minutes: 60
commands:
# Download all wheel artifacts and run upload
- |
set -euo pipefail
# Download artifacts from current build
echo "Downloading artifacts from current build"
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
# Run upload script
bash .buildkite/scripts/upload-rocm-wheels.sh
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 4: Annotate ROCm Wheel Release
- label: ":memo: Annotate ROCm wheel release"
id: annotate-rocm-release
depends_on:
- upload-rocm-wheels
agents:
queue: cpu_queue_release
commands:
- "bash .buildkite/scripts/annotate-rocm-release.sh"
env:
S3_BUCKET: "vllm-wheels"
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
- block: "Generate Root Index for ROCm Wheels for Release"
key: block-generate-root-index-rocm-wheels
depends_on: upload-rocm-wheels
- label: ":package: Generate Root Index for ROCm Wheels for Release"
depends_on: block-generate-root-index-rocm-wheels
id: generate-root-index-rocm-wheels
agents:
queue: cpu_queue_release
commands:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
S3_BUCKET: "vllm-wheels"
VARIANT: "rocm721"
# ROCm Job 6: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
id: build-rocm-release-image
depends_on:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_release
timeout_in_minutes: 60
commands:
- |
set -euo pipefail
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Get ECR image tag from metadata (set by build-rocm-base-wheels)
ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')"
if [ -z "$${ECR_IMAGE_TAG}" ]; then
echo "ERROR: rocm-base-image-tag metadata not found"
echo "This should have been set by the build-rocm-base-wheels job"
exit 1
fi
echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}"
# Pull base Docker image from ECR
docker pull "$${ECR_IMAGE_TAG}"
echo "Loaded base image: $${ECR_IMAGE_TAG}"
# Pass the base image ECR tag to downstream steps (nightly publish)
buildkite-agent meta-data set "rocm-base-ecr-tag" "$${ECR_IMAGE_TAG}"
echo "========================================"
echo "Building vLLM ROCm release image with:"
echo " BASE_IMAGE: $${ECR_IMAGE_TAG}"
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
echo "========================================"
# Build vLLM ROCm release image using cached base
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
--target vllm-openai \
--progress plain \
-f docker/Dockerfile.rocm .
# Push to ECR
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
echo ""
echo " Successfully built and pushed ROCm release image"
echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
echo ""
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
- label: "Publish nightly ROCm image to DockerHub"
depends_on:
- build-rocm-release-image
if: build.env("NIGHTLY") == "1" if: build.env("NIGHTLY") == "1"
agents: agents:
queue: small_cpu_queue_release queue: cpu_queue_postmerge
commands: commands:
- "bash .buildkite/scripts/push-nightly-builds-rocm.sh" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- "docker push vllm/vllm-openai:nightly-x86_64"
- "docker push vllm/vllm-openai:nightly-aarch64"
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest push vllm/vllm-openai:nightly"
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14) # Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh nightly- vllm/vllm-openai-rocm" - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
- "bash .buildkite/scripts/cleanup-nightly-builds.sh base-nightly- vllm/vllm-openai-rocm"
plugins: plugins:
- docker-login#v3.0.0: - docker-login#v3.0.0:
username: vllmbot username: vllmbot

View File

@@ -8,41 +8,30 @@ if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev" RELEASE_VERSION="1.0.0.dev"
fi fi
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit): To download the wheel (by commit):
\`\`\` \`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
(Optional) For CUDA 13.0: aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
\`\`\` \`\`\`
To download the wheel (by version):
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image: To download and upload the image:
\`\`\` \`\`\`
# Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
# Tag and push images:
## CUDA
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -50,70 +39,15 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64 docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
## ROCm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
## CPU
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
\`\`\` \`\`\`
EOF EOF

View File

@@ -1,113 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Generate Buildkite annotation for ROCm wheel release
set -ex

# Extract build configuration from Dockerfile.rocm_base (single source of truth).
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
# `sed -n ... p` prints only when the version pattern matches, so a missing or
# unversioned BASE_IMAGE line yields an empty string, which the default on the
# next line turns into "unknown". (A trailing `|| echo "unknown"` on the
# pipeline would never fire: without pipefail the pipeline status is sed's,
# which is 0 even when grep finds nothing.)
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -nE 's/.*:([0-9]+\.[0-9]+).*/\1/p')
ROCM_VERSION="${ROCM_VERSION:-unknown}"
PYTHON_VERSION=$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//')
PYTORCH_ROCM_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')

# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then
    RELEASE_VERSION="1.0.0.dev"
fi

# Tag prefix of the cached ROCm base image (derived from Dockerfile.rocm_base)
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)

# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"

# Format ROCm version for path by dropping the dots (e.g., "7.1" -> "rocm71")
# NOTE(review): the upload script's path layout is not visible here; confirm
# that it produces the same directory name.
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"

# Publish the release notes as a Buildkite annotation (Markdown read from the heredoc).
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases
### Build Configuration
| Setting | Value |
|---------|-------|
| **ROCm Version** | ${ROCM_VERSION} |
| **Python Version** | ${PYTHON_VERSION} |
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
| **Branch** | \`${BUILDKITE_BRANCH}\` |
| **Commit** | \`${BUILDKITE_COMMIT}\` |
### :package: Installation
**Install from this build (by commit):**
\`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
# Example for ROCm ${ROCM_VERSION}:
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
\`\`\`
**Install from nightly (if published):**
\`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
\`\`\`
### :floppy_disk: Download Wheels Directly
\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
\`\`\`
### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton**: Triton
- **triton-kernels**: Triton kernels
- **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
- **amd_aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm
### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
### :package: Docker Image Release
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@@ -1,131 +0,0 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Cache helper for ROCm base wheels
#
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
#
# Usage:
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
# cache-rocm-base-wheels.sh download - Download wheels from cache
# cache-rocm-base-wheels.sh key - Output the cache key
# cache-rocm-base-wheels.sh path - Output the full S3 cache path
#
# Environment variables:
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
#
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
# so changes to ROCm version are captured by the Dockerfile hash.
set -euo pipefail
BUCKET="${S3_BUCKET:-vllm-wheels}"
DOCKERFILE="docker/Dockerfile.rocm_base"
CACHE_PREFIX="rocm/cache"
# Print the cache key: the first 16 hex characters of the SHA-256 digest of
# $DOCKERFILE. Only the Dockerfile content is hashed; no build args are mixed in.
# Exits with status 1 (after a message on stderr) if $DOCKERFILE is missing or
# cannot be hashed.
generate_cache_key() {
    if [[ ! -f "$DOCKERFILE" ]]; then
        echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
        exit 1
    fi
    # Declare and assign separately: `local var=$(cmd)` reports the status of
    # `local`, which would swallow a sha256sum failure. The explicit check is
    # needed because bash clears errexit inside command substitutions, and this
    # function is invoked as $(generate_cache_key).
    local dockerfile_hash
    if ! dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16); then
        echo "ERROR: Failed to hash Dockerfile: $DOCKERFILE" >&2
        exit 1
    fi
    echo "${dockerfile_hash}"
}
# Resolve the cache key once; every subcommand below works on the same S3 prefix.
CACHE_KEY=$(generate_cache_key)
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"

# Dispatch on the first argument; anything unrecognised prints usage and exits 1.
case "${1:-}" in
    check)
        # Diagnostics go to stderr so that stdout carries only "hit" or "miss".
        echo "Checking cache for key: ${CACHE_KEY}" >&2
        echo "Cache path: ${CACHE_PATH}" >&2
        # Check if cache exists by listing objects
        # We look for at least one .whl file
        echo "Running: aws s3 ls ${CACHE_PATH}" >&2
        # A failed listing is deliberately tolerated (|| true) and ends up as a miss.
        S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
        echo "S3 ls output:" >&2
        # Here-strings instead of `echo ... | head` / `echo ... | grep -q`: under
        # `set -o pipefail` the writer can be killed by SIGPIPE once the reader
        # exits early, which would abort the script (head) or turn a real hit
        # into a "miss" (grep -q) on large listings.
        head -5 <<< "$S3_OUTPUT" >&2
        if grep -q "\.whl" <<< "$S3_OUTPUT"; then
            echo "hit"
        else
            echo "miss"
        fi
        ;;
    upload)
        echo "========================================"
        echo "Uploading wheels to cache"
        echo "========================================"
        echo "Cache key: ${CACHE_KEY}"
        echo "Cache path: ${CACHE_PATH}"
        echo ""
        if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
            echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
            exit 1
        fi
        # Refuse to upload an empty set of wheels.
        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
            exit 1
        fi
        echo "Uploading $WHEEL_COUNT wheels..."
        aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
        echo ""
        echo "Cache upload complete!"
        echo "========================================"
        ;;
    download)
        echo "========================================"
        echo "Downloading wheels from cache"
        echo "========================================"
        echo "Cache key: ${CACHE_KEY}"
        echo "Cache path: ${CACHE_PATH}"
        echo ""
        mkdir -p artifacts/rocm-base-wheels
        # Use sync with include/exclude to only download .whl files
        aws s3 sync "${CACHE_PATH}" artifacts/rocm-base-wheels/ \
            --exclude "*" \
            --include "*.whl"
        echo ""
        echo "Downloaded wheels:"
        find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        echo ""
        echo "Total: $WHEEL_COUNT wheels"
        echo "========================================"
        ;;
    key)
        echo "${CACHE_KEY}"
        ;;
    path)
        echo "${CACHE_PATH}"
        ;;
    *)
        echo "Usage: $0 {check|upload|download|key|path}" >&2
        echo "" >&2
        echo "Commands:" >&2
        echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
        echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
        echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
        echo " key - Output the cache key" >&2
        echo " path - Output the full S3 cache path" >&2
        exit 1
        ;;
esac

View File

@@ -1,235 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Check if Ray LLM can generate lock files that are compatible with this
# version of vllm. Downloads Ray's requirement files and runs a full
# dependency resolution with the installed vllm's constraints to see if
# a valid lock file can be produced.
#
# See: https://github.com/vllm-project/vllm/issues/33599
set -eo pipefail
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT
# ── Detect PyTorch index URL ─────────────────────────────────────────────
if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
TORCH_INDEX_URL="${CANDIDATE_URL}"
else
echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
echo ">>> Falling back to default PyPI (resolution may be incomplete)"
TORCH_INDEX_URL=""
fi
else
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
fi
echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
# Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files"
RAY_FILES=(
"requirements.txt"
"requirements/cloud-requirements.txt"
"requirements/base-test-requirements.txt"
"requirements/llm/llm-requirements.txt"
"requirements/llm/llm-test-requirements.txt"
)
for FILE in "${RAY_FILES[@]}"; do
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
echo " ${FILE}"
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
done
# Extract installed vllm deps
echo ">>> Extracting installed vllm dependency constraints"
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
"""Write out the installed vllm's dependencies as pip constraint lines.
Ray uses vllm[audio], so audio-extra deps are included with their extra
markers stripped. The resolver cannot evaluate extra markers for a
package that is not itself being resolved from an index, so we activate
them manually here.
"""
import importlib.metadata
import re
import sys
out_path = sys.argv[1]
raw_reqs = importlib.metadata.requires("vllm") or []
# Ray uses vllm[audio], so activate that extra.
ACTIVE_EXTRAS = {"audio"}
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
lines = []
for r in raw_reqs:
if ";" not in r:
# Unconditional dep — always include.
lines.append(r.strip())
continue
req_part, _, marker_part = r.partition(";")
marker_part = marker_part.strip()
extra_matches = EXTRA_RE.findall(marker_part)
if not extra_matches:
# Non-extra marker (python_version, etc.) — keep as-is.
lines.append(r.strip())
continue
if not ACTIVE_EXTRAS.intersection(extra_matches):
continue # Skip inactive extras (tensorizer, bench, …).
# Strip the extra== conditions but keep any remaining markers
# (e.g. python_version).
cleaned = EXTRA_RE.sub("", marker_part)
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
if cleaned:
lines.append(f"{req_part.strip()} ; {cleaned}")
else:
lines.append(req_part.strip())
with open(out_path, "w") as f:
for line in lines:
f.write(line + "\n")
print(f"Wrote {len(lines)} constraints to {out_path}")
PYEOF
echo ">>> Installed vllm deps (first 20 lines):"
head -20 "${WORK_DIR}/vllm-constraints.txt"
# Remove Ray's vllm pin — the installed vllm's transitive deps
# (written above) replace it in the resolution. vllm itself cannot
# be resolved from PyPI for in-development versions, so we test
# whether Ray's requirements can coexist with vllm's dependency
# constraints instead.
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
# Install uv if needed
if ! command -v uv &>/dev/null; then
echo ">>> Installing uv"
pip install uv -q
fi
# Resolve: given vllm's constraints, can Ray compile a lock file?
#
# vllm's dependency constraints are the fixed side — Ray is flexible and
# can regenerate its lock files. We pass vllm's constraints via -c so
# the resolver treats them as non-negotiable bounds, then check whether
# Ray's own requirements can still be satisfied within those bounds.
echo ""
echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================"
EXTRA_INDEX_ARGS=()
if [[ -n "${TORCH_INDEX_URL}" ]]; then
EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
fi
set +e
uv pip compile \
"${WORK_DIR}/requirements.txt" \
"${WORK_DIR}/cloud-requirements.txt" \
"${WORK_DIR}/base-test-requirements.txt" \
"${WORK_DIR}/llm-requirements.txt" \
"${WORK_DIR}/llm-test-requirements.txt" \
-c "${WORK_DIR}/vllm-constraints.txt" \
--python-version 3.12 \
--python-platform x86_64-manylinux_2_31 \
"${EXTRA_INDEX_ARGS[@]}" \
--index-strategy unsafe-best-match \
--unsafe-package setuptools \
--unsafe-package ray \
--no-header \
-o "${WORK_DIR}/resolved.txt" \
2>&1
EXIT_CODE=$?
set -e
echo ""
echo "=========================================="
if [ $EXIT_CODE -eq 0 ]; then
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
echo ""
echo "Key resolved versions:"
grep -E '^(protobuf|torch|numpy|transformers)==' \
"${WORK_DIR}/resolved.txt" | sort || true
echo "=========================================="
exit 0
fi
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
echo "This means a fundamental dependency conflict exists that Ray"
echo "cannot resolve by regenerating its lock files."
echo "See: https://github.com/vllm-project/vllm/issues/33599"
echo "=========================================="
# Buildkite annotation: post a warning on the build page when the agent CLI is
# available. Look it up on PATH (the same idiom this script uses for `uv`)
# rather than testing one fixed install location, so the annotation is not
# silently skipped when the agent is installed somewhere other than /usr/bin.
if command -v buildkite-agent &>/dev/null; then
    buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
### :warning: Ray Dependency Compatibility Warning
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
Ray would not be able to regenerate its lock files to accommodate this vllm version.
Please check the **Ray Dependency Compatibility Check** step logs for details.
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
EOF
fi
# Notify Slack if webhook is configured and PR/branch are valid.
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
PR="${BUILDKITE_PULL_REQUEST:-}"
BRANCH="${BUILDKITE_BRANCH:-}"
# Skip notification if PR is invalid or branch is empty
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
else
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
import json, os, sys
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
url = os.getenv("BUILDKITE_BUILD_URL", "#")
data = {
"text": ":warning: Ray Dependency Compatibility Check Failed",
"blocks": [{
"type": "section",
"text": {
"type": "mrkdwn",
"text": (
"*:warning: Ray Dependency Compatibility Check Failed*\n"
f"PR #{pr} on branch `{branch}` introduces dependencies "
f"that cannot be resolved with Ray'\''s requirements.\n"
f"<{url}|View Build>"
),
},
}],
}
print(json.dumps(data))
')
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
fi
else
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi
exit 1

View File

@@ -1,242 +0,0 @@
#!/bin/bash
#
# cherry-pick-from-milestone.sh
# Find commits from a GitHub milestone that are missing from the current branch
# and output them in chronological order for cherry-picking.
#
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
#
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Print the command-line help text on stdout and terminate the script.
#
# Arguments:
#   $1 - exit status to terminate with (optional, default: 1).
#        Callers that print the help because the user asked for it can pass 0
#        so that an explicit help request is not reported as a failure.
usage() {
    local status="${1:-1}"
    cat <<EOF
Usage: $(basename "$0") <milestone> [options]
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
Arguments:
milestone The GitHub milestone name (e.g., v0.14.0)
Options:
--dry-run Show the cherry-pick commands without executing (default)
--execute Actually execute the cherry-picks
--main-branch Specify the main branch name (default: main)
--help Show this help message
Examples:
$(basename "$0") v0.14.0
$(basename "$0") v0.14.0 --dry-run
$(basename "$0") v0.14.0 --execute
$(basename "$0") v0.14.0 --main-branch master
EOF
    exit "$status"
}
# Logging helpers. Each prints one line made of a coloured tag followed by the
# message given as $1.
#
# Only the tag goes through printf's %b (which expands the \033 colour codes
# stored in the colour variables). The message itself is printed with %s, so
# backslash sequences that happen to appear in it (e.g. in a PR title) are
# emitted literally instead of being interpreted as escapes.

# Informational message on stdout.
log_info() {
    printf '%b %s\n' "${BLUE}[INFO]${NC}" "$1"
}

# Success message on stdout.
log_success() {
    printf '%b %s\n' "${GREEN}[OK]${NC}" "$1"
}

# Warning message on stdout.
log_warn() {
    printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$1"
}

# Error message on stderr.
log_error() {
    printf '%b %s\n' "${RED}[ERROR]${NC}" "$1" >&2
}
# Default values
MILESTONE=""
DRY_RUN=true
MAIN_BRANCH="main"

# Parse arguments: exactly one positional <milestone> plus optional flags.
while [[ $# -gt 0 ]]; do
    case $1 in
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --execute)
            DRY_RUN=false
            shift
            ;;
        --main-branch)
            # The script runs under `set -u`; without this check a trailing
            # `--main-branch` would abort with an "unbound variable" error
            # instead of a readable message.
            if [[ -z "${2:-}" ]]; then
                log_error "Option --main-branch requires a branch name"
                usage
            fi
            MAIN_BRANCH="$2"
            shift 2
            ;;
        --help|-h)
            # Help was requested explicitly; request a zero exit status.
            usage 0
            ;;
        -*)
            log_error "Unknown option: $1"
            usage
            ;;
        *)
            # First bare word is the milestone; any further one is an error.
            if [[ -z "$MILESTONE" ]]; then
                MILESTONE="$1"
            else
                log_error "Unexpected argument: $1"
                usage
            fi
            shift
            ;;
    esac
done

# Validate milestone argument
if [[ -z "$MILESTONE" ]]; then
    log_error "Milestone is required"
    usage
fi
# Preconditions. Each probe is silenced; on failure a single explanatory
# error is logged and the script stops with status 1.

# Must be run from inside a git work tree.
git rev-parse --is-inside-work-tree &>/dev/null || {
    log_error "Not in a git repository"
    exit 1
}

# The GitHub CLI must be on PATH ...
command -v gh &>/dev/null || {
    log_error "GitHub CLI (gh) is not installed"
    exit 1
}

# ... and must already be logged in.
gh auth status &>/dev/null || {
    log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
    exit 1
}
CURRENT_BRANCH=$(git branch --show-current)
log_info "Current branch: ${CURRENT_BRANCH}"
log_info "Main branch: ${MAIN_BRANCH}"
log_info "Milestone: ${MILESTONE}"
echo ""

# Fetch latest from remote
log_info "Fetching latest from remote..."
git fetch origin "$MAIN_BRANCH" --quiet

# Get merged PRs from the milestone, sorted by merge date
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."

# Store PR data in a temp file (removed automatically when the script exits).
# One line per PR: <merge commit sha> TAB <PR number> TAB <title>.
PR_DATA=$(mktemp)
trap 'rm -f "$PR_DATA"' EXIT

# The milestone name is wrapped in double quotes inside the search qualifier
# so that a name containing spaces is matched as one milestone rather than
# being split into additional free-text search terms.
# NOTE: at most 1000 PRs are requested; larger milestones would be truncated.
if ! gh pr list --state merged --search "milestone:\"${MILESTONE}\"" \
    --limit 1000 \
    --json number,title,mergeCommit,mergedAt \
    --jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
    log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
    log_error "This could be due to:"
    log_error " - Milestone does not exist"
    log_error " - Network/authentication issues"
    log_error " - Invalid milestone name format"
    exit 1
fi

# An empty result file means nothing to do; this is not an error.
if [[ ! -s "$PR_DATA" ]]; then
    log_warn "No merged PRs found for milestone '${MILESTONE}'"
    exit 0
fi

# Some wc implementations pad the count with leading blanks; strip them so the
# number is printed cleanly.
TOTAL_PRS=$(wc -l < "$PR_DATA" | tr -d '[:space:]')
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
echo ""
# Walk the PR list (oldest merge first) and collect every merge commit that is
# not an ancestor of HEAD. MISSING_COMMITS holds the SHAs, MISSING_INFO the
# matching human-readable descriptions, index for index.
MISSING_COMMITS=()
MISSING_INFO=()
while IFS=$'\t' read -r sha pr_number title; do
    # A PR without a merge commit shows up as an empty or literal "null" SHA.
    case "$sha" in
        ""|null)
            log_warn "PR #${pr_number} has no merge commit SHA, skipping"
            continue
            ;;
    esac

    # Titles are cut to 60 characters in the per-PR status lines only.
    short_title="${title:0:60}"

    if ! git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
        log_warn "PR #${pr_number} MISSING: ${short_title}"
        MISSING_COMMITS+=("$sha")
        MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
        continue
    fi
    log_success "PR #${pr_number} already in branch: ${short_title}"
done < "$PR_DATA"
echo ""

# Nothing missing: report and stop successfully.
if (( ${#MISSING_COMMITS[@]} == 0 )); then
    log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
    exit 0
fi

log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
echo ""
# Print the cherry-pick plan: one comment line per missing PR, a single
# combined command, and then one command per commit.
rule="=========================================="
printf '%s\n' "$rule" "Cherry-pick commands (in chronological order):" "$rule" ""
printf '# %s\n' "${MISSING_INFO[@]}"
printf '%s\n' "" "# Run these commands to cherry-pick all missing commits:"
echo "git cherry-pick ${MISSING_COMMITS[*]}"
echo ""
printf '%s\n' "# Or cherry-pick one at a time:"
printf 'git cherry-pick %s\n' "${MISSING_COMMITS[@]}"
echo ""

# Either stop after the report (dry run, the default) or apply the commits.
if [[ "$DRY_RUN" != false ]]; then
    printf '%s\n' "$rule"
    echo -e "${YELLOW}Dry run mode - no changes made${NC}"
    echo "Run with --execute to perform the cherry-picks"
    printf '%s\n' "$rule"
else
    printf '%s\n' "$rule"
    log_info "Executing cherry-picks..."
    printf '%s\n' "$rule"

    # Apply in list order; the first failure aborts so the user can resolve it.
    for (( idx = 0; idx < ${#MISSING_COMMITS[@]}; idx++ )); do
        sha="${MISSING_COMMITS[$idx]}"
        info="${MISSING_INFO[$idx]}"
        echo ""
        log_info "Cherry-picking: $info"
        git cherry-pick "$sha" || {
            log_error "Failed to cherry-pick $sha"
            log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
            exit 1
        }
        log_success "Successfully cherry-picked $sha"
    done

    echo ""
    log_success "All cherry-picks completed successfully!"
fi

View File

@@ -3,20 +3,10 @@
set -ex set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix # This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX] [REPO]
# Example: cleanup-nightly-builds.sh "nightly-"
# Example: cleanup-nightly-builds.sh "cu130-nightly-"
# Example: cleanup-nightly-builds.sh "nightly-" "vllm/vllm-openai-rocm"
# Get tag prefix and repo from arguments # DockerHub API endpoint for vllm/vllm-openai repository
TAG_PREFIX="${1:-nightly-}" REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
REPO="${2:-vllm/vllm-openai}"
echo "Cleaning up tags with prefix: $TAG_PREFIX in repository: $REPO"
# DockerHub API endpoint for the repository
REPO_API_URL="https://hub.docker.com/v2/repositories/${REPO}/tags"
# Get DockerHub credentials from environment # Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then if [ -z "$DOCKERHUB_TOKEN" ]; then
@@ -55,7 +45,7 @@ get_all_tags() {
set -x set -x
# Get both last_updated timestamp and tag name, separated by | # Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"') local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
if [ -z "$tags" ]; then if [ -z "$tags" ]; then
break break
@@ -73,7 +63,7 @@ delete_tag() {
local tag_name="$1" local tag_name="$1"
echo "Deleting tag: $tag_name" echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/${REPO}/tags/$tag_name" local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
set +x set +x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x set -x

View File

@@ -1,84 +0,0 @@
#!/usr/bin/env bash
set -ex
# Generate and upload wheel indices for all wheels in the commit directory.
# This script should run once after all wheels have been built and uploaded.
# ======== setup ========
# S3 bucket holding the wheels and the local directory the indices are written to.
BUCKET="vllm-wheels"
INDICES_OUTPUT_DIR="indices"
# Variant name the default (un-suffixed) wheels are aliased to;
# align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
DEFAULT_VARIANT_ALIAS="cu129"
# Interpreter command: taken from $PYTHON_PROG when set, python3 otherwise.
PYTHON="${PYTHON_PROG:-python3}"
# Wheels of this build live under s3://<bucket>/<commit>/
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"

# Ask the interpreter whether it is Python 3.12 or newer (prints 1 or 0).
py_is_recent=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
if [[ "$py_is_recent" -ne 1 ]]; then
    # Too old: run Python from the python:3-slim image instead, with the
    # current directory mounted at /app as the working directory.
    docker pull python:3-slim
    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ======== generate and upload indices ========
# Overview of this section: list the wheels already uploaded for this commit,
# generate index files for them locally, then copy the indices next to the
# wheels and, for main-branch builds, over /nightly/ as well.
# list all wheels in the commit directory
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
# Save the raw object listing as JSON; the index generator and the version
# detection later in this script both read this file.
obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indices for all existing wheels
# these indices have relative paths that work as long as they are next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
# Only pass --alias-to-default when an alias name is configured.
alias_args=()
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
fi
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
# NOTE: this rewrites the generator script in place in the working copy.
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
# $PYTHON is intentionally unquoted: it may hold a multi-word command
# (the "docker run ..." fallback chosen during setup).
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
# copy to /nightly/ only if it is on the main branch and not a PR
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
echo "Uploading indices to overwrite /nightly/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
# detect version from any wheel in the commit directory
# download the first wheel we find to extract version metadata
first_wheel_key=$($PYTHON -c "import json; obj=json.load(open('$obj_json')); print(next((c['Key'] for c in obj.get('Contents', []) if c['Key'].endswith('.whl')), ''))")
if [[ -z "$first_wheel_key" ]]; then
    echo "Error: No wheels found in $S3_COMMIT_PREFIX"
    exit 1
fi
first_wheel=$(basename "$first_wheel_key")
aws s3 cp "s3://$BUCKET/${first_wheel_key}" "/tmp/${first_wheel}"
# Read the "Version:" header from the wheel's METADATA file. Only the first
# matching line is used so the result is always a single value.
version=$(unzip -p "/tmp/${first_wheel}" '**/METADATA' | grep -m1 '^Version: ' | cut -d' ' -f2)
rm -f "/tmp/${first_wheel}"

# The pipeline above reports the status of its last command only, so a failed
# unzip/grep leaves $version empty without stopping the script. An empty
# version would pass the "no dev" test below and publish indices to
# s3://$BUCKET// - refuse to continue instead.
if [[ -z "$version" ]]; then
    echo "Error: could not read the Version field from METADATA in ${first_wheel}" >&2
    exit 1
fi
echo "Version in wheel: $version"

# Drop the local-version part ("+...") to get the public version.
pure_version="${version%%+*}"
echo "Pure version (without variant): $pure_version"

# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
    echo "Re-generating indices for /$pure_version/"
    rm -rf "${INDICES_OUTPUT_DIR:?}"
    mkdir -p "$INDICES_OUTPUT_DIR"
    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@@ -1,468 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# do not complain about line length (for docstring)
# ruff: noqa: E501
import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote
import regex as re
def normalize_package_name(name: str) -> str:
    """
    Return the PEP 503 normalized form of a package name.

    Every run of '-', '_' and '.' characters is collapsed into one '-',
    and the result is lowercased.
    See https://peps.python.org/pep-0503/#normalized-names
    """
    collapsed = re.sub(r"[-_.]+", "-", name)
    return collapsed.lower()
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""
@dataclass
class WheelFileInfo:
package_name: str
version: str
build_tag: str | None
python_tag: str
abi_tag: str
platform_tag: str
variant: str | None
filename: str
def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Split a wheel file name into its components.

    Wheel names have the form
        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
    and the version may carry a build variant such as '+cu129', '.cpu' or
    '.rocm', which is separated out into `variant`.

    Examples of accepted names:
        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl

    Raises:
        ValueError: if `file` does not look like a wheel file name.
    """
    pattern = (
        r"^(?P<package_name>.+)"
        r"-(?P<version>[^-]+?)"
        r"(-(?P<build_tag>[^-]+))?"
        r"-(?P<python_tag>[^-]+)"
        r"-(?P<abi_tag>[^-]+)"
        r"-(?P<platform_tag>[^-]+)\.whl$"
    )
    match = re.match(pattern, file)
    if match is None:
        raise ValueError(f"Invalid wheel file name: {file}")
    fields = match.groupdict()

    version = fields["version"]
    variant = None
    if "dev" in version:
        # Dev builds: whatever follows the last '.' after "dev" is the variant
        # (e.g. "...dev14+gaa384b3c0.cu130" -> "cu130"); no '.' means no variant.
        tail = version.split("dev")[-1]
        if "." in tail:
            variant = tail.split(".")[-1]
            version = version.removesuffix("." + variant)
    elif "+" in version:
        base, local = version.split("+", 1)
        # Only known prefixes (rocmXXX, cuXXX, cpu) count as variants; any
        # other local suffix stays part of the version and variant stays None.
        if local.startswith(("rocm", "cu", "cpu")):
            variant = local
            version = base

    return WheelFileInfo(
        package_name=fields["package_name"],
        version=version,
        build_tag=fields["build_tag"],
        python_tag=fields["python_tag"],
        abi_tag=fields["abi_tag"],
        platform_tag=fields["platform_tag"],
        variant=variant,
        filename=file,
    )
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Build the HTML of a project-list page.

    The page contains one link per entry of `subdir_names` (projects and
    variant subdirectories), in sorted order, with leading/trailing '/' and
    then '.' characters removed from each name. `comment` is placed in the
    page's HTML comment.
    """
    cleaned = (raw.strip("/").strip(".") for raw in sorted(subdir_names))
    links = "\n".join(f' <a href="{entry}/">{entry}/</a><br/>' for entry in cleaned)
    return INDEX_HTML_TEMPLATE.format(items=links, comment=comment)
def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Build the index page and the metadata dump for one package.

    Wheels are listed in file-name order. Each link is the path of the wheel
    (inside `wheel_base_dir`) relative to `index_base_dir`.

    Returns:
        (index_html, metadata_json): the page linking to the wheel files, and
        a JSON array holding every wheel's fields plus its quoted "path".
    """
    entries = []
    for wheel in sorted(wheel_files, key=lambda w: w.filename):
        target = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / wheel.filename
        )
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        quoted = quote(target.as_posix(), safe=":%/")
        record = asdict(wheel)
        record["path"] = quoted
        entries.append((f' <a href="{quoted}">{wheel.filename}</a><br/>', record))

    index_str = INDEX_HTML_TEMPLATE.format(
        items="\n".join(tag for tag, _ in entries), comment=comment
    )
    metadata_str = json.dumps([record for _, record in entries], indent=2)
    return index_str, metadata_str
def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
):
    """
    Write the index files for a set of wheels below `index_base_dir`.

    Args:
        whl_files: wheel file names (must be directly under `wheel_base_dir`).
        wheel_base_dir: directory the wheels live in; links are made relative to it.
        index_base_dir: directory the index files are written to.
        default_variant: name of the default variant, if any. When given, every
            wheel must carry a variant and this one must be among them,
            otherwise ValueError is raised.
        alias_to_default: extra variant name that receives a copy of the
            default variant's wheel list, if any. Raises ValueError when a real
            variant of that name already exists; only warns when there is no
            default variant to alias.
        comment: optional text included in the generated HTML files.

    Behaviour:
        * Wheels are grouped by variant; wheels without one form the "default"
          group, whose files go directly into `index_base_dir`. Every other
          variant gets its own subdirectory.
        * If a `vllm` wheel has a variant starting with "rocm", all wheels
          without a variant inherit it.
        * Nothing is written when `whl_files` is empty.
        * Project-list pages carry a "Generated on <timestamp>" comment;
          per-package pages receive `comment` as given.

    Resulting layout (hosted at wheels.vllm.ai/{nightly,$commit,$version}/):
        index_base_dir/
            index.html            # project list: packages + variant subdirectories
            vllm/
                index.html        # links to the wheels in wheel_base_dir (relative)
                metadata.json     # machine-readable metadata of those wheels
            cpu/                  # one subdirectory per variant, e.g. cpu, cu130
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/                # e.g. the alias of the default variant
                ...

    metadata.json is a JSON array with one object per wheel, for example:
        {
            "package_name": "vllm",
            "version": "0.10.2rc2",
            "build_tag": null,
            "python_tag": "cp38",
            "abi_tag": "abi3",
            "platform_tag": "manylinux2014_aarch64",
            "variant": "cu129",
            "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
            "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"
        }
    where "path" is URL-encoded and relative to the directory of the index.
    """
    wheels = [parse_from_filename(name) for name in whl_files]
    if len(wheels) == 0:
        print("No wheel files found, skipping index generation.")
        return

    # For ROCm builds: all wheels should share the variant of the vllm wheel.
    rocm_variant = next(
        (
            w.variant
            for w in wheels
            if w.package_name == "vllm" and w.variant and w.variant.startswith("rocm")
        ),
        None,
    )
    if rocm_variant:
        print(f"Detected ROCm variant from vllm: {rocm_variant}")
        for w in wheels:
            if w.variant is None:
                w.variant = rocm_variant
                print(f"Inherited variant '{rocm_variant}' for {w.filename}")

    # Group by variant; wheels without one belong to "default".
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for w in wheels:
        variant_to_files.setdefault(w.variant or "default", []).append(w)
    print(f"Found variants: {list(variant_to_files.keys())}")

    has_default = "default" in variant_to_files

    # Sanity check for an explicitly named default variant.
    if default_variant:
        if has_default:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if not has_default:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Comment placed in the header of the project-list pages.
    suffix = f" ({comment})" if comment else ""
    header_comment = f"Generated on {datetime.now().isoformat()}{suffix}"

    # Entries of the top-level project list: variant directories plus the
    # packages of the default variant.
    top_level_entries: set[str] = set()
    for variant, files in variant_to_files.items():
        is_default = variant == "default"
        variant_dir = index_base_dir if is_default else index_base_dir / variant
        if not is_default:
            top_level_entries.add(variant)
        variant_dir.mkdir(parents=True, exist_ok=True)

        # Package names of this variant, normalized per PEP 503.
        packages = {normalize_package_name(w.package_name) for w in files}
        if is_default:
            # Listed in the top-level page, which is written after the loop.
            top_level_entries |= packages
        else:
            (variant_dir / "index.html").write_text(
                generate_project_list(sorted(packages), header_comment)
            )

        for package in packages:
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            # Only the wheels of this package (compare normalized names).
            members = [
                w for w in files if normalize_package_name(w.package_name) == package
            ]
            index_str, metadata_str = generate_package_index_and_metadata(
                members, wheel_base_dir, package_dir, comment
            )
            (package_dir / "index.html").write_text(index_str)
            (package_dir / "metadata.json").write_text(metadata_str)

    # Top-level project list.
    (index_base_dir / "index.html").write_text(
        generate_project_list(sorted(top_level_entries), header_comment)
    )
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Arguments:
    #   --version <version>           : version string for the current build (e.g., commit hash)
    #   --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
    #   --current-objects <path>      : JSON file with the current S3 objects listing of this version directory
    #   --output-dir <directory>      : directory to store generated index files
    #   --alias-to-default <name>     : (optional) alias variant name for the default variant
    #   --comment <text>              : (optional) comment string to include in generated HTML files
    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    option_specs = (
        (
            "--version",
            {
                "required": True,
                "help": "Version string for the current build (e.g., commit hash)",
            },
        ),
        (
            "--current-objects",
            {
                "required": True,
                "help": "Path to JSON file containing current S3 objects listing in this version directory",
            },
        ),
        (
            "--output-dir",
            {
                "required": True,
                "help": "Directory to store generated index files",
            },
        ),
        (
            "--wheel-dir",
            {
                "default": None,
                "help": "Directory containing wheel files (default to be same as `version`)",
            },
        ),
        (
            "--alias-to-default",
            {
                "default": None,
                "help": "Alias variant name for the default variant",
            },
        ),
        (
            "--comment",
            {
                "default": "",
                "help": "Optional comment string to include in generated HTML files",
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, type=str, **options)
    args = parser.parse_args()

    version = args.version
    # Allow rocm/ prefix, reject other slashes and all backslashes
    if "\\" in version:
        raise ValueError("Version string must not contain backslashes.")
    if "/" in version and not version.startswith("rocm/"):
        raise ValueError(
            "Version string must not contain slashes (except for 'rocm/' prefix)."
        )

    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    # Read the S3 listing. It has the shape returned by the list_objects_v2 API:
    #   "Contents": [
    #     {
    #       "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
    #       "LastModified": "2025-11-28T14:00:32+00:00",
    #       "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
    #       "ChecksumAlgorithm": ["CRC64NVME"],
    #       "ChecksumType": "FULL_OBJECT",
    #       "Size": 435649349,
    #       "StorageClass": "STANDARD"
    #     },
    #     ...
    #   ]
    with open(current_objects_path) as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)

    # Keep the file name (last path component) of every ".whl" key.
    wheel_files = [
        key.split("/")[-1]
        for key in (item["Key"] for item in current_objects.get("Contents", []))
        if key.endswith(".whl")
    ]
    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # keep only "official" files for a non-nightly version (specified by cli args)
    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
    if PY_VERSION_RE.match(version):
        # upload-wheels.sh ensures no "dev" is in args.version
        wheel_files = [
            name for name in wheel_files if version in name and "dev" not in name
        ]
        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
    else:
        print("Nightly version detected, keeping all wheel files.")

    # Generate index and metadata, assuming wheels and indices are stored as:
    #   s3://vllm-wheels/{wheel_dir}/<wheel files>
    #   s3://vllm-wheels/<anything>/<index files>
    #
    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
    #   - rocm/{commit}/ (same as wheels)
    #   - rocm/nightly/
    #   - rocm/{version}/
    # All these are under the "rocm/" prefix, so relative paths should be
    # relative to "rocm/", not the bucket root.
    if args.wheel_dir:
        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
        wheel_dir = args.wheel_dir.strip().rstrip("/")
    elif version.startswith("rocm/"):
        # For rocm/commit, wheel_base_dir should be just the commit part
        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
        wheel_dir = version.split("/", 1)[1]
    else:
        wheel_dir = version

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=output_dir.parent / wheel_dir,
        index_base_dir=output_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
        comment=args.comment.strip(),
    )
    print(f"Successfully generated index and metadata in {output_dir}")

View File

@@ -1,57 +1,25 @@
#!/bin/bash #!/bin/bash
# This script runs tests inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
#
###############################################################################
# QUOTING / COMMAND PASSING
#
# Passing commands as positional arguments ($*) is fragile when the command
# string itself contains double quotes, e.g.:
#
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
#
# The outer shell resolves the nested quotes *before* this script runs, so
# the script receives mangled input it cannot fully recover.
#
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
#
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
# bash run-amd-test.sh
#
# Single-quoted assignment preserves all inner double quotes verbatim.
# The $* path is kept for backward compatibility but callers should migrate.
###############################################################################
set -o pipefail set -o pipefail
# Export Python path # Export Python path
export PYTHONPATH=".." export PYTHONPATH=".."
############################################################################### # Print ROCm version
# Helper Functions echo "--- Confirming Clean Initial State"
############################################################################### while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
wait_for_clean_gpus() { echo "--- ROCm info"
local timeout=${1:-300} rocminfo
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
# cleanup older docker images
cleanup_docker() { cleanup_docker() {
# Get Docker's root directory # Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}') docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -60,12 +28,15 @@ cleanup_docker() {
exit 1 exit 1
fi fi
echo "Docker root directory: $docker_root" echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70 threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed." echo "Docker images and volumes cleanup completed."
else else
@@ -73,450 +44,193 @@ cleanup_docker() {
fi fi
} }
# Tear down the multi-node test topology: stop every "node<i>" container
# that docker still lists, then remove the shared "docker-net" network.
# The number of nodes comes from NUM_NODES (defaults to 2). All docker
# failures are tolerated so this can be used on any exit path.
cleanup_network() {
  local node_count=${NUM_NODES:-2}
  local idx container

  for ((idx = 0; idx < node_count; idx++)); do
    container="node${idx}"
    # Only attempt a stop when docker reports a container for this name filter.
    if [[ -n "$(docker ps -a -q -f name="${container}")" ]]; then
      docker stop "${container}" || true
    fi
  done

  # Remove the test network if it is still listed.
  if docker network ls | grep -q docker-net; then
    docker network rm docker-net || true
  fi
}
# Decide whether a command string describes a multi-node job.
#   $1 - the full test command string
# Returns 0 (true) when NUM_NODES is greater than 1, or when the string
# contains the per-node bracket syntax "[...] && [...]"; returns 1 otherwise.
is_multi_node() {
  local cmds="$1"
  # Structural fallback: two bracketed command lists joined by "&&".
  local bracket_pair_re='\[.*\].*&&.*\[.*\]'

  # NUM_NODES (set by the pipeline) is the primary signal; the bracket
  # syntax is only consulted when it does not indicate multiple nodes.
  if [[ "${NUM_NODES:-1}" -gt 1 ]] || [[ "$cmds" =~ $bracket_pair_re ]]; then
    return 0
  fi
  return 1
}
# Terminate the script with the given pytest exit status.
#   $1 - exit status of the test command
# Status 5 (pytest collected no tests) is reported and mapped to success;
# every other status is propagated unchanged. This function never returns.
handle_pytest_exit() {
  local status=$1

  # Anything other than "no tests collected" is passed straight through.
  [ "$status" -eq 5 ] || exit "$status"

  echo "Pytest exit code 5 (no tests collected) - treating as success."
  exit 0
}
###############################################################################
# Pytest marker/keyword re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around multi-word pytest -m/-k expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
#
# This function detects unquoted expressions after -m/-k and re-quotes them
# by collecting tokens until a recognizable boundary is reached:
# - test path (contains '/')
# - test file (ends with '.py')
# - another pytest flag (--xxx or -x single-char flags)
# - command separator (&& || ; |)
# - environment variable assignment (FOO=bar)
#
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
# unquoted since they have no spaces and work fine.
#
# Already-quoted expressions (containing literal single quotes) are passed
# through untouched to avoid double-quoting values injected by
# apply_rocm_test_overrides.
#
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
# double-quotes stripped by the calling shell (see header comment).
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
###############################################################################
# Private helper: print a collected -m/-k expression in the form it should
# take on the rebuilt command line.
#   $1 - the raw expression (tokens joined by single spaces)
# One pair of surrounding double quotes is stripped first, so wrapping does
# not produce '"expr"'. Expressions containing a space or "(" are wrapped in
# single quotes; anything else is printed as-is.
_format_marker_expr() {
  local expr="$1"

  if [[ "$expr" == '"'*'"' ]]; then
    expr="${expr#\"}"
    expr="${expr%\"}"
  fi

  if [[ "$expr" == *" "* || "$expr" == *"("* ]]; then
    printf '%s' "'${expr}'"
  else
    printf '%s' "$expr"
  fi
}

# Re-quote multi-word pytest -m/-k expressions whose quotes were stripped.
#   $1     - command string (may contain newlines / backslash-newlines)
#   stdout - the command flattened to one line, with each unquoted
#            multi-word marker/keyword expression wrapped in single quotes
#
# -m/-k only start an expression when a pytest token has been seen earlier
# in the same command segment (segments end at &&, ||, ;, |). Without this
# guard, unrelated uses such as "python3 -m pip install a b" or
# "sort -k 2 file" would have their arguments quoted into a single word.
re_quote_pytest_markers() {
  local input="$1"
  local output=""
  local collecting=false   # true while gathering tokens of a -m/-k expression
  local in_pytest=false    # true once a pytest token was seen in this segment
  local marker_buf=""
  local word is_boundary

  # Strip backslash-newline continuations, then flatten remaining newlines
  local flat="${input//$'\\\n'/ }"
  flat="${flat//$'\n'/ }"

  # Disable globbing while splitting into words, then restore the caller's
  # noglob setting.
  local restore_glob
  restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
  set -o noglob
  local -a words
  read -ra words <<< "$flat"
  eval "$restore_glob"

  # The ${words[@]+...} form keeps an empty array safe under `set -u`.
  for word in ${words[@]+"${words[@]}"}; do
    if $collecting; then
      # A token carrying a literal single quote means the expression was
      # already quoted upstream: emit everything raw and stop collecting.
      if [[ "$word" == *"'"* ]]; then
        if [[ -n "$marker_buf" ]]; then
          output+="${marker_buf} "
          marker_buf=""
        fi
        output+="${word} "
        collecting=false
        continue
      fi

      is_boundary=false
      case "$word" in
        # Line-continuation artifact
        "\\") is_boundary=true ;;
        # Command separators
        "&&"|"||"|";"|"|") is_boundary=true ;;
        # Long flags (--ignore, --shard-id, etc.)
        --*) is_boundary=true ;;
        # Single-letter short flags (-v, -s, ...), including a new -m/-k
        -[a-zA-Z]) is_boundary=true ;;
        # Test path (contains /)
        */*) is_boundary=true ;;
        # Test file (ends with .py, possibly with ::method)
        *.py|*.py::*) is_boundary=true ;;
        # Upper-case VAR=value assignment; lower-case forms such as
        # num_gpus=2 stay part of the expression.
        *=*)
          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
            is_boundary=true
          fi
          ;;
      esac

      if ! $is_boundary; then
        # Still inside the expression: accumulate and move on.
        marker_buf+="${marker_buf:+ }${word}"
        continue
      fi

      # Boundary reached: flush the collected expression.
      if [[ -n "$marker_buf" ]]; then
        output+="$(_format_marker_expr "$marker_buf") "
      fi
      collecting=false
      marker_buf=""

      # Drop stray backslash tokens silently
      if [[ "$word" == "\\" ]]; then
        continue
      fi
      # Otherwise fall through: the boundary token is handled like any
      # ordinary token below (and may itself be a new -m/-k).
    fi

    # Track whether we are inside a pytest invocation.
    case "$word" in
      "&&"|"||"|";"|"|"|*";") in_pytest=false ;;
      *pytest|*py.test) in_pytest=true ;;
    esac

    output+="${word} "
    if $in_pytest && [[ "$word" == "-m" || "$word" == "-k" ]]; then
      collecting=true
      marker_buf=""
    fi
  done

  # Flush any trailing marker expression (marker at end of command)
  if $collecting && [[ -n "$marker_buf" ]]; then
    output+="$(_format_marker_expr "$marker_buf")"
  fi

  # printf instead of echo so output starting with -n/-e is printed verbatim.
  printf '%s\n' "${output% }"
}
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
# Private helper: print " --ignore=<suite>/<file>" for every file given.
#   $1  - suite directory (e.g. kernels/core)
#   $2+ - paths relative to that directory
_rocm_ignore_flags() {
  local suite="$1" rel
  shift
  for rel in "$@"; do
    printf ' --ignore=%s/%s' "$suite" "$rel"
  done
}

# Rewrite a test command string with the ROCm-specific exclusions.
#   $1     - command string
#   stdout - rewritten command string
# Kernel and entrypoints/serve ignores are appended to the END of the whole
# string; entrypoints/openai and entrypoints/llm ignores are spliced in right
# after the directory token (which must be surrounded by spaces to match).
apply_rocm_test_overrides() {
  local cmds="$1"
  local registry_cmd="pytest -v -s models/test_registry.py"
  local flags

  # --- Model registry filter ---
  cmds=${cmds//"$registry_cmd"/"$registry_cmd -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}

  # --- LoRA ---
  # NOTE(review): the original rewrite here replaced "pytest -v -s lora"
  # with the identical text, i.e. no override is currently applied.

  # --- Kernel ignores (appended at the end of the command string) ---
  if [[ $cmds == *" kernels/core"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/core \
      test_fused_quant_layernorm.py \
      test_permute_cols.py)
  fi

  if [[ $cmds == *" kernels/attention"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/attention \
      test_attention_selector.py \
      test_encoder_decoder_attn.py \
      test_flash_attn.py \
      test_flashinfer.py \
      test_prefix_prefill.py \
      test_cascade_flash_attn.py \
      test_mha_attn.py \
      test_lightning_attn.py \
      test_attention.py)
  fi

  if [[ $cmds == *" kernels/quantization"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/quantization \
      test_int8_quant.py \
      test_machete_mm.py \
      test_block_fp8.py \
      test_block_int8.py \
      test_marlin_gemm.py \
      test_cutlass_scaled_mm.py \
      test_int8_kernel.py)
  fi

  if [[ $cmds == *" kernels/mamba"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/mamba \
      test_mamba_mixer2.py \
      test_causal_conv1d.py \
      test_mamba_ssm_ssd.py)
  fi

  if [[ $cmds == *" kernels/moe"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/moe \
      test_moe.py \
      test_cutlass_moe.py)
  fi

  # --- Entrypoint ignores ---
  # Spliced in place: " entrypoints/openai " -> " entrypoints/openai <flags> "
  flags=$(_rocm_ignore_flags entrypoints/openai \
    chat_completion/test_audio.py \
    completion/test_shutdown.py \
    test_completion.py \
    models/test_models.py \
    test_return_tokens_as_ids.py \
    chat_completion/test_root_path.py \
    completion/test_prompt_validation.py)
  cmds=${cmds//" entrypoints/openai "/" entrypoints/openai${flags} "}

  if [[ $cmds == *" entrypoints/serve"* ]]; then
    cmds+=$(_rocm_ignore_flags entrypoints/serve lora/test_lora_adapters.py)
  fi

  flags=$(_rocm_ignore_flags entrypoints/llm \
    test_chat.py \
    test_accuracy.py \
    test_init.py \
    test_prompt_validation.py)
  cmds=${cmds//" entrypoints/llm "/" entrypoints/llm${flags} "}

  # Collapse any literal " \ " sequences (leftover line continuations)
  cmds=$(echo "$cmds" | sed 's/ \\ / /g')
  echo "$cmds"
}
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
cleanup_docker cleanup_docker
echo "--- Resetting GPUs" echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image --- echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container" echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}" docker pull "${image_name}"
remove_docker_container() { remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container" echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}" mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
# ---- Command source selection ---- commands=$@
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact). echo "Commands:$commands"
# Fall back to $* for backward compatibility, but warn that inner
# double-quotes will have been stripped by the calling shell. commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
commands="${VLLM_TEST_COMMANDS}" if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)" commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
else
commands="$*"
if [[ -z "$commands" ]]; then
echo "Error: No test commands provided." >&2
echo "Usage:" >&2
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
echo " Legacy: bash $0 \"commands here\"" >&2
exit 1
fi
echo "Commands sourced from positional args (legacy mode)"
echo "WARNING: Inner double-quotes in the command string may have been"
echo " stripped by the calling shell. If you see syntax errors, switch to:"
echo " export VLLM_TEST_COMMANDS='your commands here'"
echo " bash $0"
fi fi
echo "Raw commands: $commands" commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
# Fix quoting before ROCm overrides (so overrides see correct structure) if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=$(re_quote_pytest_markers "$commands") commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
echo "After re-quoting: $commands" fi
commands=$(apply_rocm_test_overrides "$commands") #ignore certain kernels tests
echo "Final commands: $commands" if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".." MYPYTHONPATH=".."
# Verify GPU access # Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3) render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2 echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1 exit 1
fi fi
# --- RDMA device passthrough (conditional) --- # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
# If the host has RDMA devices, pass them through so tests like if [[ $commands == *"--shard-id="* ]]; then
# test_moriio_connector can access ibverbs. On hosts without RDMA # assign job count as the number of shards used
# hardware the tests will gracefully skip via _rdma_available(). commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
RDMA_FLAGS="" for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
if [ -d /dev/infiniband ]; then # assign shard-id for each shard
echo "RDMA devices detected on host, enabling passthrough" commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK" echo "Shard ${GPU} commands:$commands_gpu"
else echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
echo "No RDMA devices found on host, RDMA tests will be skipped" docker run \
fi --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
# --- Route: multi-node vs single-node --- --shm-size=16gb \
if is_multi_node "$commands"; then --group-add "$render_gid" \
echo "--- Multi-node job detected" --rm \
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/') -e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds] -e AWS_ACCESS_KEY_ID \
# BASH_REMATCH[1] = prefix (everything before first bracket) -e AWS_SECRET_ACCESS_KEY \
# BASH_REMATCH[2] = comma-separated node0 commands -v "${HF_CACHE}:${HF_MOUNT}" \
# BASH_REMATCH[3] = comma-separated node1 commands -e "HF_HOME=${HF_MOUNT}" \
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then -e "PYTHONPATH=${MYPYTHONPATH}" \
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g') --name "${container_name}_${GPU}" \
echo "PREFIX: ${prefix}" "${image_name}" \
/bin/bash -c "${commands_gpu}" \
export composite_command="(command rocm-smi || true)" |& while read -r line; do echo ">>Shard $GPU: $line"; done &
saved_IFS=$IFS PIDS+=($!)
IFS=',' done
read -ra node0 <<< "${BASH_REMATCH[2]}" #wait for all processes to finish and collect exit codes
read -ra node1 <<< "${BASH_REMATCH[3]}" for pid in "${PIDS[@]}"; do
IFS=$saved_IFS wait "${pid}"
STATUS+=($?)
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then done
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index." for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
fi fi
done
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done
/bin/bash -c "${composite_command}"
exit_code=$?
cleanup_network
handle_pytest_exit "$exit_code"
else
echo "Multi-node job detected but failed to parse bracket command syntax."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
echo "Got: $commands"
cleanup_network
exit 111
fi
else else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
$RDMA_FLAGS \ --network=host \
--network=host \ --shm-size=16gb \
--shm-size=16gb \ --group-add "$render_gid" \
--group-add "$render_gid" \ --rm \
--rm \ -e HF_TOKEN \
-e HF_TOKEN \ -e AWS_ACCESS_KEY_ID \
-e AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY \
-e AWS_SECRET_ACCESS_KEY \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e BUILDKITE_PARALLEL_JOB \ -e "HF_HOME=${HF_MOUNT}" \
-e BUILDKITE_PARALLEL_JOB_COUNT \ -e "PYTHONPATH=${MYPYTHONPATH}" \
-v "${HF_CACHE}:${HF_MOUNT}" \ --name "${container_name}" \
-e "HF_HOME=${HF_MOUNT}" \ "${image_name}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \ /bin/bash -c "${commands}"
-e "PYTORCH_ROCM_ARCH=" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
exit_code=$?
handle_pytest_exit "$exit_code"
fi fi

View File

@@ -1,65 +0,0 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce sub-processes for acceleration
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
wait_for_pid_and_check_log() {
local pid="$1"
local log_file="$2"
local exit_status
if [ -z "$pid" ] || [ -z "$log_file" ]; then
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
return 1
fi
echo "Waiting for process $pid to finish..."
# Use the 'wait' command to pause the script until the specific PID exits.
# The 'wait' command's own exit status will be that of the waited-for process.
if wait "$pid"; then
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Success)."
else
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Failure)."
fi
if [ "$exit_status" -ne 0 ]; then
echo "Process exited with a non-zero status."
echo "--- Last few lines of log file: $log_file ---"
tail -n 50 "$log_file"
echo "---------------------------------------------"
return 1 # Indicate failure based on exit status
fi
echo "No errors detected in log file and process exited successfully."
return 0
}
# Test Sky Lake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!
# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!
# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
wait_for_pid_and_check_log $PID_TEST_2 test_2.log

View File

@@ -1,44 +0,0 @@
#!/bin/bash
# Distributed serving smoke test: start `vllm serve` with a given parallelism
# layout, run a short random-prompt benchmark against it, and fail if the
# benchmark reports any failed request. Runs once for PP+TP and once for DP+TP.
set -euox pipefail

export VLLM_CPU_CI_ENV=0
export VLLM_CPU_KVCACHE_SPACE=1 # avoid OOM

MODEL="meta-llama/Llama-3.2-3B-Instruct"
server_pid=""

# Stop the currently running server, if any. Safe to call repeatedly.
stop_server() {
  if [[ -n "$server_pid" ]]; then
    kill -s SIGTERM "$server_pid" 2>/dev/null || true
    wait "$server_pid" || true
    server_pid=""
  fi
}
# Make sure the server is not left running when the script aborts early
# (readiness timeout, benchmark failure under `set -e`, ...).
trap stop_server EXIT

# Run one serve + benchmark cycle.
#   $1  - label printed as the log section header
#   $2  - result file name inside ./test_results
#   $3+ - parallelism flags passed to `vllm serve`
run_parallel_case() {
  local label="$1"
  local result_file="$2"
  shift 2
  local failed_req

  echo "--- ${label}"
  vllm serve "$MODEL" "$@" --max-model-len=4096 &
  server_pid=$!

  # Wait (up to 10 minutes) for the server to answer on the models endpoint.
  timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1

  vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model "$MODEL" \
    --num-prompts 20 \
    --result-dir ./test_results \
    --result-filename "$result_file" \
    --save-result \
    --endpoint /v1/completions

  stop_server

  failed_req=$(jq '.failed' "./test_results/${result_file}")
  # String comparison on purpose: a missing key makes jq print "null", which
  # an integer test would reject with an error and thereby treat as success.
  if [[ "$failed_req" != "0" ]]; then
    echo "Some requests were failed!"
    exit 1
  fi
}

run_parallel_case "PP+TP" tp_pp.json -tp=2 -pp=2
run_parallel_case "DP+TP" dp_pp.json -tp=2 -dp=2

View File

@@ -1,73 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-31}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-31}
export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup
# Force-remove the "cpu-test" container. A failing `docker rm` (for example
# when no such container exists) is ignored, so the function always succeeds.
# Note that it also switches errexit on for the calling shell.
remove_docker_container() {
  set -e
  docker rm -f cpu-test || :
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
# Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
# Run the CPU sanity suite inside the already-running "cpu-test" container.
# Each stage is a separate `docker exec` whose inner shell runs with
# `set -e`; together with the outer `set -e`, the first failing stage
# aborts the whole function.
function cpu_tests() {
set -e
# list installed packages
docker exec cpu-test bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test bash -c "
set -e
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
# Run quantized model tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Run kernel tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
# The script below is single-quoted, so $E2E_OMP_THREADS and $server_pid are
# expanded by the shell inside the container, not by this host shell.
# The server is started in the background, polled for up to 600s until the
# models endpoint answers, benchmarked, and then sent SIGTERM (the kill is
# itself backgrounded, so server shutdown is not waited for).
docker exec cpu-test bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All CPU tests are expected to finish in under 40 minutes; note that the timeout enforced below is 2h.
export -f cpu_tests
timeout 2h bash -c cpu_tests

View File

@@ -25,25 +25,23 @@ function cpu_tests() {
# offline inference # offline inference
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve set -xve
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test # Run basic model test
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -evx set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator tblib pip install sentence-transformers datamodel_code_generator
# Note: disable Bart until supports V1 # Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.

View File

@@ -2,19 +2,119 @@
# This script build the CPU docker image and run the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -euox pipefail set -ex
# allow to bind to different cores # allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95} CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1} NUMA_NODE=${NUMA_NODE:-1}
IMAGE_NAME="cpu-test-$NUMA_NODE"
TIMEOUT_VAL=$1
TEST_COMMAND=$2
# building the docker image export CMAKE_BUILD_PARALLEL_LEVEL=32
echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu . # Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \ docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}" docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
# Runs the CPU test suite through `docker exec` in the containers named
# cpu-test-<NUMA_NODE> and cpu-test-<NUMA_NODE>-avx2 (both are expected to be
# running already).
# Arguments:
#   $1 - not read by this function
#   $2 - NUMA node id; exported as NUMA_NODE and used to build container names
# `set -e` makes the function stop at the first failing `docker exec`.
function cpu_tests() {
set -e
export NUMA_NODE=$2
# List the installed Python packages in both containers
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# Offline inference smoke test (runs in the -avx2 container only)
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model tests (the language generation tests run twice: once as-is
# and once with VLLM_CPU_SGL_KERNEL=1)
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
# pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/lora/test_qwen2vl.py"
# Online serving: tp+pp.
# The script is single-quoted, so $E2E_OMP_THREADS and $server_pid are expanded
# by the shell inside the container, not by this script. The server is started
# in the background, polled for up to 600s, benchmarked, then sent SIGTERM.
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# Online serving: tp+dp (same flow as above, with -dp=2 instead of -pp=2)
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All CPU tests are expected to finish in less than 40 minutes; the timeout below is the hard upper bound.
export -f cpu_tests
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
' '

View File

@@ -1,49 +1,21 @@
#!/bin/bash #!/bin/bash
# This script builds the HPU docker image and runs the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
#
# vllm-gaudi compatibility pinning:
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
# When upstream vllm changes its API, the plugin may break before it has been updated.
# To handle this, the vllm-gaudi repository maintains a file:
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
# The first line of that file controls what version of vllm is used inside the Docker image:
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
# the test to a known-compatible baseline.
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
set -exuo pipefail set -exuo pipefail
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
VLLM_COMMUNITY_COMMIT=$(curl -s \
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
| head -1 | tr -d '\n')
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
# Try building the docker image # Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}" cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t "${image_name}" -f - .
FROM gaudi-base-image:latest FROM gaudi-base-image:latest
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
# the current checkout (the Buildkite CI commit) is used unchanged.
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
fi
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1 ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)' RUN VLLM_TARGET_DEVICE=empty pip install .
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
# install development dependencies (for testing) # install development dependencies (for testing)
@@ -64,20 +36,15 @@ EOF
# functions, while other platforms only need one remove_docker_container # functions, while other platforms only need one remove_docker_container
# function. # function.
EXITCODE=1 EXITCODE=1
remove_docker_containers() { docker rm -f "${container_name}" || true; } remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers remove_docker_containers
echo "Running HPU plugin v1 test" echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name="${container_name}" --network=host \ docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
-e HABANA_VISIBLE_DEVICES=all \ -e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \ hpu-plugin-v1-test-env \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
-e PT_HPU_LAZY_MODE=1 \
"${image_name}" \
/bin/bash -c '
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
'
EXITCODE=$? EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then if [ $EXITCODE -eq 0 ]; then

View File

@@ -1,276 +0,0 @@
#!/bin/bash
# This script runs tests inside the Intel XPU docker container.
# It mirrors the structure of run-amd-test.sh while keeping Intel-specific
# container setup and allowing commands to be sourced from YAML or env.
#
# Command sources (in priority order):
# 1) VLLM_TEST_COMMANDS env var (preferred, preserves quoting)
# 2) Positional args (legacy)
# 3) One or more YAML files with a commands list (test-area style)
###############################################################################
set -o pipefail

# Dry-run is enabled either through the DRY_RUN environment variable or by
# passing --dry-run as the first argument (the flag is then consumed).
: "${DRY_RUN:=0}"
case "${1:-}" in
  --dry-run)
    DRY_RUN=1
    shift
    ;;
esac

# Add the parent of the current working directory to Python's search path.
export PYTHONPATH=".."
###############################################################################
# Helper Functions
###############################################################################
# Frees Docker disk space when the filesystem that holds Docker's root
# directory is more than 70% full.
#
# Takes no arguments. Exits the script with status 1 when the Docker root
# directory cannot be determined. When the usage percentage cannot be parsed,
# a warning is printed and no cleanup is attempted.
cleanup_docker() {
  local docker_root disk_usage
  local threshold=70

  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory." >&2
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # -P selects the POSIX output format: exactly one line per filesystem, so
  # field 5 of the last line is always the capacity percentage even when the
  # device name is long.
  disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  # Guard the numeric comparison below against empty or malformed df output.
  if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
    echo "Could not determine disk usage for $docker_root; skipping cleanup." >&2
    return 0
  fi

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    docker image prune -f
    # The system-wide prune only runs if the volume prune succeeded.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
# Re-wraps multi-word pytest `-m` / `-k` expressions in single quotes.
#
# The command string in $1 is split on whitespace. After a `-m` or `-k` word,
# the following words are collected as the expression until a "boundary" word
# is seen. The collected expression is emitted wrapped in single quotes when
# it contains a space or "(", and unchanged otherwise. All other words are
# copied through as-is, separated by single spaces.
# (Presumably this restores quotes that were lost before the command string
# reached this script -- confirm against the callers.)
#
# Arguments: $1 - command string (may contain newlines)
# Output:    the rewritten command string on stdout, on one line
re_quote_pytest_markers() {
local input="$1"
local output=""
local collecting=false
local marker_buf=""
# Newlines are replaced by spaces so the whole input is split as one line.
local flat="${input//$'\n'/ }"
# Remember the current noglob setting, turn globbing off while the input is
# split into words, then restore the previous setting.
local restore_glob
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
set -o noglob
local -a words
read -ra words <<< "$flat"
eval "$restore_glob"
for word in "${words[@]}"; do
if $collecting; then
# A word that contains a single quote stops the collection: whatever was
# buffered so far and the word itself are emitted without adding quotes.
if [[ "$word" == *"'"* ]]; then
if [[ -n "$marker_buf" ]]; then
output+="${marker_buf} "
marker_buf=""
fi
output+="${word} "
collecting=false
continue
fi
# Decide whether this word ends the expression being collected.
local is_boundary=false
case "$word" in
# shell operators
"&&"|"||"|";"|"|")
is_boundary=true ;;
# long options
--*)
is_boundary=true ;;
# single-letter options (this includes -m and -k themselves)
-[a-zA-Z])
is_boundary=true ;;
# anything containing a slash
*/*)
is_boundary=true ;;
# Python files and pytest node ids
*.py|*.py::*)
is_boundary=true ;;
# assignments, but only when the name part is upper-case/underscore/digits
*=*)
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
is_boundary=true
fi
;;
esac
if $is_boundary; then
# Flush the collected expression, quoting it only when it has several
# words or contains a parenthesis.
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}' "
else
output+="${marker_buf} "
fi
collecting=false
marker_buf=""
# The boundary word may itself start a new -m / -k expression.
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
output+="${word} "
collecting=true
else
output+="${word} "
fi
else
# Not a boundary: append the word to the expression buffer.
if [[ -n "$marker_buf" ]]; then
marker_buf+=" ${word}"
else
marker_buf="${word}"
fi
fi
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
# Start collecting the expression that follows -m / -k.
output+="${word} "
collecting=true
marker_buf=""
else
output+="${word} "
fi
done
# Input ended while still collecting: flush the remaining expression.
if $collecting && [[ -n "$marker_buf" ]]; then
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}'"
else
output+="${marker_buf}"
fi
fi
# Drop the single trailing space left by the word-by-word concatenation.
echo "${output% }"
}
# Hook for Intel-specific exclusions/overrides of the test command string.
# It is currently a pass-through: the string given in $1 is echoed unchanged.
apply_intel_test_overrides() {
  local command_string="$1"
  echo "$command_string"
}
# Succeeds (status 0) only when $1 ends in ".yaml" and names an existing
# regular file; returns 1 otherwise.
is_yaml_file() {
  local candidate="$1"
  [[ "$candidate" == *.yaml ]] || return 1
  [[ -f "$candidate" ]]
}
# Prints the list items found after a `commands:` key in the YAML file $1,
# one per line, with the leading "- " removed.
#
# This is a line-based awk scan, not a YAML parser:
#   * scanning starts at the first line whose first field is `commands:`;
#   * every later line that starts with optional whitespace followed by "- "
#     is printed (regardless of which key it belongs to);
#   * scanning stops at the first line that begins with a non-whitespace
#     character and is not such a list item;
#   * all other lines (indented keys, continuation lines) are skipped.
# Nothing is printed when the file has no `commands:` line.
extract_yaml_commands() {
local yaml_path="$1"
awk '
$1 == "commands:" { in_cmds=1; next }
in_cmds && $0 ~ /^[[:space:]]*-[[:space:]]/ {
sub(/^[[:space:]]*-[[:space:]]/, "");
print;
next
}
in_cmds && $0 ~ /^[^[:space:]]/ { exit }
' "$yaml_path"
}
###############################################################################
# Main
###############################################################################
# Default image reference, built from REGISTRY, REPO and the Buildkite commit.
default_image_name="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-xpu"
#default_image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-xpu"
# IMAGE_TAG_XPU, when set and non-empty, replaces the default image reference.
image_name="${IMAGE_TAG_XPU:-${default_image_name}}"
# The container name ends in 10 random alphanumeric characters read from
# /dev/urandom (presumably to keep names unique across jobs for one commit).
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# ---- Command source selection ----
# Builds the global `commands` string from, in priority order:
#   1) the VLLM_TEST_COMMANDS environment variable,
#   2) positional arguments -- YAML files if every argument is one, otherwise
#      the arguments joined with spaces (legacy mode),
#   3) intel-test.yaml located next to this script.

# Appends every command that extract_yaml_commands finds in the YAML file $1
# to the global `commands` string, joining entries with " && ".
# Exits the script with status 1 when the file yields no commands.
append_yaml_commands() {
  local yaml_path="$1"
  local -a yaml_cmds
  local cmd

  mapfile -t yaml_cmds < <(extract_yaml_commands "$yaml_path")
  if [[ ${#yaml_cmds[@]} -eq 0 ]]; then
    echo "Error: No commands found in ${yaml_path}" >&2
    exit 1
  fi

  for cmd in "${yaml_cmds[@]}"; do
    if [[ -z "$commands" ]]; then
      commands="${cmd}"
    else
      commands+=" && ${cmd}"
    fi
  done
}

commands=""
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
  commands="${VLLM_TEST_COMMANDS}"
  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
elif [[ $# -gt 0 ]]; then
  # YAML mode is used only when every positional argument is a YAML file.
  all_yaml=true
  for arg in "$@"; do
    if ! is_yaml_file "$arg"; then
      all_yaml=false
      break
    fi
  done

  if $all_yaml; then
    for yaml in "$@"; do
      append_yaml_commands "$yaml"
    done
    echo "Commands sourced from YAML files: $*"
  else
    commands="$*"
    echo "Commands sourced from positional args (legacy mode)"
  fi
else
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  DEFAULT_YAML="${SCRIPT_DIR}/intel-test.yaml"
  if [[ ! -f "${DEFAULT_YAML}" ]]; then
    echo "Error: YAML file not found: ${DEFAULT_YAML}" >&2
    exit 1
  fi
  append_yaml_commands "${DEFAULT_YAML}"
  echo "Commands sourced from default YAML: ${DEFAULT_YAML}"
fi

if [[ -z "$commands" ]]; then
  echo "Error: No test commands provided." >&2
  exit 1
fi

# Post-process the command string, logging it after every stage.
echo "Raw commands: $commands"
commands=$(re_quote_pytest_markers "$commands")
echo "After re-quoting: $commands"
commands=$(apply_intel_test_overrides "$commands")
echo "Final commands: $commands"
# Dry-run mode: stop here, before any Docker command is executed.
if [[ "$DRY_RUN" == "1" ]]; then
  echo "DRY_RUN=1 set, skipping Docker execution."
  exit 0
fi

# --- Docker housekeeping ---
cleanup_docker

# --- Pull test image ---
# image_name already equals IMAGE_TAG_XPU whenever that override is set and
# non-empty, so one message and one pull cover both cases.
echo "Using prebuilt XPU image: ${image_name}"
docker pull "${image_name}"
# Cleanup handler installed as the EXIT trap: removes the test container,
# removes the test image, then prunes unused Docker data. Each step is
# best-effort, so a failure never prevents the following steps.
remove_docker_container() {
  docker rm -f "${container_name}" || :
  docker image rm -f "${image_name}" || :
  docker system prune -f || :
}
# Always clean up the container and image when the script exits.
trap remove_docker_container EXIT

# --- Single-node job ---
[[ -n "${ZE_AFFINITY_MASK:-}" ]] ||
  echo "Warning: ZE_AFFINITY_MASK is not set. Proceeding without device affinity." >&2

# Options for `docker run`. The test commands travel into the container in the
# CMDS environment variable and are run there with `eval`.
docker_run_args=(
  --device /dev/dri:/dev/dri
  --net=host
  --ipc=host
  --privileged
  -v /dev/dri/by-path:/dev/dri/by-path
  --entrypoint=""
  -e "HF_TOKEN=${HF_TOKEN:-}"
  -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-}"
  -e "CMDS=${commands}"
  --name "${container_name}"
)

# Last command of the script: its exit status becomes the script's status.
docker run "${docker_run_args[@]}" "${image_name}" \
  bash -c 'set -e; echo "ZE_AFFINITY_MASK is ${ZE_AFFINITY_MASK:-}"; eval "$CMDS"'

View File

@@ -41,7 +41,6 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1 exit 1
fi fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}" source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0 return 0
@@ -49,8 +48,9 @@ get_config() {
# get test running configuration. # get test running configuration.
fetch_vllm_test_cfg fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script. # Check if the function call was successful. If not, exit the script.
if ! get_config; then if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
@@ -62,19 +62,18 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}" echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}" builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}" builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}" mkdir -p ${builder_cache_dir}
# Try building the docker image # Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \ cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \ --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \ --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest="${builder_cache_dir}",mode=max \ --cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t "${image_name}" -f - . --progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME} FROM ${BASE_IMAGE_NAME}
# Define environments # Define environments
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \ RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \ pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
@@ -116,7 +115,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \ export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +138,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards. # e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns one argument per line: --device, /dev/davinciX, ... # returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() { parse_and_gen_devices() {
local input="$1" local input="$1"
local index cards_num local index cards_num
@@ -151,24 +150,29 @@ parse_and_gen_devices() {
return 1 return 1
fi fi
local devices=""
local i=0 local i=0
while (( i < cards_num )); do while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i )) local dev_idx=$(((index - 1)*cards_num + i ))
printf '%s\n' "--device" devices="$devices --device /dev/davinci${dev_idx}"
printf '%s\n' "/dev/davinci${dev_idx}"
((i++)) ((i++))
done done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
} }
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with # This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend. # the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx} model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p "${model_cache_dir}" mkdir -p ${model_cache_dir}
docker run \ docker run \
"${device_args[@]}" \ ${devices} \
--device /dev/davinci_manager \ --device /dev/davinci_manager \
--device /dev/devmm_svm \ --device /dev/devmm_svm \
--device /dev/hisi_hdc \ --device /dev/hisi_hdc \
@@ -177,7 +181,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \ -v /etc/ascend_install.info:/etc/ascend_install.info \
-v "${model_cache_dir}":/root/.cache/modelscope \ -v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \ --entrypoint="" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"
@@ -127,7 +127,7 @@ run_and_track_test() {
# --- Actual Test Execution --- # --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \ run_and_track_test 1 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \ run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \ run_and_track_test 3 "test_lora.py" \

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image # Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu . docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
@@ -33,22 +33,19 @@ docker run \
bash -c ' bash -c '
set -e set -e
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/core
pytest -v -s v1/engine pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)" pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
' '

View File

@@ -1,62 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Push ROCm nightly base image and nightly image from ECR
# to Docker Hub as vllm/vllm-openai-rocm:base-nightly and vllm/vllm-openai-rocm:nightly
# and vllm/vllm-openai-rocm:base-nightly-<commit> and vllm/vllm-openai-rocm:nightly-<commit>.
# Run when NIGHTLY=1 after build-rocm-release-image has pushed to ECR.
#
# Local testing (no push to Docker Hub):
# BUILDKITE_COMMIT=<commit-with-rocm-image-in-ecr> DRY_RUN=1 bash .buildkite/scripts/push-nightly-builds-rocm.sh
# Requires: AWS CLI configured (for ECR public login), Docker. For full run: Docker Hub login.
set -ex

# BUILDKITE_COMMIT is mandatory: the script aborts with this hint when it is
# unset or empty. DRY_RUN defaults to 0.
BUILDKITE_COMMIT="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA that has the ROCm image in ECR (e.g. from a previous release pipeline run)}"
DRY_RUN="${DRY_RUN:-0}"

ecr_release_repo="public.ecr.aws/q9t5s3a7/vllm-release-repo"
hub_repo="vllm/vllm-openai-rocm"

# Base image reference: taken from Buildkite metadata when available,
# otherwise derived from the commit SHA.
BASE_ORIG_TAG="$(buildkite-agent meta-data get rocm-base-ecr-tag 2>/dev/null || echo "")"
if [ -z "$BASE_ORIG_TAG" ]; then
  echo "WARNING: rocm-base-ecr-tag metadata not found, falling back to commit-based tag"
  BASE_ORIG_TAG="${ecr_release_repo}:${BUILDKITE_COMMIT}-rocm-base"
fi

ORIG_TAG="${BUILDKITE_COMMIT}-rocm"
BASE_TAG_NAME="base-nightly"
TAG_NAME="nightly"
BASE_TAG_NAME_COMMIT="base-nightly-${BUILDKITE_COMMIT}"
TAG_NAME_COMMIT="nightly-${BUILDKITE_COMMIT}"
release_image="${ecr_release_repo}:${ORIG_TAG}"

echo "Pushing ROCm base image from ECR: $BASE_ORIG_TAG"
echo "Pushing ROCm release image from ECR tag: $ORIG_TAG to Docker Hub as $TAG_NAME and $TAG_NAME_COMMIT"
if [[ "$DRY_RUN" == "1" ]]; then
  echo "[DRY_RUN] Skipping push to Docker Hub"
fi

# Log in to ECR Public and pull both source images.
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull "$BASE_ORIG_TAG"
docker pull "$release_image"

# Create the Docker Hub tags: two for the base image, two for the release image.
for hub_tag in "$BASE_TAG_NAME" "$BASE_TAG_NAME_COMMIT"; do
  docker tag "$BASE_ORIG_TAG" "${hub_repo}:${hub_tag}"
done
for hub_tag in "$TAG_NAME" "$TAG_NAME_COMMIT"; do
  docker tag "$release_image" "${hub_repo}:${hub_tag}"
done

# In dry-run mode the local tags exist, but nothing is pushed.
if [[ "$DRY_RUN" == "1" ]]; then
  echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
  echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
  echo "[DRY_RUN] Local tags created. Exiting without push."
  exit 0
fi

# Push to Docker Hub (docker-login plugin runs before this step in CI)
for hub_tag in "$BASE_TAG_NAME" "$BASE_TAG_NAME_COMMIT" "$TAG_NAME" "$TAG_NAME_COMMIT"; do
  docker push "${hub_repo}:${hub_tag}"
done

echo "Pushed vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
echo "Pushed vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"

View File

@@ -1,36 +0,0 @@
#!/bin/bash
set -ex

# Fail fast when the commit SHA is missing: without it every image reference
# below would degenerate into a malformed tag such as ":-x86_64".
ORIG_TAG_NAME="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA whose release images are in ECR}"

# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
# otherwise they will be cleaned up together with the main "nightly" tags.
TAG_VARIANT="${1:-}"
if [ -n "$TAG_VARIANT" ]; then
  ORIG_TAG_SUFFIX="-$TAG_VARIANT"
  TAG_NAME="$TAG_VARIANT-nightly"
else
  ORIG_TAG_SUFFIX=""
  TAG_NAME="nightly"
fi

ecr_repo="public.ecr.aws/q9t5s3a7/vllm-release-repo"
hub_repo="vllm/vllm-openai"

echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"

# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
for arch in x86_64 aarch64; do
  docker pull "${ecr_repo}:${ORIG_TAG_NAME}-${arch}${ORIG_TAG_SUFFIX}"
done

# tag arch-dependent images
for arch in x86_64 aarch64; do
  docker tag "${ecr_repo}:${ORIG_TAG_NAME}-${arch}${ORIG_TAG_SUFFIX}" "${hub_repo}:${TAG_NAME}-${arch}"
done

# push arch-dependent images to DockerHub
# (kept as a separate phase so nothing is pushed unless both pulls and tags succeeded)
for arch in x86_64 aarch64; do
  docker push "${hub_repo}:${TAG_NAME}-${arch}"
done

# push arch-independent manifests to DockerHub: one under the moving nightly
# tag and one under a commit-specific tag, both referencing the two
# arch-dependent images pushed above
docker manifest create "${hub_repo}:${TAG_NAME}" "${hub_repo}:${TAG_NAME}-x86_64" "${hub_repo}:${TAG_NAME}-aarch64" --amend
docker manifest create "${hub_repo}:${TAG_NAME}-${ORIG_TAG_NAME}" "${hub_repo}:${TAG_NAME}-x86_64" "${hub_repo}:${TAG_NAME}-aarch64" --amend
docker manifest push "${hub_repo}:${TAG_NAME}"
docker manifest push "${hub_repo}:${TAG_NAME}-${ORIG_TAG_NAME}"

View File

@@ -2,17 +2,6 @@
set -euox pipefail set -euox pipefail
# To detect ROCm
# Check multiple indicators:
if [ -e /dev/kfd ] || \
[ -d /opt/rocm ] || \
command -v rocm-smi &> /dev/null || \
[ -n "${ROCM_HOME:-}" ]; then
IS_ROCM=1
else
IS_ROCM=0
fi
if [[ $# -lt 4 ]]; then if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1 exit 1
@@ -37,18 +26,13 @@ for command in "${COMMANDS[@]}"; do
echo "$command" echo "$command"
done done
start_network() { start_network() {
docker network create --subnet=192.168.10.0/24 docker-net docker network create --subnet=192.168.10.0/24 docker-net
} }
start_nodes() { start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do for node in $(seq 0 $(($NUM_NODES-1))); do
if [ "$IS_ROCM" -eq 1 ]; then GPU_DEVICES='"device='
GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
else
GPU_DEVICES='--gpus "device='
fi
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM)) GPU_DEVICES+=$(($DEVICE_NUM))
@@ -56,9 +40,7 @@ start_nodes() {
GPU_DEVICES+=',' GPU_DEVICES+=','
fi fi
done done
if [ "$IS_ROCM" -eq 0 ]; then GPU_DEVICES+='"'
GPU_DEVICES+='"'
fi
# start the container in detached mode # start the container in detached mode
# things to note: # things to note:
@@ -67,7 +49,7 @@ start_nodes() {
# 3. map the huggingface cache directory to the container # 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11) # starting from 192.168.10.11)
docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \ docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
/bin/bash -c "tail -f /dev/null" /bin/bash -c "tail -f /dev/null"

View File

@@ -0,0 +1,59 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
#
# Steps: clone Prime-RL into the repository root, drop its vllm pin from
# pyproject.toml, sync its dependencies with uv into /usr/local, verify that
# vllm and prime_rl import, then run the GPU integration test.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
  echo "Removing existing Prime-RL directory..."
  rm -rf "${PRIME_RL_DIR}"
fi

# Install UV if not available
if ! command -v uv &> /dev/null; then
  echo "Installing UV package manager..."
  curl -LsSf https://astral.sh/uv/install.sh | sh
  # Quoted so a HOME containing spaces does not split the path.
  # shellcheck source=/dev/null
  source "$HOME/.local/bin/env"
fi

# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"

echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
# -f: a plain `ln -s` fails when the link already exists (re-run on the same
# host, or an image that already ships /usr/local/bin/python) and would abort
# the whole script under `set -e`.
ln -sf /usr/bin/python3 /usr/local/bin/python

# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml

# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras

# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"

echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@@ -17,17 +17,7 @@ wait_for_server() {
} }
MODEL="deepseek-ai/DeepSeek-V2-lite" MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() { cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -43,6 +33,7 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \ VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--tensor-parallel-size 2 \ --tensor-parallel-size 2 \
@@ -51,14 +42,13 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \ --enable-eplb \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--all2all-backend "$BACK" \ --port $PORT &
--port "$PORT" &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,69 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail

# Nightly end-to-end test for prefetch offloading with a MoE model.
#
# Serves DeepSeek-V2-Lite with prefetch offloading of the MoE expert weights,
# runs the GSM8K eval against it and fails unless accuracy >= THRESHOLD.
#
# Positional args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#
# Environment:
#   ATTENTION_BACKEND  attention backend passed to `vllm serve` (e.g.
#                      FLASH_ATTN, ROCM_ATTN, FLASHINFER); when unset the
#                      flag is not passed at all.
#   OUT_DIR            directory for the eval result JSON
#                      (default /tmp/vllm-scheduled).

THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

MODEL="deepseek-ai/DeepSeek-V2-Lite"

# Poll the /health endpoint of the server on port $1 once a second, giving
# up (non-zero status from `timeout`) after 600 seconds.
wait_for_server() {
  local target_port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      sleep 1
    done' _ "$target_port"
}

# Stop the background server if it is still running: polite SIGTERM first,
# poll up to 20 times at 0.5s intervals for it to go away, then SIGKILL
# whatever is left.
cleanup() {
  [[ -n "${SERVER_PID:-}" ]] || return 0
  kill -0 "${SERVER_PID}" 2>/dev/null || return 0
  kill "${SERVER_PID}" 2>/dev/null || true
  local attempt
  for ((attempt = 0; attempt < 20; attempt++)); do
    kill -0 "${SERVER_PID}" 2>/dev/null || break
    sleep 0.5
  done
  kill -9 "${SERVER_PID}" 2>/dev/null || true
}
trap cleanup EXIT

# ── Build optional vllm serve flags ─────────────────────────────────────
EXTRA_ARGS=()
if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
  echo "Using attention backend: ${ATTENTION_BACKEND}"
  EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
fi

SERVE_CMD=(
  vllm serve "$MODEL"
  --max-model-len 2048
  --offload-group-size 8
  --offload-num-in-group 2
  --offload-prefetch-step 1
  --offload-params w13_weight w2_weight
  --port "$PORT"
)
# The ${VAR+...} form keeps an empty EXTRA_ARGS from tripping `set -u`.
"${SERVE_CMD[@]}" ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
SERVER_PID=$!
wait_for_server "$PORT"

TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"

# Gate on the saved accuracy; the heredoc is expanded by the shell first.
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY

cleanup
SERVER_PID=

View File

@@ -1,12 +1,10 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -euxo pipefail set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE] # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.8} THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319} NUM_Q=${2:-1319}
PORT=${3:-8020} PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}" mkdir -p "${OUT_DIR}"
@@ -19,16 +17,7 @@ wait_for_server() {
} }
MODEL="QWen/Qwen3-30B-A3B-FP8" MODEL="QWen/Qwen3-30B-A3B-FP8"
# Set BACKENDS based on platform BACKENDS=("deepep_high_throughput" "deepep_low_latency")
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() { cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -44,23 +33,21 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \ VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-eplb \ --tensor-parallel-size 2 \
--all2all-backend "$BACK" \ --data-parallel-size 2 \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
--enable-expert-parallel \ --enable-expert-parallel \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--port "$PORT" & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,78 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail

# GSM8K accuracy check for Qwen3-Next served with expert parallelism, EPLB
# and MTP speculative decoding. One server is started per all2all backend;
# each run must reach THRESHOLD accuracy.
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Poll the server's /health endpoint once a second, for at most 600 seconds.
wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS and platform-specific args based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
  PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
  echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
  PLATFORM_ARGS=()
fi

# Stop the background server: SIGTERM, wait up to 20 x 0.5s, then SIGKILL.
cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  # PLATFORM_ARGS is empty on non-ROCm platforms. A bare "${PLATFORM_ARGS[@]}"
  # is an "unbound variable" error under `set -u` on bash older than 4.4, so
  # expand it through the ${VAR[@]+...} guard instead.
  VLLM_DEEP_GEMM_WARMUP=skip \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --all2all-backend "$BACK" \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    ${PLATFORM_ARGS[@]+"${PLATFORM_ARGS[@]}"} \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"

  # Gate on the saved accuracy; the heredoc is expanded by the shell first.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  # Tear the server down and move to a fresh port for the next backend.
  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done

View File

@@ -1,248 +0,0 @@
#!/bin/bash
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
# evaluation against a local vLLM server.
#
# Usage:
# # Run with defaults (gpt-oss-20b, multi_turn)
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Run with gpt-oss-120b and multiple test categories
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# Environment variables (all optional, with defaults):
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
# BFCL_PORT - Server port (default: 8000)
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
# BFCL_EXTRA_ARGS - Additional vLLM server args
set -euo pipefail

# ---- Configuration ----
# Each setting takes its value from the matching BFCL_* environment variable
# and falls back to the default shown here.
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
TP_SIZE="${BFCL_TP_SIZE:-1}"
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
PORT="${BFCL_PORT:-8000}"
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"

# Create the output directory when one was requested and resolve it to an
# absolute path.
if [[ -n "$OUTPUT_DIR" ]]; then
  mkdir -p "$OUTPUT_DIR"
  OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
fi

# Print the effective configuration.
cat <<EOF
============================================
BFCL Tool Call Correctness Evaluation
============================================
Model: $MODEL
Tool parser: $TOOL_CALL_PARSER
API type: $API_TYPE
Output dir: ${OUTPUT_DIR:-<cwd>}
Test category: $TEST_CATEGORY
TP size: $TP_SIZE
Max model len: $MAX_MODEL_LEN
Port: $PORT
Num threads: $NUM_THREADS
============================================
EOF

# ---- Install bfcl-eval if missing ----
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
  echo "Installing bfcl-eval..."
  pip install "bfcl-eval>=2025.10.20.1,<2026"
fi

# ---- Cleanup handler ----
# Runs on every exit: stops the server when one was started and removes the
# lock directories BFCL leaves behind (created by filelock for thread-safe
# writes).
SERVER_PID=""
cleanup() {
  if [[ -n "$SERVER_PID" ]]; then
    echo "Stopping vLLM server (pid=$SERVER_PID)..."
    kill "$SERVER_PID" 2>/dev/null || true
    wait "$SERVER_PID" 2>/dev/null || true
  fi
  local lock_dirs=(.file_locks/)
  if [[ -n "${OUTPUT_DIR:-}" ]]; then
    lock_dirs+=("$OUTPUT_DIR/.file_locks/")
  fi
  local lock_dir
  for lock_dir in "${lock_dirs[@]}"; do
    rm -rf "$lock_dir"
  done
}
trap cleanup EXIT
# ---- Start vLLM server ----
echo "Starting vLLM server..."
SERVE_ARGS=(
  "$MODEL"
  --port "$PORT"
  --enable-auto-tool-choice
  --tool-call-parser "$TOOL_CALL_PARSER"
  --tensor-parallel-size "$TP_SIZE"
  --max-model-len "$MAX_MODEL_LEN"
  --enforce-eager
  --no-enable-prefix-caching
)

# Append reasoning parser if specified
if [ -n "$REASONING_PARSER" ]; then
  SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi

# Append any extra args (whitespace-separated; quoting inside
# BFCL_EXTRA_ARGS is not interpreted)
if [ -n "$EXTRA_ARGS" ]; then
  read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
  SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
fi

echo "Command: vllm serve ${SERVE_ARGS[*]}"
vllm serve "${SERVE_ARGS[@]}" &
SERVER_PID=$!

# ---- Wait for server to be ready ----
# Poll /health every 2 seconds. Give up after 600s, or immediately when the
# server process has already exited (bad flag, OOM, ...): there is no point
# polling a dead process until the timeout.
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo ""
    echo "ERROR: vLLM server exited before becoming ready"
    exit 1
  fi
  if [ "$SECONDS_WAITED" -ge 600 ]; then
    echo ""
    echo "ERROR: vLLM server failed to start within 600s"
    exit 1
  fi
  if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
    echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
  fi
  sleep 2
  SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
# ---- Run BFCL evaluation ----
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
# against our local vLLM server.
#
# The heredoc is quoted, so the shell expands nothing inside it; all inputs
# reach Python through argv. A failing run is recorded in bfcl_exit_code
# instead of aborting the script, so results can still be reported.
bfcl_exit_code=0
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
import inspect
import os
import sys

model, test_category = sys.argv[1], sys.argv[2]
num_threads = int(sys.argv[3])
port, api_type = sys.argv[4], sys.argv[5]
# An absent or empty output dir means "use the current working directory".
output_dir = (sys.argv[6] if len(sys.argv) > 6 else "") or os.getcwd()

# Set before bfcl_eval is imported.
os.environ.update(
    {
        "OPENAI_BASE_URL": f"http://localhost:{port}/v1",
        "OPENAI_API_KEY": "dummy",
        "BFCL_PROJECT_ROOT": output_dir,
    }
)

import bfcl_eval.constants.model_config as bfcl_model_config
from bfcl_eval.constants.model_config import ModelConfig
from bfcl_eval.model_handler.api_inference.openai_completion import (
    OpenAICompletionsHandler,
)
from bfcl_eval.model_handler.api_inference.openai_response import (
    OpenAIResponsesHandler,
)

handler = (
    OpenAIResponsesHandler if api_type == "responses" else OpenAICompletionsHandler
)

# Register the served model with BFCL.
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
    model_name=model,
    display_name=f"{model} (FC) (vLLM)",
    url=f"https://huggingface.co/{model}",
    org="",
    license="apache-2.0",
    model_handler=handler,
    input_price=None,
    output_price=None,
    is_fc_model=True,
    underscore_to_dot=True,
)

from bfcl_eval.__main__ import evaluate, generate

import typer


def _get_default_kwargs(function):
    """Collect the default of every parameter of `function` that has one.

    Defaults declared as Typer options are unwrapped to the plain value
    stored on the OptionInfo object.
    """

    def _plain(value):
        if isinstance(value, typer.models.OptionInfo):
            return value.default
        return value

    return {
        name: _plain(param.default)
        for name, param in inspect.signature(function).parameters.items()
        if param.default is not inspect.Parameter.empty
    }


def _categories():
    # Comma-separated on the command line; a fresh list for every caller.
    return [c.strip() for c in test_category.split(",")]


# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs.update(
    model=[model],
    test_category=_categories(),
    skip_server_setup=True,
    num_threads=num_threads,
)
generate(**gen_kwargs)

# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs.update(model=[model], test_category=_categories())
evaluate(**eval_kwargs)

print("=== BFCL evaluation completed successfully ===")
PYEOF
# ---- Upload results to buildkite ----
# Only when running under a Buildkite agent: annotate the build with the
# outcome and attach BFCL's output files. The script always exits with the
# evaluation's status.
if command -v buildkite-agent &>/dev/null; then
  case "$bfcl_exit_code" in
    0)
      STYLE="success"
      STATUS="PASSED"
      ;;
    *)
      STYLE="error"
      STATUS="FAILED"
      ;;
  esac

  buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF

  # BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
  # $BFCL_PROJECT_ROOT/score/
  RESULTS_ROOT="${OUTPUT_DIR:-.}"
  for artifact_kind in result score; do
    if [ -d "$RESULTS_ROOT/$artifact_kind" ]; then
      buildkite-agent artifact upload "$RESULTS_ROOT/$artifact_kind/**/*"
    fi
  done
fi

exit $bfcl_exit_code

View File

@@ -9,11 +9,10 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables # For testing on local vm, use `set -a` to export all variables
source /etc/environment source /etc/environment
# shellcheck source=/dev/null source $ENV_FILE
source "$ENV_FILE"
remove_docker_container() { remove_docker_container() {
docker rm -f "$CONTAINER_NAME" || true; docker rm -f $CONTAINER_NAME || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
echo "starting docker...$CONTAINER_NAME" echo "starting docker...$CONTAINER_NAME"
echo echo
docker run \ docker run \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file "$ENV_FILE" \ --env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \ -e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL="$MODEL" \ -e MODEL=$MODEL \
-e WORKSPACE=/workspace \ -e WORKSPACE=/workspace \
--name "$CONTAINER_NAME" \ --name $CONTAINER_NAME \
-d \ -d \
--privileged \ --privileged \
--network host \ --network host \

View File

@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG" echo "logging to $VLLM_LOG"
echo echo
vllm serve "$MODEL" \ vllm serve $MODEL \
--seed 42 \ --seed 42 \
--max-num-seqs "$MAX_NUM_SEQS" \ --max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--download_dir "$DOWNLOAD_DIR" \ --download_dir $DOWNLOAD_DIR \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 & --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.." echo "wait for 20 minutes.."
echo echo
# sleep 1200 # sleep 1200
# wait for 10 minutes... # wait for 10 minutes...
for _ in {1..120}; do for i in {1..120}; do
# TODO: detect other type of errors. # TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting." echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo echo
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model "$MODEL" \ --model $MODEL \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \ --dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len "$INPUT_LEN" \ --sonnet-input-len $INPUT_LEN \
--sonnet-output-len "$OUTPUT_LEN" \ --sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG" --ignore-eos > "$BM_LOG"
echo "completed..." echo "completed..."

View File

@@ -1,227 +0,0 @@
#!/bin/bash
#
# trigger-ci-build.sh
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
#
# Usage: ./trigger-ci-build.sh [options]
#
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
#
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
#
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default configuration
PIPELINE="ci"
DRY_RUN=true

# Print the help text and exit.
#   $1  exit status (optional, default 1). Error paths rely on the default;
#       an explicit --help passes 0, since asking for help is not a failure.
usage() {
  local status="${1:-1}"
  cat <<EOF
Usage: $(basename "$0") [options]
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
Options:
--execute Actually trigger the build (default: dry-run)
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
--commit Override commit SHA (default: current HEAD)
--branch Override branch name (default: current branch)
--message Custom build message (default: auto-generated)
--help Show this help message
Prerequisites:
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
- bk configured: bk configure
Examples:
$(basename "$0") # Dry-run, show what would happen
$(basename "$0") --execute # Actually trigger the build
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
EOF
  exit "$status"
}

# Leveled loggers; $1 is the message. Errors go to stderr.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
  echo -e "${GREEN}[OK]${NC} $1"
}
log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Abort with a readable message when an option that takes a value is the
# last word on the command line. Without this check, reading "$2" under
# `set -u` kills the script with a bare "unbound variable" error.
#   $1  option name   $2  number of remaining arguments (including the option)
require_value() {
  if [[ "$2" -lt 2 ]]; then
    log_error "Option $1 requires a value"
    usage
  fi
}

# Parse arguments
COMMIT=""
BRANCH=""
MESSAGE=""
while [[ $# -gt 0 ]]; do
  case $1 in
    --execute)
      DRY_RUN=false
      shift
      ;;
    --pipeline)
      require_value "$1" "$#"
      PIPELINE="$2"
      shift 2
      ;;
    --commit)
      require_value "$1" "$#"
      COMMIT="$2"
      shift 2
      ;;
    --branch)
      require_value "$1" "$#"
      BRANCH="$2"
      shift 2
      ;;
    --message)
      require_value "$1" "$#"
      MESSAGE="$2"
      shift 2
      ;;
    --help|-h)
      usage 0
      ;;
    -*)
      log_error "Unknown option: $1"
      usage
      ;;
    *)
      log_error "Unexpected argument: $1"
      usage
      ;;
  esac
done
# Prerequisite: the Buildkite CLI must be on PATH.
if ! command -v bk &>/dev/null; then
  log_error "Buildkite CLI (bk) is not installed"
  cat <<'EOF'

Install with:
 brew tap buildkite/buildkite && brew install buildkite/buildkite/bk

Then configure:
 bk configure
EOF
  exit 1
fi

# Prerequisite: must be run from inside a git work tree.
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
  log_error "Not in a git repository"
  exit 1
fi

# Fill in commit and branch from the current checkout unless overridden.
[[ -n "$COMMIT" ]] || COMMIT=$(git rev-parse HEAD)
if [[ -z "$BRANCH" ]]; then
  BRANCH=$(git branch --show-current)
  # Detached HEAD state - try to get branch from ref
  [[ -n "$BRANCH" ]] || BRANCH=$(git rev-parse --abbrev-ref HEAD)
fi

# Default build message: "[Manual] <commit subject>", or "[Manual] Manual
# build" when the subject cannot be read.
if [[ -z "$MESSAGE" ]]; then
  COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
  MESSAGE="[Manual] ${COMMIT_MSG}"
fi

# Safety check: the commit must be reachable from some remote branch.
# The fetch is best-effort; its failure is ignored.
log_info "Verifying commit exists on remote..."
git fetch origin --quiet 2>/dev/null || true
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
if [[ -z "$REMOTE_BRANCHES" ]]; then
  log_error "Commit ${COMMIT} does not exist on any remote branch!"
  cat <<EOF

The CI system will fail to checkout this commit.
Please push your changes first:

 git push origin ${BRANCH}

EOF
  exit 1
fi

# Show at most five of the containing branches.
log_success "Commit found on remote branches:"
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
remote_branch_count=$(echo "$REMOTE_BRANCHES" | wc -l)
if [[ "$remote_branch_count" -gt 5 ]]; then
  echo " ... and more"
fi
echo ""

# Summary of what is about to be built.
log_info "Pipeline: ${PIPELINE}"
log_info "Branch: ${BRANCH}"
log_info "Commit: ${COMMIT}"
log_info "Message: ${MESSAGE}"
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
echo ""
# Assemble the bk invocation as an array so every value stays one argument.
CMD=(
  bk build create
  -y
  -w
  -i
  --pipeline "${PIPELINE}"
  --commit "${COMMIT}"
  --branch "${BRANCH}"
  --message "${MESSAGE}"
  --env "RUN_ALL=1"
  --env "NIGHTLY=1"
)

if [[ "$DRY_RUN" == true ]]; then
  # Escape single quotes in values for safe shell display
  escape_for_shell() {
    printf '%s' "$1" | sed "s/'/'\\\\''/g"
  }

  rule="=========================================="
  echo "$rule"
  log_warn "DRY-RUN MODE - No build will be triggered"
  echo "$rule"

  # Copy-pastable rendering of the command; nothing is executed here.
  cat <<EOF

Command that would be executed:

 bk build create \\
 -y \\
 -w \\
 -i \\
 --pipeline '$(escape_for_shell "${PIPELINE}")' \\
 --commit '$(escape_for_shell "${COMMIT}")' \\
 --branch '$(escape_for_shell "${BRANCH}")' \\
 --message '$(escape_for_shell "${MESSAGE}")' \\
 --env 'RUN_ALL=1' \\
 --env 'NIGHTLY=1'

$rule
EOF
  echo -e "${YELLOW}To actually trigger this build, run:${NC}"
  cat <<EOF

 $0 --execute
$rule
EOF
  exit 0
fi

log_info "Triggering build..."
# Execute the command - bk will print the URL and open browser
"${CMD[@]}"

View File

@@ -1,46 +0,0 @@
#!/usr/bin/env bash
set -ex

# Upload a single wheel to S3 (rename linux -> manylinux).
# Index generation is handled separately by generate-and-upload-nightly-index.sh.
#
# Argument 1 (optional): manylinux tag to substitute for "linux" in the wheel
# filename (default: manylinux_2_31).

BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"

# ========= collect, rename & upload the wheel ==========

# Assume wheels are in artifacts/dist/*.whl
# nullglob makes a pattern with no match expand to nothing. Without it the
# array would hold the literal pattern as its single element, the count
# check below would pass, and the script would fail later with a misleading
# error.
shopt -s nullglob
wheel_files=(artifacts/dist/*.whl)
shopt -u nullglob

# Check that exactly one wheel is found
if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
fi
wheel="${wheel_files[0]}"

# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
# we also accept params as manylinux tag
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
manylinux_version="${1:-manylinux_2_31}"

# Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
  echo "Error: Wheel filename does not contain 'linux': $wheel"
  exit 1
fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
echo "Renamed wheel to: $wheel"

# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $version"

# copy wheel to its own bucket
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
echo "Wheel uploaded. Index generation is handled by a separate step."

View File

@@ -1,73 +0,0 @@
#!/usr/bin/env bash
set -e

# Publish a tagged release to PyPI: download the release wheels for the
# current commit from S3, build an sdist, then check and upload the default
# variant wheels together with the sdist using twine.
#
# Environment read by this script:
#   BUILDKITE_COMMIT                       commit whose wheels are released
#   PYPI_TOKEN                             PyPI API token (required)
#   FORCE_RELEASE_IGNORE_VERSION_MISMATCH  "true" downgrades a Buildkite/git
#                                          version mismatch to a warning

BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"

RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
# `git describe --exact-match` exits non-zero when the commit carries no tag.
# Under `set -e` that would kill the script right here, before the check
# below can print its message, so the failure is turned into an empty value.
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null || true)

echo "Release version from Buildkite: $RELEASE_VERSION"
if [[ -z "$GIT_VERSION" ]]; then
  echo "[FATAL] Not on a git tag, cannot create release."
  exit 1
else
  echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi

# sanity check for version mismatch
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
  if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
    echo "[WARNING] Force release and ignore version mismatch"
  else
    echo "[FATAL] Release version from Buildkite does not match Git version."
    exit 1
  fi
fi
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'

# check pypi token
if [[ -z "$PYPI_TOKEN" ]]; then
  echo "[FATAL] PYPI_TOKEN is not set."
  exit 1
else
  export TWINE_USERNAME="__token__"
  export TWINE_PASSWORD="$PYPI_TOKEN"
fi

set -x # avoid printing secrets above

# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version

# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p "$DIST_DIR"
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" "$DIST_DIR"
echo "Wheels copied to local directory"

# generate source distribution using setup.py
python setup.py sdist --dist-dir="$DIST_DIR"
ls -la "$DIST_DIR"

SDIST_FILE=$(find "$DIST_DIR" -name "vllm*.tar.gz")
echo "Found sdist: $SDIST_FILE"

# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
# Collected into an array: several wheels can match, and a newline-joined
# string passed as one quoted argument would reach twine as a single bogus
# path.
mapfile -t PYPI_WHEEL_FILES < <(find "$DIST_DIR" -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ ${#PYPI_WHEEL_FILES[@]} -eq 0 ]]; then
  echo "No default variant wheels found, quitting..."
  exit 1
fi

python3 -m twine check "${PYPI_WHEEL_FILES[@]}" "$SDIST_FILE"
python3 -m twine upload --non-interactive --verbose "${PYPI_WHEEL_FILES[@]}" "$SDIST_FILE"
echo "Wheels and source distribution uploaded to PyPI"

View File

@@ -1,151 +0,0 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Upload ROCm wheels to S3 with proper index generation
#
# Required environment variables:
#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
#   S3_BUCKET (default: vllm-wheels)
#
# Other environment variables read by this script:
#   BUILDKITE_COMMIT, BUILDKITE_BRANCH, BUILDKITE_PULL_REQUEST
#   NIGHTLY      set to "1" to force an update of the nightly index
#   PYTHON_PROG  interpreter command used for index generation (default: python3)
#
# S3 path structure:
#   s3://vllm-wheels/rocm/{commit}/     - All wheels for this commit
#   s3://vllm-wheels/rocm/nightly/      - Index pointing to latest nightly
#   s3://vllm-wheels/rocm/{version}/    - Index for release versions

set -ex

# ======== Configuration ========
BUCKET="${S3_BUCKET:-vllm-wheels}"
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
INDICES_OUTPUT_DIR="rocm-indices"
# The interpreter command is kept as an array so that arguments containing
# whitespace (for example the workspace path in the docker fallback below)
# survive as single words.
read -r -a PYTHON_CMD <<< "${PYTHON_PROG:-python3}"

# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
MANYLINUX_VERSION="manylinux_2_35"

echo "========================================"
echo "ROCm Wheel Upload Configuration"
echo "========================================"
echo "S3 Bucket: $BUCKET"
echo "S3 Path: $ROCM_SUBPATH"
echo "Commit: $BUILDKITE_COMMIT"
echo "Branch: $BUILDKITE_BRANCH"
echo "========================================"

# ======== Part 0: Setup Python ========

# Detect if python3.12+ is available; any failure counts as "not available".
has_new_python=$("${PYTHON_CMD[@]}" -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
if [[ "$has_new_python" -eq 0 ]]; then
    # Use new python from docker
    # Use --user to ensure files are created with correct ownership (not root)
    docker pull python:3-slim
    PYTHON_CMD=(docker run --rm --user "$(id -u):$(id -g)" -v "$(pwd):/app" -w /app python:3-slim python3)
fi

echo "Using python interpreter: ${PYTHON_CMD[*]}"
echo "Python version: $("${PYTHON_CMD[@]}" --version)"

# ======== Part 1: Collect and prepare wheels ========

# Collect all wheels. Either source directory may be absent, so copy failures
# are deliberately ignored; the count check below catches the empty case.
mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true

WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"

if [ "$WHEEL_COUNT" -eq 0 ]; then
    echo "ERROR: No wheels found to upload!"
    exit 1
fi

# Rename linux to manylinux in wheel filenames
for wheel in all-rocm-wheels/*.whl; do
    if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
        new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
        mv -- "$wheel" "$new_wheel"
        echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
    fi
done

echo ""
echo "Wheels to upload:"
ls -lh all-rocm-wheels/

# ======== Part 2: Upload wheels to S3 ========

echo ""
echo "Uploading wheels to $S3_COMMIT_PREFIX"

for wheel in all-rocm-wheels/*.whl; do
    aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
done

# ======== Part 3: Generate and upload indices ========

# List existing wheels in commit directory
echo ""
echo "Generating indices..."
obj_json="rocm-objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"

mkdir -p "$INDICES_OUTPUT_DIR"

# Use the existing generate-nightly-index.py
# HACK: Replace regex module with stdlib re (same as CUDA script)
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py

"${PYTHON_CMD[@]}" .buildkite/scripts/generate-nightly-index.py \
    --version "$ROCM_SUBPATH" \
    --current-objects "$obj_json" \
    --output-dir "$INDICES_OUTPUT_DIR" \
    --comment "ROCm commit $BUILDKITE_COMMIT"

# Upload indices to commit directory
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"

# Decide once whether this build updates rocm/nightly/ (main branch and not a
# PR, or NIGHTLY=1) so the summary in Part 4 reports exactly what was done.
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
    update_nightly=1
else
    update_nightly=0
fi

if [[ "$update_nightly" -eq 1 ]]; then
    echo "Updating rocm/nightly/ index..."
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
fi

# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
    VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
    echo "Version in wheel: $VERSION"
    # Drop the local version part ("+...") and a trailing ".rocm".
    PURE_VERSION="${VERSION%%+*}"
    PURE_VERSION="${PURE_VERSION%%.rocm}"
    echo "Pure version: $PURE_VERSION"

    # Only non-dev versions get a version-specific index.
    if [[ "$VERSION" != *"dev"* ]]; then
        echo "Updating rocm/$PURE_VERSION/ index..."
        aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
    fi
fi

# ======== Part 4: Summary ========

echo ""
echo "========================================"
echo "ROCm Wheel Upload Complete!"
echo "========================================"
echo ""
echo "Wheels available at:"
echo "  s3://$BUCKET/$ROCM_SUBPATH/"
echo ""
echo "Install command (by commit):"
echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
echo ""
# Advertise the nightly index only when it was actually updated above.
if [[ "$update_nightly" -eq 1 ]]; then
    echo "Install command (nightly):"
    echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
fi
echo ""
echo "Wheel count: $WHEEL_COUNT"
echo "========================================"

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# Rename the single built wheel to a manylinux tag, generate a simple index
# page for it, and upload wheel and index to the vllm-wheels S3 bucket under
# the commit, nightly, and version prefixes.
#
# Environment variables read:
#   BUILDKITE_COMMIT  commit hash used as the per-commit S3 prefix

set -ex

# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)

# Check that exactly one wheel is found
if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
fi

# Get the single wheel file
wheel="${wheel_files[0]}"

# An unmatched glob stays in the array as the literal pattern, which would
# pass the count check above; make sure the entry is a real file.
if [[ ! -f "$wheel" ]]; then
  echo "Error: No wheel file found in artifacts/dist/"
  exit 1
fi

# Detect architecture and rename 'linux' to appropriate manylinux version
arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
  manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
  manylinux_version="manylinux2014"
else
  echo "Warning: Unknown architecture $arch, using manylinux1 as default"
  manylinux_version="manylinux1"
fi

# Rename 'linux' to the appropriate manylinux version in the wheel filename.
# Without 'linux' in the name the substitution is a no-op and `mv` would fail
# on identical source and destination, so fail early with a clear message.
if [[ "$wheel" != *"linux"* ]]; then
  echo "Error: Wheel filename does not contain 'linux': $wheel"
  exit 1
fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"

# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"

normal_wheel="$wheel" # Save the original wheel filename

# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
  # Keep a trailing "cu*" component of the version as the local version label.
  suffix="${version##*.}"
  if [[ $suffix == cu* ]]; then
    new_version="1.0.0.dev+${suffix}"
  else
    new_version="1.0.0.dev"
  fi
  # Quote the pattern so the version string is matched literally.
  new_wheel="${wheel/"$version"/$new_version}"
  # use cp to keep both files in the artifacts directory
  cp -- "$wheel" "$new_wheel"
  wheel="$new_wheel"
  version="$new_version"
fi

# Generate the index page for the wheel
python3 .buildkite/generate_index.py --wheel "$normal_wheel"

# Upload wheels (and index) for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

if [[ $normal_wheel == *"cu129"* ]]; then
  # only upload index.html for cu129 wheels (default wheels) as it
  # is available on both x86 and arm64
  aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
  aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
else
  echo "Skipping index files for non-cu129 wheels"
fi

# Upload wheels (and index) for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

if [[ $normal_wheel == *"cu129"* ]]; then
  # only upload index.html for cu129 wheels (default wheels) as it
  # is available on both x86 and arm64
  aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else
  echo "Skipping index files for non-cu129 wheels"
fi

# Upload wheel and index under the version prefix
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,25 +0,0 @@
# Test area: attention tests, run once on H100 and once on B200.
group: Attention
depends_on:
  - image-build
steps:
  - label: V1 attention (H100)
    timeout_in_minutes: 30
    device: h100
    source_file_dependencies:
      - vllm/config/attention.py
      - vllm/model_executor/layers/attention
      - vllm/v1/attention
      - tests/v1/attention
    commands:
      - pytest -v -s v1/attention

  # Same dependencies and command as the H100 step; only the device differs.
  - label: V1 attention (B200)
    timeout_in_minutes: 30
    device: b200
    source_file_dependencies:
      - vllm/config/attention.py
      - vllm/model_executor/layers/attention
      - vllm/v1/attention
      - tests/v1/attention
    commands:
      - pytest -v -s v1/attention

View File

@@ -1,16 +0,0 @@
# Test area: basic correctness tests.
group: Basic Correctness
depends_on:
  - image-build
steps:
  - label: Basic Correctness
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/basic_correctness/test_basic_correctness
      - tests/basic_correctness/test_cpu_offload
      - tests/basic_correctness/test_cumem.py
    commands:
      # Exported first so it applies to every pytest invocation below.
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -v -s basic_correctness/test_cumem.py
      - pytest -v -s basic_correctness/test_basic_correctness.py
      - pytest -v -s basic_correctness/test_cpu_offload.py

View File

@@ -1,31 +0,0 @@
# Test area: benchmark script, benchmark CLI tests, and an attention
# benchmark smoke test.
group: Benchmarks
depends_on:
  - image-build
steps:
  - label: Benchmarks
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/.buildkite"
    source_file_dependencies:
      - benchmarks/
    commands:
      - bash scripts/run-benchmarks.sh

  - label: Benchmarks CLI Test
    timeout_in_minutes: 20
    source_file_dependencies:
      - vllm/
      - tests/benchmarks/
    commands:
      - pytest -v -s benchmarks/

  - label: Attention Benchmarks Smoke Test (B200)
    device: b200
    # NOTE(review): the other test-area files use `num_devices`; confirm that
    # the pipeline generator still honors `num_gpus` for this step.
    num_gpus: 2
    optional: true
    working_dir: "/vllm-workspace/"
    timeout_in_minutes: 10
    source_file_dependencies:
      - benchmarks/attention_benchmarks/
      - vllm/v1/attention/
    commands:
      - python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1

View File

@@ -1,211 +0,0 @@
# Test area: compilation tests (sequence parallel, AsyncTP, compile passes,
# and end-to-end fusion sweeps) on the default runners, H100 and B200.
group: Compile
depends_on:
  - image-build
steps:
  # ---- Sequence parallel / AsyncTP correctness ----
  - label: Sequence Parallel Correctness Tests (2 GPUs)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    num_devices: 2
    source_file_dependencies:
      - vllm/model_executor/layers/
      - vllm/compilation/
      - vllm/v1/worker/
      - vllm/v1/cudagraph_dispatcher.py
      - tests/compile/correctness_e2e/test_sequence_parallel.py
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: Sequence Parallel Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: AsyncTP Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

  - label: AsyncTP Correctness Tests (B200)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: b200
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

  # ---- Compile pass unit tests ----
  - label: Distributed Compile Unit Tests (2xH100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - vllm/compilation/
      - vllm/model_executor/layers
      - tests/compile/passes/distributed/
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -s -v tests/compile/passes/distributed

  - label: Fusion and Compile Unit Tests (2xB200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    source_file_dependencies:
      - csrc/quantization/fp4/
      - vllm/model_executor/layers/quantization/
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/compilation/  # TODO(luka) limit to vllm/compilation/passes
      - tests/compile/passes/test_fusion_attn.py
      - tests/compile/passes/test_silu_mul_quant_fusion.py
      - tests/compile/passes/distributed/test_fusion_all_reduce.py
      - tests/compile/fullgraph/test_full_graph.py
    commands:
      # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
      - nvidia-smi
      - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
      - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
      # this runner has 2 GPUs available even though num_devices=2 is not set
      - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
      # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
      # TODO(luka) move to H100 once pass tests run on H100
      - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

  # ---- Fusion E2E, single GPU ----
  - label: Fusion E2E Quick (H100)
    timeout_in_minutes: 15
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
      # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"

  - label: Fusion E2E Config Sweep (H100)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

  - label: Fusion E2E Config Sweep (B200)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 1
    optional: true
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"

  # ---- Fusion E2E, tensor parallel 2 ----
  - label: Fusion E2E TP2 Quick (H100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"

  - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

  - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

  - label: Fusion E2E TP2 (B200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # for ar-rms-quant-fp4, also sweep llama3
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"

View File

@@ -1,23 +0,0 @@
# Test area: CUDA platform tests and cudagraph tests.
group: CUDA
depends_on:
  - image-build
steps:
  - label: Platform Tests (CUDA)
    timeout_in_minutes: 15
    source_file_dependencies:
      - vllm/
      - tests/cuda
    commands:
      - pytest -v -s cuda/test_cuda_context.py
      - pytest -v -s cuda/test_platform_no_cuda_init.py

  - label: Cudagraph
    timeout_in_minutes: 20
    source_file_dependencies:
      - tests/v1/cudagraph
      - vllm/v1/cudagraph_dispatcher.py
      - vllm/config/compilation.py
      - vllm/compilation
    commands:
      - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
      - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

View File

@@ -1,296 +0,0 @@
# Test area: distributed tests (communication ops, data/tensor/pipeline
# parallelism, torchrun examples, multi-node, and NixlConnector accuracy).
group: Distributed
depends_on:
  - image-build
steps:
  # ---- 2-GPU steps ----
  - label: Distributed Comm Ops
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/distributed
      - tests/distributed
    commands:
      - pytest -v -s distributed/test_comm_ops.py
      - pytest -v -s distributed/test_shm_broadcast.py
      - pytest -v -s distributed/test_shm_buffer.py
      - pytest -v -s distributed/test_shm_storage.py

  - label: Distributed DP Tests (2 GPUs)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/worker/worker_base.py
      - vllm/v1/engine/
      - vllm/v1/worker/
      - tests/v1/distributed
      - tests/entrypoints/openai/test_multi_api_servers.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
      - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py

  - label: Distributed Compile + RPC Tests (2 GPUs)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/compilation/
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/worker/worker_base.py
      - vllm/v1/engine/
      - vllm/v1/worker/
      - tests/compile/fullgraph/test_basic_correctness.py
      - tests/compile/test_wrapper.py
      - tests/entrypoints/llm/test_collective_rpc.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - pytest -v -s entrypoints/llm/test_collective_rpc.py
      - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
      - pytest -v -s ./compile/test_wrapper.py

  - label: Distributed Torchrun + Shutdown Tests (2 GPUs)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/worker/worker_base.py
      - vllm/v1/engine/
      - vllm/v1/worker/
      - tests/distributed/
      - tests/v1/shutdown
      - tests/v1/worker/test_worker_memory_snapshot.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
      - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
      - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
      - pytest -v -s v1/worker/test_worker_memory_snapshot.py

  # ---- 4-GPU steps ----
  - label: Distributed Torchrun + Examples (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/
      - tests/distributed/test_torchrun_example.py
      - tests/distributed/test_torchrun_example_moe.py
      - examples/offline_inference/rlhf_colocate.py
      - examples/rl/
      # NOTE(review): the command below runs
      # examples/offline_inference/data_parallel.py (no tests/ prefix);
      # confirm this dependency path is intended.
      - tests/examples/offline_inference/data_parallel.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      # test with torchrun tp=2 and external_dp=2
      - torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
      # test with torchrun tp=2 and pp=2
      - PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
      # test with torchrun tp=4 and dp=1
      - TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
      # test with torchrun tp=2, pp=2 and dp=1
      - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
      # test with torchrun tp=1 and dp=4 with ep
      - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
      # test with torchrun tp=2 and dp=2 with ep
      - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
      # test with internal dp
      - python3 examples/offline_inference/data_parallel.py --enforce-eager
      # rlhf examples
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py

  - label: Distributed DP Tests (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/
      - tests/v1/distributed
      - tests/v1/engine/test_engine_core_client.py
      - tests/distributed/test_utils
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
      - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
      - pytest -v -s distributed/test_utils.py

  - label: Distributed Compile + Comm (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/
      - tests/distributed/test_pynccl
      - tests/distributed/test_events
      - tests/compile/fullgraph/test_basic_correctness.py
      - tests/distributed/test_symm_mem_allreduce.py
      - tests/distributed/test_multiproc_executor.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - pytest -v -s compile/fullgraph/test_basic_correctness.py
      - pytest -v -s distributed/test_pynccl.py
      - pytest -v -s distributed/test_events.py
      - pytest -v -s distributed/test_symm_mem_allreduce.py
      # test multi-node TP with multiproc executor (simulated on single node)
      - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node

  # ---- Device-specific steps ----
  - label: Distributed Tests (8 GPUs)(H100)
    timeout_in_minutes: 10
    device: h100
    num_devices: 8
    working_dir: "/vllm-workspace/tests"
    source_file_dependencies:
      - examples/offline_inference/torchrun_dp_example.py
      - vllm/config/parallel.py
      - vllm/distributed/
      - vllm/v1/engine/llm_engine.py
      - vllm/v1/executor/uniproc_executor.py
      - vllm/v1/worker/gpu_worker.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      # test with torchrun tp=2 and dp=4 with ep
      - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

  - label: Distributed Tests (4 GPUs)(A100)
    device: a100
    optional: true
    num_devices: 4
    source_file_dependencies:
      - vllm/
    commands:
      # NOTE: don't test llama model here, it seems hf implementation is buggy
      # see https://github.com/vllm-project/vllm/pull/5689 for details
      - pytest -v -s distributed/test_custom_all_reduce.py
      - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
      - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
      - pytest -v -s -x lora/test_mixtral.py

  - label: Distributed Tests (2 GPUs)(H100)
    timeout_in_minutes: 15
    device: h100
    optional: true
    working_dir: "/vllm-workspace/"
    num_devices: 2
    commands:
      - pytest -v -s tests/distributed/test_context_parallel.py
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
      - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
      - pytest -v -s tests/v1/distributed/test_dbo.py

  - label: Distributed Tests (2 GPUs)(B200)
    device: b200
    optional: true
    working_dir: "/vllm-workspace/"
    num_devices: 2
    commands:
      - pytest -v -s tests/distributed/test_context_parallel.py
      - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
      - pytest -v -s tests/v1/distributed/test_dbo.py

  # ---- Multi-node ----
  - label: 2 Node Test (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    num_nodes: 2
    no_plugin: true
    optional: true  # TODO: revert once infra issue solved
    source_file_dependencies:
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/model_executor/models/
      - tests/distributed/
      # NOTE(review): the command below runs
      # ../examples/offline_inference/data_parallel.py; confirm this
      # tests/examples/ dependency path is intended.
      - tests/examples/offline_inference/data_parallel.py
    commands:
      # The two quoted strings are passed to the script as separate arguments
      # and differ in --dp-node-rank (0 vs 1) and in the trailing pytest runs.
      - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"

  # ---- NixlConnector accuracy sweeps ----
  # The four sweeps below run the same script and differ only in the
  # environment variable set in front of it.
  - label: Distributed NixlConnector PD accuracy (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

  - label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

  - label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

  - label: Hybrid SSM NixlConnector PD accuracy tests (4 GPUs)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - HYBRID_SSM=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

  - label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
    timeout_in_minutes: 30
    device: a100
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - vllm/v1/worker/kv_connector_model_runner_mixin.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh

  # ---- Pipeline parallelism ----
  - label: Pipeline + Context Parallelism (4 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/model_executor/models/
      - tests/distributed/
    commands:
      - pytest -v -s distributed/test_pp_cudagraph.py
      - pytest -v -s distributed/test_pipeline_parallel.py

View File

@@ -1,39 +0,0 @@
# Test area: optional end-to-end accuracy runs driven by the scripts under
# .buildkite/scripts/scheduled_integration_test/.
group: E2E Integration
depends_on:
  - image-build
steps:
  - label: DeepSeek V2-Lite Accuracy
    timeout_in_minutes: 60
    device: h100
    optional: true
    num_devices: 4
    working_dir: "/vllm-workspace"
    commands:
      - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

  - label: Qwen3-30B-A3B-FP8-block Accuracy
    timeout_in_minutes: 60
    device: h100
    optional: true
    num_devices: 4
    working_dir: "/vllm-workspace"
    commands:
      - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

  # Same script as the H100 step above, with two extra trailing arguments.
  - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
    timeout_in_minutes: 60
    device: b200
    optional: true
    num_devices: 2
    working_dir: "/vllm-workspace"
    commands:
      - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

  - label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
    timeout_in_minutes: 60
    device: h100
    optional: true
    num_devices: 1
    working_dir: "/vllm-workspace"
    commands:
      - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030

View File

@@ -1,84 +0,0 @@
# Engine group: engine unit tests plus the v1 engine / e2e suites.
# No step sets working_dir; test paths are written relative to the tests
# directory (presumably the pipeline default — confirm in the generator).
group: Engine
depends_on:
- image-build
steps:
- label: Engine
  timeout_in_minutes: 15
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

# test_preprocess_error_handling.py gets its own pytest invocation; the second
# command covers the remainder of v1/engine.
- label: Engine (1 GPU)
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/v1/engine/
  - tests/v1/engine/
  commands:
  - pytest -v -s v1/engine/test_preprocess_error_handling.py
  - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py

# v1/e2e/general is split in two: this step runs only the async-scheduling
# test, "e2e Core" below runs everything else in that directory.
- label: e2e Scheduling (1 GPU)
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/v1/
  - tests/v1/e2e/general/
  commands:
  - pytest -v -s v1/e2e/general/test_async_scheduling.py

- label: e2e Core (1 GPU)
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/v1/
  - tests/v1/e2e/general/
  commands:
  - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py

# Optional 2-device step with an AMD mirror (device mi325_2).
- label: V1 e2e (2 GPUs)
  timeout_in_minutes: 60  # TODO: Fix timeout after we have more confidence in the test stability
  optional: true
  num_devices: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/e2e
  commands:
  # Only run tests that need exactly 2 GPUs
  - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
  mirror:
    amd:
      device: mi325_2
      depends_on:
      - image-build-amd

# Optional 4-device step with an AMD mirror (device mi325_4).
- label: V1 e2e (4 GPUs)
  timeout_in_minutes: 60  # TODO: Fix timeout after we have more confidence in the test stability
  optional: true
  num_devices: 4
  source_file_dependencies:
  - vllm/
  - tests/v1/e2e
  commands:
  # Only run tests that need 4 GPUs
  - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
  mirror:
    amd:
      device: mi325_4
      depends_on:
      - image-build-amd

# Optional step pinned to 4x H100; runs the hybrid chunked-prefill test only.
- label: V1 e2e (4xH100)
  timeout_in_minutes: 60
  device: h100
  num_devices: 4
  optional: true
  source_file_dependencies:
  - vllm/v1/attention/backends/utils.py
  - vllm/v1/worker/gpu_model_runner.py
  - tests/v1/e2e/test_hybrid_chunked_prefill.py
  commands:
  - pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py

View File

@@ -1,113 +0,0 @@
# Entrypoints group: unit tests plus several integration steps that partition
# tests/entrypoints between them using pytest --ignore flags.
# When moving a test directory, update the matching --ignore lists so that it
# is still run by exactly one step.
group: Entrypoints
depends_on:
- image-build
steps:
# Runs the tool-parser tests explicitly, then everything under entrypoints/
# that is not claimed by one of the integration steps below.
- label: Entrypoints Unit Tests
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling

# test_generate.py and offline_mode are run as separate pytest invocations
# (see the inline comments); test_collective_rpc.py is excluded from this step.
- label: Entrypoints Integration (LLM)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py  # it needs a clean process
  - pytest -v -s entrypoints/offline_mode  # Needs to avoid interference with other tests

# Part 1: entrypoints/openai/chat_completion, minus two excluded files.
- label: Entrypoints Integration (API Server openai - Part 1)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd

# Part 2: completion, speech_to_text and test_chat_utils.py.
# NOTE(review): unlike Parts 1 and 3, this step does not export
# VLLM_WORKER_MULTIPROC_METHOD=spawn — confirm that is intentional.
- label: Entrypoints Integration (API Server openai - Part 2)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
  - pytest -v -s entrypoints/openai/speech_to_text/
  - pytest -v -s entrypoints/test_chat_utils.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd

# Part 3: the rest of entrypoints/openai, i.e. everything not covered by
# Parts 1-2 or by the ignored paths listed in the command.
- label: Entrypoints Integration (API Server openai - Part 3)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py

# Instrumentator, RPC and tool_use suites. The RPC run prepends
# /vllm-workspace to PYTHONPATH for that command only.
- label: Entrypoints Integration (API Server 2)
  timeout_in_minutes: 130
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/rpc
  - tests/entrypoints/serve/instrumentator
  - tests/tool_use
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/serve/instrumentator
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
  - pytest -v -s tool_use

- label: Entrypoints Integration (Pooling)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling

- label: Entrypoints Integration (Responses API)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai/responses
  commands:
  - pytest -v -s entrypoints/openai/responses

# The only step in this group without an explicit working_dir; its
# dependency list names source paths only, no test paths.
- label: OpenAI API Correctness
  timeout_in_minutes: 30
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands:  # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

View File

@@ -1,39 +0,0 @@
# Expert Parallelism group: EPLB algorithm / execution tests and the elastic
# EP scaling test. All steps run from /vllm-workspace/tests.
# Steps that list individual test files under source_file_dependencies must
# list every file they execute, so that editing a test re-triggers its step.
group: Expert Parallelism
depends_on:
- image-build
steps:
# No num_devices/device requested for this step.
- label: EPLB Algorithm
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  - tests/distributed/test_eplb_utils.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py
  - pytest -v -s distributed/test_eplb_utils.py

# 4-device step running the EPLB execute and spec-decode tests.
- label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  # Previously missing: this file is executed below, so it is listed here
  # alongside test_eplb_execute.py.
  - tests/distributed/test_eplb_spec_decode.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py

# 4x H100.
- label: Elastic EP Scaling Test
  timeout_in_minutes: 20
  device: h100
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/compilation/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_elastic_ep.py

View File

@@ -1,170 +0,0 @@
# Kernels group: kernel-level test suites (core ops, attention, quantization,
# MoE, Mamba, DeepGEMM, Helion) plus hardware-specific H100 / B200 steps.
# Steps whose label ends in %N are sharded: `parallelism` sets the job count
# and each job passes its index/count to pytest via --shard-id/--num-shards.
group: Kernels
depends_on:
- image-build
steps:
- label: Kernels Core Operation Test
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  - tests/kernels/test_concat_mla_q.py
  commands:
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py

# Sharded over 2 jobs.
- label: Kernels Attention Test %N
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
  - vllm/v1/attention
  # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

# Sharded over 2 jobs.
- label: Kernels Quantization Test %N
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

# Sharded over 5 jobs. test_modular_oai_triton_moe.py is excluded from the
# first invocation and run (also sharded) by the second one.
- label: Kernels MoE Test %N
  timeout_in_minutes: 25
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
  - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 5

- label: Kernels Mamba Test
  timeout_in_minutes: 45
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
  - pytest -v -s kernels/mamba

# Single H100; one pytest invocation per DeepGEMM-related test file.
- label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
  device: h100
  num_devices: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization/test_block_fp8.py
  - tests/kernels/moe/test_deepgemm.py
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
  - pytest -v -s kernels/quantization/test_block_fp8.py
  - pytest -v -s kernels/moe/test_deepgemm.py
  - pytest -v -s kernels/moe/test_batched_deepgemm.py
  - pytest -v -s kernels/attention/test_deepgemm_attention.py

# B200 step. Runs from the repository root, so test paths here carry the
# tests/ prefix, unlike the other steps in this group.
- label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  device: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
  - vllm/v1/attention/selector.py
  - vllm/platforms/cuda.py
  commands:
  - nvidia-smi
  - python3 examples/basic/offline_inference/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
  - pytest -v -s tests/kernels/attention/test_attention_selector.py
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
  - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
  # Quantization
  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
  # e2e
  - pytest -v -s tests/models/quantization/test_nvfp4.py

# H100 step; helion is installed at run time before the tests execute.
# NOTE(review): the install is unpinned and uses plain `pip` — confirm this is
# the intended installer for the CI image.
- label: Kernels Helion Test
  timeout_in_minutes: 30
  device: h100
  source_file_dependencies:
  - vllm/utils/import_utils.py
  - tests/kernels/helion/
  commands:
  - pip install helion
  - pytest -v -s kernels/helion/

# Optional; declares no source_file_dependencies.
- label: Kernels FP8 MoE Test (1 H100)
  timeout_in_minutes: 90
  device: h100
  num_devices: 1
  optional: true
  commands:
  - pytest -v -s kernels/moe/test_cutlass_moe.py
  - pytest -v -s kernels/moe/test_flashinfer.py
  - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
  - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
  - pytest -v -s kernels/moe/test_moe.py
  # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
  - pytest -v -s kernels/moe/test_block_int8.py
  - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
  - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py

# Optional; declares no source_file_dependencies.
- label: Kernels FP8 MoE Test (2 H100s)
  timeout_in_minutes: 90
  device: h100
  num_devices: 2
  optional: true
  commands:
  - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
  - pytest -v -s kernels/moe/test_deepep_moe.py

# Optional; declares no source_file_dependencies.
- label: Kernels Fp4 MoE Test (B200)
  timeout_in_minutes: 60
  device: b200
  num_devices: 1
  optional: true
  commands:
  - pytest -v -s kernels/moe/test_cutedsl_moe.py
  - pytest -v -s kernels/moe/test_flashinfer_moe.py
  - pytest -v -s kernels/moe/test_nvfp4_moe.py
  - pytest -v -s kernels/moe/test_ocp_mx_moe.py

View File

@@ -1,118 +0,0 @@
# LM Eval group: GSM8K / lm-eval / GPQA correctness runs, each driven by a
# --config-list-file argument. All steps except "LM Eval Small Models" are
# marked optional.
group: LM Eval
depends_on:
- image-build
steps:
# The only step in this group that sets autorun_on_main.
- label: LM Eval Small Models
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

# Disabled A100 variant, kept for reference.
# - label: LM Eval Large Models (4 GPUs)(A100)
#   device: a100
#   optional: true
#   num_devices: 4
#   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
#   source_file_dependencies:
#   - csrc/
#   - vllm/model_executor/layers/quantization
#   commands:
#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
#   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

# 4x H100. Runs from the lm-eval-harness directory with DeepGEMM disabled
# (see the inline comment); no timeout_in_minutes is set on this step.
- label: LM Eval Large Models (4 GPUs)(H100)
  device: h100
  optional: true
  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

- label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
  device: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

# 2x B200; dependencies are the Qwen3.5 / Qwen3-Next model and config files
# plus the fla ops directory.
- label: LM Eval Qwen3.5 Models (B200)
  timeout_in_minutes: 120
  device: b200
  optional: true
  num_devices: 2
  source_file_dependencies:
  - vllm/model_executor/models/qwen3_5.py
  - vllm/model_executor/models/qwen3_5_mtp.py
  - vllm/transformers_utils/configs/qwen3_5.py
  - vllm/transformers_utils/configs/qwen3_5_moe.py
  - vllm/model_executor/models/qwen3_next.py
  - vllm/model_executor/models/qwen3_next_mtp.py
  - vllm/model_executor/layers/fla/ops/
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt

# 8x H200; declares no source_file_dependencies.
- label: LM Eval Large Models (H200)
  timeout_in_minutes: 60
  device: h200
  optional: true
  num_devices: 8
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt

# The three "TEMPORARY" MoE-refactor steps below set neither a timeout nor
# source_file_dependencies, and pass the config list by its full
# evals/gsm8k/configs/... path.
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
  device: h100
  optional: true
  num_devices: 2
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt

- label: MoE Refactor Integration Test (B200 - TEMPORARY)
  device: b200
  optional: true
  num_devices: 2
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt

- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
  device: b200
  optional: true
  num_devices: 2
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt

# GPQA steps install a pinned gpt-oss[eval] release before running.
- label: GPQA Eval (GPT-OSS) (H100)
  timeout_in_minutes: 120
  device: h100
  optional: true
  num_devices: 2
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/evals/gpt_oss/
  commands:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt

- label: GPQA Eval (GPT-OSS) (B200)
  timeout_in_minutes: 120
  device: b200
  optional: true
  num_devices: 2
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/evals/gpt_oss/
  commands:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt

View File

@@ -1,34 +0,0 @@
# LoRA group: a sharded single-device run of tests/lora plus a 4-device step
# for the tensor-parallel tests that the sharded run excludes.
group: LoRA
depends_on:
- image-build
steps:
# Sharded over 4 jobs. Every file passed to --ignore here is expected to be
# run by the "LoRA TP (Distributed)" step below.
- label: LoRA %N
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py
  parallelism: 4

# NOTE(review): lora/test_deepseekv2_tp.py and lora/test_qwen3moe_tp.py are
# ignored by the sharded step above but are not run here either — confirm
# they are covered by another pipeline or are excluded on purpose.
- label: LoRA TP (Distributed)
  timeout_in_minutes: 30
  num_devices: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # FIXME: find out which code initializes CUDA before the test runs;
  # until that is fixed, the tests must use the spawn start method.
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  # A lot of these tests are on the edge of OOMing
  - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
  # There is some Tensor Parallelism related processing logic in LoRA that
  # requires multi-GPU testing for validation.
  # Each file runs in its own pytest invocation; -x stops at the first failure.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
  - pytest -v -s -x lora/test_olmoe_tp.py
  - pytest -v -s -x lora/test_gptoss_tp.py
  - pytest -v -s -x lora/test_qwen35_densemodel_lora.py

Some files were not shown because too many files have changed in this diff Show More