Compare commits

..

11 Commits

Author SHA1 Message Date
Shengqi Chen
b17039bccc [CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032)
(cherry picked from commit 8e61425ee6)
2026-01-16 21:04:48 -08:00
Cyrus Leung
48b67ba75f [Frontend] Standardize use of create_error_response (#32319)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-16 11:35:10 +00:00
TJian
09f4264a55 [Bugfix] Fix ROCm dockerfiles (#32447)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-16 10:50:00 +08:00
Matthew Bonanni
7f42dc20bb [CI] Fix LM Eval Large Models (H100) (#32423)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit bcf2333cd6)
2026-01-15 18:00:21 -08:00
TJian
c2a37a3cf8 Cherry pick [ROCm] [CI] [Release] Rocm wheel pipeline with sccache #32264
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-01-15 17:59:58 -08:00
Michael Goin
0e31fc7996 [UX] Use kv_offloading_backend=native by default (#32421)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit 1be5a73571)
2026-01-15 17:55:20 -08:00
Pleaplusone
6ac0fcf416 [ROCm][Bugfix] Disable hip sampler to fix deepseek's accuracy issue on ROCm (#32413)
Signed-off-by: ganyi <ygan@amd.com>
(cherry picked from commit 77c16df31d)
2026-01-15 17:55:06 -08:00
Douglas Lehr
b62249725c [ROCM] Add ROCm image build to release pipeline (#31995)
Signed-off-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
(cherry picked from commit c5891b5430)
2026-01-15 17:54:47 -08:00
vllmellm
1b57275207 [Bugfix][ROCm][performance] Resolve the performance regression issue of the Qwen3-Next-80B-A3B-Thinking under rocm_atten (#32336)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
(cherry picked from commit e27078ea80)
2026-01-15 17:54:01 -08:00
Martin Hickey
2c24bc6996 [BugFix] [KVConnector] Fix KV events for LMCache connector (#32169)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-13 10:56:23 -08:00
Cyrus Leung
0aa8c40552 [Bugfix] Replace PoolingParams.normalize with use_activation (#32243)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-13 10:56:23 -08:00
2513 changed files with 85704 additions and 277719 deletions

View File

@@ -1,8 +1,7 @@
name: vllm_ci
job_dirs:
- ".buildkite/image_build"
- ".buildkite/test_areas"
- ".buildkite/hardware_tests"
- ".buildkite/image_build"
run_all_patterns:
- "docker/Dockerfile"
- "CMakeLists.txt"

View File

@@ -1,30 +0,0 @@
group: Hardware - AMD Build
steps:
- label: "AMD: :docker: build image"
key: image-build-amd
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1

View File

@@ -1,10 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Ascend NPU Test"
soft_fail: true
timeout_in_minutes: 20
no_plugin: true
device: ascend_npu
commands:
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh

View File

@@ -1,114 +0,0 @@
group: CPU
depends_on: []
steps:
- label: CPU-Kernel Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- cmake/cpu_extension.cmake
- CMakeLists.txt
- vllm/_custom_ops.py
- tests/kernels/attention/test_cpu_attn.py
- tests/kernels/moe/test_cpu_fused_moe.py
- tests/kernels/test_onednn.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Compatibility Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- cmake/cpu_extension.cmake
- setup.py
- vllm/platforms/cpu.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
- label: CPU-Language Generation and Pooling Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/
- tests/models/language/generation/
- tests/models/language/pooling/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model"
- label: CPU-Quantization Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/model_executor/layers/quantization/cpu_wna16.py
- vllm/model_executor/layers/quantization/gptq_marlin.py
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
- tests/quantization/test_compressed_tensors.py
- tests/quantization/test_cpu_wna16.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
- label: CPU-Distributed Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/shm.cpp
- vllm/v1/worker/cpu_worker.py
- vllm/v1/worker/gpu_worker.py
- vllm/v1/worker/cpu_model_runner.py
- vllm/v1/worker/gpu_model_runner.py
- vllm/platforms/cpu.py
- vllm/distributed/parallel_state.py
- vllm/distributed/device_communicators/cpu_communicator.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
- label: CPU-Multi-Modal Model Tests %N
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
# - vllm/
- vllm/model_executor/layers/rotary_embedding
- tests/models/multimodal/generation/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
parallelism: 2
- label: "Arm CPU Test"
depends_on: []
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -1,10 +0,0 @@
group: Hardware
steps:
- label: "GH200 Test"
soft_fail: true
device: gh200
no_plugin: true
optional: true
commands:
- nvidia-smi
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh

View File

@@ -1,17 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Intel HPU Test"
soft_fail: true
device: intel_hpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- label: "Intel GPU Test"
depends_on: []
soft_fail: true
device: intel_gpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh

View File

@@ -1,255 +1,56 @@
#!/bin/bash
set -euo pipefail
set -e
# replace invalid characters in Docker image tags and truncate to 128 chars
clean_docker_tag() {
local input="$1"
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
exit 1
}
print_instance_info() {
echo ""
echo "=== Debug: Instance Information ==="
# Get IMDSv2 token
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
echo "AMI ID: ${AMI_ID}"
echo "Instance Type: ${INSTANCE_TYPE}"
echo "Instance ID: ${INSTANCE_ID}"
echo "AZ: ${AZ}"
else
echo "Not running on EC2 or IMDS not available"
fi
# Check for warm cache AMI (marker file baked into custom AMI)
if [[ -f /etc/vllm-ami-info ]]; then
echo "Cache: warm (custom vLLM AMI)"
cat /etc/vllm-ami-info
else
echo "Cache: cold (standard AMI)"
fi
echo "==================================="
echo ""
}
setup_buildx_builder() {
echo "--- :buildkite: Setting up buildx builder"
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
# Custom AMI with standalone buildkitd - use remote driver for warm cache
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
echo "Using remote driver to connect to buildkitd (warm cache available)"
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
echo "Using existing baked-vllm-builder"
docker buildx use baked-vllm-builder
else
echo "Creating baked-vllm-builder with remote driver"
docker buildx create \
--name baked-vllm-builder \
--driver remote \
--use \
"unix://${BUILDKIT_SOCKET}"
fi
docker buildx inspect --bootstrap
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
# Existing builder available
echo "Using existing builder: ${BUILDER_NAME}"
docker buildx use "${BUILDER_NAME}"
docker buildx inspect --bootstrap
else
# No local buildkitd, no existing builder - create new docker-container builder
echo "No local buildkitd found, using docker-container driver"
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
docker buildx inspect --bootstrap
fi
# builder info
echo "Active builder:"
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
check_and_skip_if_image_exists() {
if [[ -n "${IMAGE_TAG:-}" ]]; then
echo "--- :mag: Checking if image exists"
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
echo "Image already exists: ${IMAGE_TAG}"
echo "Skipping build"
exit 0
fi
echo "Image not found, proceeding with build"
fi
}
ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}
prepare_cache_tags() {
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
cache="${MAIN_CACHE_ECR}:latest"
else
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
cache="${TEST_CACHE_ECR}:${clean_branch}"
fi
CACHE_TO="$cache"
CACHE_FROM="$cache"
CACHE_FROM_BASE_BRANCH="$cache"
else
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
else
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
fi
fi
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
resolve_parent_commit() {
if [[ -z "${PARENT_COMMIT:-}" ]]; then
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
if [[ -n "${PARENT_COMMIT}" ]]; then
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
export PARENT_COMMIT
else
echo "Could not determine parent commit (may be first commit in repo)"
fi
else
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
fi
}
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
}
#################################
# Main Script #
#################################
print_instance_info
if [[ $# -lt 5 ]]; then
print_usage_and_exit
if [[ $# -lt 8 ]]; then
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
fi
# input args
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
IMAGE_TAG=$5
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8
# build config
TARGET="test-ci"
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
prepare_cache_tags
ecr_login
# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# Environment info (for docs and human readers)
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
#
# Build configuration (exported as environment variables for bake):
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
check_and_skip_if_image_exists
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
echo "Make sure you're running from the vLLM repository root"
exit 1
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
echo "--- :arrow_down: Downloading ci.hcl"
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
echo "Downloaded to ${CI_HCL_PATH}"
if [[ ! -f "${CI_HCL_PATH}" ]]; then
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
exit 1
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
merge_base_commit_build_args=""
fi
setup_buildx_builder
resolve_parent_commit
export PARENT_COMMIT
print_bake_config
echo "--- :docker: Building ${TARGET}"
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
echo "--- :white_check_mark: Build complete"
# build
docker buildx build --file docker/Dockerfile \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
--cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .

View File

@@ -3,9 +3,8 @@ steps:
- label: ":docker: Build image"
key: image-build
depends_on: []
timeout_in_minutes: 600
commands:
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
retry:
automatic:
- exit_status: -1 # Agent was lost
@@ -41,7 +40,7 @@ steps:
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU arm64 image"
key: cpu-arm64-image-build
depends_on: []

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,11 +24,13 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_X86=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,10 +24,10 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -25,10 +25,10 @@ fi
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu

View File

@@ -1,15 +0,0 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.695
- name: "exact_match,flexible-extract"
value: 0.447
limit: 1319
num_fewshot: 5
max_model_len: 262144
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -1,19 +0,0 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.7142
- name: "exact_match,flexible-extract"
value: 0.4579
env_vars:
VLLM_USE_FLASHINFER_MOE_FP8: "1"
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
limit: 1319
num_fewshot: 5
max_model_len: 262144
kv_cache_dtype: fp8
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -1,2 +1 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml

View File

@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml

View File

@@ -1,5 +0,0 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit "$LIMIT"
--limit $LIMIT

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -20,11 +20,14 @@ usage() {
echo
}
while getopts "m:l:f:t:" OPT; do
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;

View File

@@ -13,10 +13,9 @@ import os
from contextlib import contextmanager
import lm_eval
import numpy as np
import yaml
from vllm.platforms import current_platform
DEFAULT_RTOL = 0.08
@@ -64,9 +63,6 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
)
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
model_args += "attention_backend=TRITON_ATTN"
env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate(
@@ -106,8 +102,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success

View File

@@ -83,6 +83,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},

View File

@@ -7,10 +7,8 @@ import argparse
import html as _html
import json
import os
from contextlib import nullcontext
from dataclasses import dataclass
from importlib import util
from pathlib import Path
import pandas as pd
@@ -33,45 +31,6 @@ pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if "concurr" in str(c).lower():
s = df[c]
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
return c
raise ValueError(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def _normalize_concurrency_in_df(
df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
if canonical in df.columns:
return df
detected = _find_concurrency_col(df)
if detected in df.columns and detected != canonical:
return df.rename(columns={detected: canonical})
df[canonical] = pd.NA
return df
# -----------------------------
# Core data compare
# -----------------------------
@@ -91,25 +50,19 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can
group by columns.
- If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
"""
print("\ncompare_data_column:", data_column)
frames = []
raw_data_cols: list[str] = []
compare_frames = []
# Determine key cols after normalizing concurrency
cols_per_file: list[set] = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
except Exception as err:
raise ValueError(f"Failed to read {f}") from err
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -120,25 +73,12 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files."
)
union_index = None
metas: list[pd.DataFrame] = []
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
meta_added = False
for file in files:
df = pd.read_json(file, orient="records")
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc = str(data_column).lower()
is_latency_metric = (
"ttft" in metric_lc
or "tpot" in metric_lc
or "p99" in metric_lc
or "median" in metric_lc
or metric_lc.strip() in {"p99", "median"}
)
if is_latency_metric and drop_column in df.columns:
if drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True)
for c in (
@@ -163,61 +103,35 @@ def compare_data_columns(
meta = meta.groupby(level=key_cols, dropna=False).first()
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
if data_column in df_idx.columns:
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
else:
# keep NA series to preserve meta keys for union_index
s = pd.Series(pd.NA, index=meta.index)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label
name_s = None
if not meta_added:
frames.append(meta)
meta_added = True
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name"
frames.append(name_s)
if union_index is None:
union_index = meta.index
else:
union_index = union_index.union(meta.index)
metas.append(meta)
staged.append((file_label, s, name_s))
if union_index is None:
raise ValueError("No data found after loading inputs.")
# meta first (union-aligned): build UNION meta across all files
if metas:
meta_union = pd.concat(metas, axis=0)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
frames.append(meta_union.reindex(union_index))
# values + ratios (union-aligned)
metric_series_aligned: list[pd.Series] = []
for file_label, s, name_s in staged:
s_aligned = s.reindex(union_index)
frames.append(s_aligned)
frames.append(s)
raw_data_cols.append(file_label)
metric_series_aligned.append(s_aligned)
compare_frames.append(s)
if debug and name_s is not None:
frames.append(name_s.reindex(union_index))
if len(metric_series_aligned) >= 2:
base = metric_series_aligned[0]
current = metric_series_aligned[-1]
if "P99" in str(data_column) or "Median" in str(data_column):
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
if "P99" in data_column or "Median" in data_column:
ratio = base / current
else:
ratio = current / base
ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
@@ -288,10 +202,24 @@ def split_json_by_tp_pp(
# -----------------------------
# Styling helpers
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold(
df: pd.DataFrame,
threshold: float,
slack_pct: float = 0.0,
df: pd.DataFrame, threshold: float
) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df)
key_cols = [
@@ -304,24 +232,12 @@ def _highlight_threshold(
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
slack_limit = threshold * (1.0 + slack_pct / 100.0)
def _cell(v):
if pd.isna(v):
return ""
if v <= threshold:
# Strict SLA
return "background-color:#e6ffe6;font-weight:bold;"
if v <= slack_limit:
# Within slack range
return "background-color:#ffe5cc;font-weight:bold;"
return ""
return df.style.map(_cell, subset=conf_cols)
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
else "",
subset=conf_cols,
)
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -359,177 +275,6 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name = "sheet" if name is None else str(name)
# Replace illegal characters with underscore.
trans = str.maketrans(
{
":": "_",
"\\": "_",
"/": "_",
"?": "_",
"*": "_",
"[": "_",
"]": "_",
}
)
name = name.translate(trans)
# Strip quotes/spaces and collapse whitespace.
name = name.strip().strip("'")
name = " ".join(name.split())
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
# Always keep input/output lengths (these are important).
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
# Shorten model name aggressively to make room for lens.
model = d.get("Model", "model")
leaf = str(model).split("/")[-1]
max_model_len = max(1, 31 - len(lens))
model_short = leaf[:max_model_len]
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
"""Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if not blocks:
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
return
combined_parts: list[pd.DataFrame] = []
for title, df in blocks:
df2 = df.copy()
# Put the section label as the first column for readability.
df2.insert(0, "Section", title)
combined_parts.append(df2)
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
combined.to_excel(writer, sheet_name=sheet, index=False)
def _safe_filename(s: str) -> str:
# Fast path without the third-party `regex` module.
s = " ".join(str(s).strip().split())
allowed = []
for ch in s:
if ch.isalnum() or ch in "._-":
allowed.append(ch)
else:
allowed.append("_")
out = "".join(allowed)
return out[:180] if len(out) > 180 else out
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
@@ -556,11 +301,7 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok(
df: pd.DataFrame,
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
):
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA
@@ -573,14 +314,7 @@ def _max_concurrency_ok(
if d.empty:
return pd.NA
# Accept values up to (1 + slack_pct%) above the SLA.
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
ok = d[d[cfg_col] <= threshold]
if ok.empty:
return pd.NA
@@ -646,25 +380,15 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
@@ -693,8 +417,8 @@ def build_valid_max_concurrency_summary_html(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
@@ -704,6 +428,7 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns:
if c == "Configuration":
continue
@@ -711,10 +436,12 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {}
for c in summary_df.columns:
if c == "Configuration":
continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters)
@@ -733,104 +460,6 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# -----------------------------
# Plot helper
# -----------------------------
@@ -908,35 +537,6 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
# ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser
@@ -1015,13 +615,9 @@ def render_metric_table_html(
metric_name = metric_label.lower()
if "ttft" in metric_name:
styler = _highlight_threshold(
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
styler = _highlight_threshold(display_group, args.ttft_max_ms)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold(
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
styler = _highlight_threshold(display_group, args.tpot_max_ms)
else:
styler = display_group.style
@@ -1061,6 +657,7 @@ def maybe_write_plot(
markers=True,
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f")
@@ -1133,186 +730,87 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items()
}
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
if csv_dir:
csv_dir.mkdir(parents=True, exist_ok=True)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
excel_path = args.excel_out or "perf_comparison.xlsx"
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
main_fh.write(group_header)
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
excel_engine = (
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
excel_engine_kwargs = {}
if excel_engine == "xlsxwriter":
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs = {"options": {"constant_memory": True}}
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
xw_ctx = (
nullcontext(None)
if disable_excel
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
)
)
with xw_ctx as xw:
used_sheets: set[str] = set()
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if xw is not None:
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
main_fh.write(group_header)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
if do_excel:
dedup_i = 1
while sheet in used_sheets:
dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
excel_blocks.append(
(metric_label, group_df.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
def main():

View File

@@ -393,7 +393,7 @@ if __name__ == "__main__":
with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/performance-benchmarks/"
"performance-benchmarks-descriptions.md"
+ "performance-benchmarks-descriptions.md"
)
results = results.format(
latency_tests_markdown_table=latency_md_table,

View File

@@ -1,4 +1,6 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
@@ -7,26 +9,14 @@
set -x
set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then
@@ -35,9 +25,9 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g arch_suffix=''
if command -v nvidia-smi; then
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
@@ -54,7 +44,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo "$numa_count"
echo $numa_count
else
echo "Need at least 1 NUMA to run benchmarking."
exit 1
@@ -122,12 +112,13 @@ json2envs() {
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -sf http://localhost:8000/v1/models >/dev/null; do
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done
'
done' && return 0 || return 1
}
kill_processes_launched_by_current_bash() {
@@ -190,318 +181,19 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
run_latency_tests() {
# run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases
extract_metric_ms() {
local metric_name=$1
local json_file=$2
local latency_test_file
latency_test_file=$1
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
# $2: a json file specifying test cases
local test_type=$1
local test_file=$2
# Iterate over tests
jq -c '.[]' "$test_file" | while read -r params; do
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1
fi
@@ -512,15 +204,15 @@ run_benchmark_tests() {
fi
# get arguments
bench_params=$(echo "$params" | jq -r '.parameters')
bench_args=$(json2args "$bench_params")
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
bench_envs=$(json2envs "$bench_environment_variables")
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
latency_envs=$(json2envs "$latency_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -533,42 +225,118 @@ run_benchmark_tests() {
fi
fi
bench_command=" $bench_envs vllm bench $test_type \
latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$bench_args"
$latency_args"
echo "Running test case $test_name"
echo "${test_type^} command: $bench_command"
echo "Latency command: $latency_command"
# recording benchmarking command and GPU command
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg command "$bench_command" \
--arg latency "$latency_command" \
--arg gpu "$gpu_type" \
--arg test_type "$test_type" \
'{
($test_type + "_command"): $command,
latency_command: $latency,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$bench_command"
eval "$latency_command"
kill_gpu_processes
done
}
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
run_throughput_tests() {
# run throughput tests using `vllm bench throughput`
# $1: a json file specifying throughput test cases
merge_serving_tests_stream() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
# shellcheck disable=SC2016
local merged='
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
throughput_envs=$(json2envs "$throughput_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi
throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
if type == "array" then
# Plain format: test cases array
.[]
@@ -590,50 +358,7 @@ merge_serving_tests_stream() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
'
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -652,48 +377,10 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -725,13 +412,14 @@ run_serving_tests() {
fi
# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
server_command="$server_envs vllm serve $server_model \
server_command="$server_envs vllm serve \
$server_args"
# run the server
@@ -739,7 +427,7 @@ run_serving_tests() {
echo "Server command: $server_command"
# support remote vllm server
client_remote_args=""
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -750,9 +438,6 @@ run_serving_tests() {
echo ""
echo "vLLM failed to start within the timeout period."
fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
@@ -762,48 +447,34 @@ run_serving_tests() {
fi
fi
# save the compilation mode and optimization level on the serving results
# whenever they are set
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args_effective $client_remote_args "
--metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
@@ -818,23 +489,15 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done
# clean up
if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill -9 "$server_pid"
kill_gpu_processes
fi
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
local ARCH
ARCH=''
if [[ "$ON_CPU" == "1" ]]; then
@@ -844,13 +507,7 @@ main() {
check_gpus
ARCH="$arch_suffix"
fi
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -871,24 +528,17 @@ main() {
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite
}

View File

@@ -51,56 +51,5 @@
"max-model-len": 256,
"async-scheduling": ""
}
},
{
"test_name": "latency_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"max-model-len": 2048,
"dtype": "bfloat16"
}
},
{
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"max-model-len": 512,
"max-num-seqs": 128,
"async-scheduling": "",
"gpu-memory-utilization": 0.95,
"enable_expert_parallel": ""
}
},
{
"test_name": "latency_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"max-model-len": 2048,
"max-num-seqs": 128,
"dtype": "bfloat16",
"async-scheduling": ""
}
}
]

View File

@@ -1,37 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,41 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,355 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -72,6 +72,17 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
@@ -94,6 +105,17 @@
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
@@ -117,25 +139,144 @@
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]

View File

@@ -10,6 +10,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -36,6 +37,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -62,6 +64,7 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -75,83 +78,5 @@
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_deepseek_r1",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
"max-num-seqs": 200,
"async-scheduling": "",
"dtype": "bfloat16"
},
"client_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"max-model-len": 2048,
"max-num-seqs": 128,
"async-scheduling": "",
"enable_expert_parallel": "",
"max-num-batched-tokens": 4096
},
"client_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_qwen3_8b",
"qps_list": [1, 4, 10, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"disable_log_stats": "",
"async-scheduling": ""
},
"client_parameters": {
"model": "Qwen/Qwen-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@@ -5,6 +5,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -22,6 +23,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -39,6 +41,7 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -56,6 +59,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,

View File

@@ -57,67 +57,5 @@
"max-num-seqs": 512,
"async-scheduling": ""
}
},
{
"test_name": "throughput_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 384,
"async-scheduling": ""
}
},
{
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": "",
"enable_expert_parallel": ""
}
},
{
"test_name": "throughput_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"max-num-seqs": 512,
"backend": "vllm",
"async-scheduling": ""
}
}
]

View File

@@ -1,270 +1,277 @@
steps:
# aarch64 + CUDA builds
- label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build wheel - aarch64 - CPU"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~
id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~
id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 CPU wheel build
- label: "Build wheel - x86_64 - CPU"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# Build release images (CUDA 12.9)
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- block: "Build CUDA 13.0 release images"
key: block-release-image-build-cuda-13-0
depends_on: ~
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Build release image - aarch64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- input: "Provide Release version here"
id: input-release-version
fields:
- text: "What is the release version?"
key: release-version
- group: "Build Python wheels"
key: "build-wheels"
steps:
- label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
depends_on:
- input-release-version
- build-wheel-x86-cuda-12-9
- build-wheel-x86-cuda-13-0
- build-wheel-x86-cpu
- build-wheel-arm64-cuda-12-9
- build-wheel-arm64-cuda-13-0
- build-wheel-arm64-cpu
- label: "Build wheel - aarch64 - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Upload release wheels to PyPI and GitHub"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels.sh"
- label: "Build wheel - aarch64 - CPU"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
- label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~
id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
- label: "Build and publish CPU release image"
depends_on: block-cpu-release-image-build
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~
id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- block: "Build arm64 CPU release image"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build wheel - x86_64 - CPU"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build and publish arm64 CPU release image"
depends_on: block-arm64-cpu-release-image-build
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- group: "Build release Docker images"
key: "build-release-images"
steps:
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- block: "Build ROCm release image"
key: block-rocm-release-image-build
depends_on: ~
- label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
- label: "Build release image (ROCm)"
depends_on: block-rocm-release-image-build
id: build-release-image-rocm
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# Build base image first
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
# Build vLLM ROCm image using the base
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: ~
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Build release image - aarch64 - CUDA 13.0"
depends_on: ~
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- block: "Build release image for x86_64 CPU"
key: block-cpu-release-image-build
depends_on: ~
- label: "Build release image - x86_64 - CPU"
depends_on:
- block-cpu-release-image-build
- input-release-version
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image for arm64 CPU"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build release image - arm64 - CPU"
depends_on:
- block-arm64-cpu-release-image-build
- input-release-version
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- group: "Publish release images"
key: "publish-release-images"
steps:
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
depends_on:
- create-multi-arch-manifest-cuda-13-0
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels"
key: "publish-wheels"
steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
depends_on:
- input-release-version
- build-wheels
- label: "Upload release wheels to PyPI"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- "docker push vllm/vllm-openai:nightly-x86_64"
- "docker push vllm/vllm-openai:nightly-aarch64"
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest push vllm/vllm-openai:nightly"
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
@@ -459,7 +466,7 @@ steps:
S3_BUCKET: "vllm-wheels"
# ROCm Job 2: Build vLLM ROCm Wheel
- label: ":python: Build vLLM ROCm Wheel - x86_64"
- label: ":python: Build vLLM ROCm Wheel"
id: build-rocm-vllm-wheel
depends_on:
- step: build-rocm-base-wheels
@@ -621,93 +628,9 @@ steps:
depends_on:
- step: upload-rocm-wheels
allow_failure: true
- step: input-release-version
allow_failure: true
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-rocm-release.sh"
env:
S3_BUCKET: "vllm-wheels"
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
- block: "Generate Root Index for ROCm Wheels for Release"
key: block-generate-root-index-rocm-wheels
depends_on: upload-rocm-wheels
- label: ":package: Generate Root Index for ROCm Wheels for Release"
depends_on: block-generate-root-index-rocm-wheels
id: generate-root-index-rocm-wheels
agents:
queue: cpu_queue_postmerge
commands:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
S3_BUCKET: "vllm-wheels"
VARIANT: "rocm700"
# ROCm Job 5: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
id: build-rocm-release-image
depends_on:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 60
commands:
- |
set -euo pipefail
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Download Docker image from S3 (set by build-rocm-base-wheels)
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
echo "ERROR: rocm-docker-image-s3-path metadata not found"
exit 1
fi
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
mkdir -p artifacts/rocm-docker-image
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
# Load base Docker image
echo "Loading base Docker image..."
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
echo "Loaded base image: $${BASE_IMAGE_TAG}"
# Tag and push the base image to ECR
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
# Get GPU architectures from meta-data
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
# Build vLLM ROCm release image using cached base
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
--target vllm-openai \
--progress plain \
-f docker/Dockerfile.rocm .
# Push to ECR
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"

View File

@@ -11,36 +11,28 @@ fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit):
\`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
(Optional) For CUDA 13.0:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download the wheel (by version):
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
# Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
# Tag and push images:
## CUDA
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -48,70 +40,22 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
## ROCm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
## CPU
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai:latest-rocm
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
\`\`\`
EOF
EOF

View File

@@ -3,32 +3,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Generate Buildkite annotation for ROCm wheel release
set -ex
# Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# TODO: Enable the nightly build for ROCm
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev"
fi
# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases
## :rocm: ROCm Wheel Release
### Build Configuration
| Setting | Value |
|---------|-------|
@@ -41,72 +34,41 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
### :package: Installation
**Install from this build (by commit):**
\`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
# Example for ROCm ${ROCM_VERSION}:
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
# Example:
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\`
**Install from nightly (if published):**
\`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
\`\`\`
### :floppy_disk: Download Wheels Directly
\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
\`\`\`
### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton**: Triton
- **triton-kernels**: Triton kernels
- **triton_rocm**: Triton built for ROCm
- **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
- **amd_aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm
### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
### :package: Docker Image Release
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@@ -83,7 +83,7 @@ case "${1:-}" in
exit 1
fi
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
echo ""
echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo ""
echo "Total: $WHEEL_COUNT wheels"
echo "========================================"

View File

@@ -1,213 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Check if Ray LLM can generate lock files that are compatible with this
# version of vllm. Downloads Ray's requirement files and runs a full
# dependency resolution with the installed vllm's constraints to see if
# a valid lock file can be produced.
#
# See: https://github.com/vllm-project/vllm/issues/33599
set -eo pipefail
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT
# Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files"
RAY_FILES=(
"requirements.txt"
"requirements/cloud-requirements.txt"
"requirements/base-test-requirements.txt"
"requirements/llm/llm-requirements.txt"
"requirements/llm/llm-test-requirements.txt"
)
for FILE in "${RAY_FILES[@]}"; do
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
echo " ${FILE}"
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
done
# Extract installed vllm deps
echo ">>> Extracting installed vllm dependency constraints"
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
"""Write out the installed vllm's dependencies as pip constraint lines.
Ray uses vllm[audio], so audio-extra deps are included with their extra
markers stripped. The resolver cannot evaluate extra markers for a
package that is not itself being resolved from an index, so we activate
them manually here.
"""
import importlib.metadata
import re
import sys
out_path = sys.argv[1]
raw_reqs = importlib.metadata.requires("vllm") or []
# Ray uses vllm[audio] activate that extra.
ACTIVE_EXTRAS = {"audio"}
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
lines = []
for r in raw_reqs:
if ";" not in r:
# Unconditional dep — always include.
lines.append(r.strip())
continue
req_part, _, marker_part = r.partition(";")
marker_part = marker_part.strip()
extra_matches = EXTRA_RE.findall(marker_part)
if not extra_matches:
# Non-extra marker (python_version, etc.) — keep as-is.
lines.append(r.strip())
continue
if not ACTIVE_EXTRAS.intersection(extra_matches):
continue # Skip inactive extras (tensorizer, bench, …).
# Strip the extra== conditions but keep any remaining markers
# (e.g. python_version).
cleaned = EXTRA_RE.sub("", marker_part)
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
if cleaned:
lines.append(f"{req_part.strip()} ; {cleaned}")
else:
lines.append(req_part.strip())
with open(out_path, "w") as f:
for line in lines:
f.write(line + "\n")
print(f"Wrote {len(lines)} constraints to {out_path}")
PYEOF
echo ">>> Installed vllm deps (first 20 lines):"
head -20 "${WORK_DIR}/vllm-constraints.txt"
# Remove Ray's vllm pin — the installed vllm's transitive deps
# (written above) replace it in the resolution. vllm itself cannot
# be resolved from PyPI for in-development versions, so we test
# whether Ray's requirements can coexist with vllm's dependency
# constraints instead.
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
# Install uv if needed
if ! command -v uv &>/dev/null; then
echo ">>> Installing uv"
pip install uv -q
fi
# Resolve: given vllm's constraints, can Ray compile a lock file?
#
# vllm's dependency constraints are the fixed side — Ray is flexible and
# can regenerate its lock files. We pass vllm's constraints via -c so
# the resolver treats them as non-negotiable bounds, then check whether
# Ray's own requirements can still be satisfied within those bounds.
echo ""
echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================"
set +e
uv pip compile \
"${WORK_DIR}/requirements.txt" \
"${WORK_DIR}/cloud-requirements.txt" \
"${WORK_DIR}/base-test-requirements.txt" \
"${WORK_DIR}/llm-requirements.txt" \
"${WORK_DIR}/llm-test-requirements.txt" \
-c "${WORK_DIR}/vllm-constraints.txt" \
--python-version 3.12 \
--python-platform x86_64-manylinux_2_31 \
--extra-index-url https://download.pytorch.org/whl/cu129 \
--index-strategy unsafe-best-match \
--unsafe-package setuptools \
--unsafe-package ray \
--no-header \
-o "${WORK_DIR}/resolved.txt" \
2>&1
EXIT_CODE=$?
set -e
echo ""
echo "=========================================="
if [ $EXIT_CODE -eq 0 ]; then
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
echo ""
echo "Key resolved versions:"
grep -E '^(protobuf|torch|numpy|transformers)==' \
"${WORK_DIR}/resolved.txt" | sort || true
echo "=========================================="
exit 0
fi
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
echo "This means a fundamental dependency conflict exists that Ray"
echo "cannot resolve by regenerating its lock files."
echo "See: https://github.com/vllm-project/vllm/issues/33599"
echo "=========================================="
# Buildkite annotation
if [ -f /usr/bin/buildkite-agent ]; then
buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
### :warning: Ray Dependency Compatibility Warning
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
Ray would not be able to regenerate its lock files to accommodate this vllm version.
Please check the **Ray Dependency Compatibility Check** step logs for details.
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
EOF
fi
# Notify Slack if webhook is configured and PR/branch are valid.
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
PR="${BUILDKITE_PULL_REQUEST:-}"
BRANCH="${BUILDKITE_BRANCH:-}"
# Skip notification if PR is invalid or branch is empty
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
else
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
import json, os, sys
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
url = os.getenv("BUILDKITE_BUILD_URL", "#")
data = {
"text": ":warning: Ray Dependency Compatibility Check Failed",
"blocks": [{
"type": "section",
"text": {
"type": "mrkdwn",
"text": (
"*:warning: Ray Dependency Compatibility Check Failed*\n"
f"PR #{pr} on branch `{branch}` introduces dependencies "
f"that cannot be resolved with Ray'\''s requirements.\n"
f"<{url}|View Build>"
),
},
}],
}
print(json.dumps(data))
')
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
fi
else
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi
exit 1

View File

@@ -1,242 +0,0 @@
#!/bin/bash
#
# cherry-pick-from-milestone.sh
# Find commits from a GitHub milestone that are missing from the current branch
# and output them in chronological order for cherry-picking.
#
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
#
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
usage() {
cat <<EOF
Usage: $(basename "$0") <milestone> [options]
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
Arguments:
milestone The GitHub milestone name (e.g., v0.14.0)
Options:
--dry-run Show the cherry-pick commands without executing (default)
--execute Actually execute the cherry-picks
--main-branch Specify the main branch name (default: main)
--help Show this help message
Examples:
$(basename "$0") v0.14.0
$(basename "$0") v0.14.0 --dry-run
$(basename "$0") v0.14.0 --execute
$(basename "$0") v0.14.0 --main-branch master
EOF
exit 1
}
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[OK]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Default values
MILESTONE=""
DRY_RUN=true
MAIN_BRANCH="main"
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--dry-run)
DRY_RUN=true
shift
;;
--execute)
DRY_RUN=false
shift
;;
--main-branch)
MAIN_BRANCH="$2"
shift 2
;;
--help|-h)
usage
;;
-*)
log_error "Unknown option: $1"
usage
;;
*)
if [[ -z "$MILESTONE" ]]; then
MILESTONE="$1"
else
log_error "Unexpected argument: $1"
usage
fi
shift
;;
esac
done
# Validate milestone argument
if [[ -z "$MILESTONE" ]]; then
log_error "Milestone is required"
usage
fi
# Check if we're in a git repository
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
log_error "Not in a git repository"
exit 1
fi
# Check if gh CLI is available
if ! command -v gh &>/dev/null; then
log_error "GitHub CLI (gh) is not installed"
exit 1
fi
# Check if authenticated with gh
if ! gh auth status &>/dev/null; then
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
exit 1
fi
CURRENT_BRANCH=$(git branch --show-current)
log_info "Current branch: ${CURRENT_BRANCH}"
log_info "Main branch: ${MAIN_BRANCH}"
log_info "Milestone: ${MILESTONE}"
echo ""
# Fetch latest from remote
log_info "Fetching latest from remote..."
git fetch origin "$MAIN_BRANCH" --quiet
# Get merged PRs from the milestone, sorted by merge date
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file
PR_DATA=$(mktemp)
trap 'rm -f "$PR_DATA"' EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \
--json number,title,mergeCommit,mergedAt \
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
log_error "This could be due to:"
log_error " - Milestone does not exist"
log_error " - Network/authentication issues"
log_error " - Invalid milestone name format"
exit 1
fi
if [[ ! -s "$PR_DATA" ]]; then
log_warn "No merged PRs found for milestone '${MILESTONE}'"
exit 0
fi
TOTAL_PRS=$(wc -l < "$PR_DATA")
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
echo ""
# Find commits that are missing from current branch
MISSING_COMMITS=()
MISSING_INFO=()
while IFS=$'\t' read -r sha pr_number title; do
# Skip if SHA is empty or null
if [[ -z "$sha" || "$sha" == "null" ]]; then
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
continue
fi
# Check if this commit is already in the current branch
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
log_success "PR #${pr_number} already in branch: ${title:0:60}"
else
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
MISSING_COMMITS+=("$sha")
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
fi
done < "$PR_DATA"
echo ""
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
exit 0
fi
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
echo ""
# Output the cherry-pick commands
echo "=========================================="
echo "Cherry-pick commands (in chronological order):"
echo "=========================================="
echo ""
for info in "${MISSING_INFO[@]}"; do
echo "# $info"
done
echo ""
echo "# Run these commands to cherry-pick all missing commits:"
echo "git cherry-pick ${MISSING_COMMITS[*]}"
echo ""
# Or one by one
echo "# Or cherry-pick one at a time:"
for sha in "${MISSING_COMMITS[@]}"; do
echo "git cherry-pick $sha"
done
echo ""
# Execute if requested
if [[ "$DRY_RUN" == false ]]; then
echo "=========================================="
log_info "Executing cherry-picks..."
echo "=========================================="
for i in "${!MISSING_COMMITS[@]}"; do
sha="${MISSING_COMMITS[$i]}"
info="${MISSING_INFO[$i]}"
echo ""
log_info "Cherry-picking: $info"
if git cherry-pick "$sha"; then
log_success "Successfully cherry-picked $sha"
else
log_error "Failed to cherry-pick $sha"
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
exit 1
fi
done
echo ""
log_success "All cherry-picks completed successfully!"
else
echo "=========================================="
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
echo "Run with --execute to perform the cherry-picks"
echo "=========================================="
fi

View File

@@ -3,14 +3,7 @@
set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
# Get tag prefix from argument, default to "nightly-" if not provided
TAG_PREFIX="${1:-nightly-}"
echo "Cleaning up tags with prefix: $TAG_PREFIX"
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -52,7 +45,7 @@ get_all_tags() {
set -x
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
if [ -z "$tags" ]; then
break

View File

@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
"""
Generate project list HTML content linking to each project & variant subdirectory.
Generate project list HTML content linking to each project & variant sub-directory.
"""
href_tags = []
for name in sorted(subdir_names):
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
as the default variant index, but the links are adjusted accordingly.
Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant subdirectory
cpu/ # cpu variant sub-directory
index.html
vllm/
index.html
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
vllm/
index.html
metadata.json
cu130/ # cu130 variant subdirectory
cu130/ # cu130 variant sub-directory
index.html
vllm/
index.html

View File

@@ -1,57 +1,25 @@
#!/bin/bash
# This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
#
###############################################################################
# QUOTING / COMMAND PASSING
#
# Passing commands as positional arguments ($*) is fragile when the command
# string itself contains double quotes, e.g.:
#
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
#
# The outer shell resolves the nested quotes *before* this script runs, so
# the script receives mangled input it cannot fully recover.
#
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
#
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
# bash run-amd-test.sh
#
# Single-quoted assignment preserves all inner double quotes verbatim.
# The $* path is kept for backward compatibility but callers should migrate.
###############################################################################
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
###############################################################################
# Helper Functions
###############################################################################
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
wait_for_clean_gpus() {
local timeout=${1:-300}
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -60,12 +28,15 @@ cleanup_docker() {
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
@@ -73,446 +44,201 @@ cleanup_docker() {
fi
}
cleanup_network() {
local max_nodes=${NUM_NODES:-2}
for node in $(seq 0 $((max_nodes - 1))); do
if docker ps -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}" || true
fi
done
if docker network ls | grep -q docker-net; then
docker network rm docker-net || true
fi
}
is_multi_node() {
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
return 0
fi
# Fallback: detect the bracket syntax structurally
# Pattern: [...] && [...] (per-node command arrays)
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
return 0
fi
return 1
}
handle_pytest_exit() {
local exit_code=$1
if [ "$exit_code" -eq 5 ]; then
echo "Pytest exit code 5 (no tests collected) - treating as success."
exit 0
fi
exit "$exit_code"
}
###############################################################################
# Pytest marker/keyword re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around multi-word pytest -m/-k expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
#
# This function detects unquoted expressions after -m/-k and re-quotes them
# by collecting tokens until a recognizable boundary is reached:
# - test path (contains '/')
# - test file (ends with '.py')
# - another pytest flag (--xxx or -x single-char flags)
# - command separator (&& || ; |)
# - environment variable assignment (FOO=bar)
#
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
# unquoted since they have no spaces and work fine.
#
# Already-quoted expressions (containing literal single quotes) are passed
# through untouched to avoid double-quoting values injected by
# apply_rocm_test_overrides.
#
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
# double-quotes stripped by the calling shell (see header comment).
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
###############################################################################
re_quote_pytest_markers() {
local input="$1"
local output=""
local collecting=false
local marker_buf=""
# Strip backslash-newline continuations, then flatten remaining newlines
local flat="${input//$'\\\n'/ }"
flat="${flat//$'\n'/ }"
# Disable globbing to prevent *.py etc. from expanding during read -ra
local restore_glob
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
set -o noglob
local -a words
read -ra words <<< "$flat"
eval "$restore_glob"
for word in "${words[@]}"; do
if $collecting; then
# If the token we're about to collect already contains a literal
# single quote, the expression was already quoted upstream.
# Flush and stop collecting.
if [[ "$word" == *"'"* ]]; then
if [[ -n "$marker_buf" ]]; then
# Should not normally happen (partial buf + quote), flush raw
output+="${marker_buf} "
marker_buf=""
fi
output+="${word} "
collecting=false
continue
fi
local is_boundary=false
case "$word" in
# Line-continuation artifact
"\\")
is_boundary=true ;;
# Command separators
"&&"|"||"|";"|"|")
is_boundary=true ;;
# Long flags (--ignore, --shard-id, etc.)
--*)
is_boundary=true ;;
# Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
# like "not" which don't start with "-". Also skip -k/-m which
# would start a new marker (handled below).
-[a-zA-Z])
is_boundary=true ;;
# Test path (contains /)
*/*)
is_boundary=true ;;
# Test file (ends with .py, possibly with ::method)
*.py|*.py::*)
is_boundary=true ;;
# Environment variable assignment preceding a command (FOO=bar)
*=*)
# Only treat as boundary if it looks like VAR=value, not
# pytest filter expressions like num_gpus=2 inside markers
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
is_boundary=true
fi
;;
esac
if $is_boundary; then
# Strip surrounding double quotes if present (from upstream
# single-to-double conversion); without this, wrapping below
# would produce '"expr"' with literal double-quote characters.
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
# Flush the collected marker expression
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}' "
else
output+="${marker_buf} "
fi
collecting=false
marker_buf=""
# Check if this boundary word itself starts a new -m/-k
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
output+="${word} "
collecting=true
# Drop stray backslash tokens silently
elif [[ "$word" == "\\" ]]; then
:
else
output+="${word} "
fi
else
# Accumulate into marker buffer
if [[ -n "$marker_buf" ]]; then
marker_buf+=" ${word}"
else
marker_buf="${word}"
fi
fi
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
output+="${word} "
collecting=true
marker_buf=""
else
output+="${word} "
fi
done
# Flush any trailing marker expression (marker at end of command)
if $collecting && [[ -n "$marker_buf" ]]; then
# Strip surrounding double quotes (see mid-stream flush comment)
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}'"
else
output+="${marker_buf}"
fi
fi
echo "${output% }"
}
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
apply_rocm_test_overrides() {
local cmds="$1"
# --- Model registry filter ---
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
# --- LoRA: disable custom paged attention ---
if [[ $cmds == *"pytest -v -s lora"* ]]; then
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
# --- Kernel ignores ---
if [[ $cmds == *" kernels/core"* ]]; then
cmds="${cmds} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $cmds == *" kernels/attention"* ]]; then
cmds="${cmds} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $cmds == *" kernels/quantization"* ]]; then
cmds="${cmds} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $cmds == *" kernels/mamba"* ]]; then
cmds="${cmds} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $cmds == *" kernels/moe"* ]]; then
cmds="${cmds} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/chat_completion/test_audio.py \
--ignore=entrypoints/openai/completion/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/llm "* ]]; then
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# Clean up escaped newlines from --ignore appends
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
echo "$cmds"
}
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image ---
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
# ---- Command source selection ----
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
# Fall back to $* for backward compatibility, but warn that inner
# double-quotes will have been stripped by the calling shell.
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
commands="${VLLM_TEST_COMMANDS}"
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
else
commands="$*"
if [[ -z "$commands" ]]; then
echo "Error: No test commands provided." >&2
echo "Usage:" >&2
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
echo " Legacy: bash $0 \"commands here\"" >&2
exit 1
fi
echo "Commands sourced from positional args (legacy mode)"
echo "WARNING: Inner double-quotes in the command string may have been"
echo " stripped by the calling shell. If you see syntax errors, switch to:"
echo " export VLLM_TEST_COMMANDS='your commands here'"
echo " bash $0"
commands=$@
echo "Commands:$commands"
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
echo "Raw commands: $commands"
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
# Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
echo "After re-quoting: $commands"
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
commands=$(apply_rocm_test_overrides "$commands")
echo "Final commands: $commands"
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Verify GPU access
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
# --- RDMA device passthrough (conditional) ---
# If the host has RDMA devices, pass them through so tests like
# test_moriio_connector can access ibverbs. On hosts without RDMA
# hardware the tests will gracefully skip via _rdma_available().
RDMA_FLAGS=""
if [ -d /dev/infiniband ]; then
echo "RDMA devices detected on host, enabling passthrough"
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
else
echo "No RDMA devices found on host, RDMA tests will be skipped"
fi
# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
echo "--- Multi-node job detected"
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
# BASH_REMATCH[1] = prefix (everything before first bracket)
# BASH_REMATCH[2] = comma-separated node0 commands
# BASH_REMATCH[3] = comma-separated node1 commands
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
saved_IFS=$IFS
IFS=','
read -ra node0 <<< "${BASH_REMATCH[2]}"
read -ra node1 <<< "${BASH_REMATCH[3]}"
IFS=$saved_IFS
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in "${PIDS[@]}"; do
wait "${pid}"
STATUS+=($?)
done
at_least_one_shard_with_tests=0
for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
elif [[ ${st} -eq 5 ]]; then
echo "Shard exited with status 5 (no tests collected) - treating as success"
else # This means st is 0
at_least_one_shard_with_tests=1
fi
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done
/bin/bash -c "${composite_command}"
exit_code=$?
cleanup_network
handle_pytest_exit "$exit_code"
else
echo "Multi-node job detected but failed to parse bracket command syntax."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
echo "Got: $commands"
cleanup_network
exit 111
done
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
echo "All shards reported no tests collected. Failing the build."
exit 1
fi
else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
$RDMA_FLAGS \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e BUILDKITE_PARALLEL_JOB \
-e BUILDKITE_PARALLEL_JOB_COUNT \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
exit_code=$?
handle_pytest_exit "$exit_code"
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi

View File

@@ -1,65 +0,0 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce sub-processes for acceleration
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
wait_for_pid_and_check_log() {
local pid="$1"
local log_file="$2"
local exit_status
if [ -z "$pid" ] || [ -z "$log_file" ]; then
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
return 1
fi
echo "Waiting for process $pid to finish..."
# Use the 'wait' command to pause the script until the specific PID exits.
# The 'wait' command's own exit status will be that of the waited-for process.
if wait "$pid"; then
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Success)."
else
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Failure)."
fi
if [ "$exit_status" -ne 0 ]; then
echo "Process exited with a non-zero status."
echo "--- Last few lines of log file: $log_file ---"
tail -n 50 "$log_file"
echo "---------------------------------------------"
return 1 # Indicate failure based on exit status
fi
echo "No errors detected in log file and process exited successfully."
return 0
}
# Test Sky Lake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!
# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!
# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
wait_for_pid_and_check_log $PID_TEST_2 test_2.log

View File

@@ -1,43 +0,0 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_CI_ENV=0
echo "--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--result-dir ./test_results \
--result-filename tp_pp.json \
--save-result \
--endpoint /v1/completions
kill -s SIGTERM $server_pid; wait $server_pid || true
failed_req=$(jq '.failed' ./test_results/tp_pp.json)
if [ "$failed_req" -ne 0 ]; then
echo "Some requests were failed!"
exit 1
fi
echo "--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--result-dir ./test_results \
--result-filename dp_pp.json \
--save-result \
--endpoint /v1/completions
kill -s SIGTERM $server_pid; wait $server_pid || true
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
if [ "$failed_req" -ne 0 ]; then
echo "Some requests were failed!"
exit 1
fi

View File

@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test bash -c "
set -e
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.

View File

@@ -2,19 +2,119 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -euox pipefail
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
IMAGE_NAME="cpu-test-$NUMA_NODE"
TIMEOUT_VAL=$1
TEST_COMMAND=$2
# building the docker image
echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Run AWQ/GPTQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_cpu_wna16.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/lora/test_qwenvl.py"
# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@@ -1,49 +1,21 @@
#!/bin/bash
# This script builds the HPU docker image and runs the offline inference inside the container.
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
#
# vllm-gaudi compatibility pinning:
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
# When upstream vllm changes its API, the plugin may break before it has been updated.
# To handle this, the vllm-gaudi repository maintains a file:
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
# The first line of that file controls what version of vllm is used inside the Docker image:
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
# the test to a known-compatible baseline.
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
set -exuo pipefail
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
VLLM_COMMUNITY_COMMIT=$(curl -s \
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
| head -1 | tr -d '\n')
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
# Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t "${image_name}" -f - .
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
# the current checkout (the Buildkite CI commit) is used unchanged.
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
fi
WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
# install development dependencies (for testing)
@@ -64,20 +36,15 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f "${container_name}" || true; }
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers
echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name="${container_name}" --network=host \
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
-e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-e PT_HPU_LAZY_MODE=1 \
"${image_name}" \
/bin/bash -c '
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
'
hpu-plugin-v1-test-env \
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then

View File

@@ -41,7 +41,6 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
@@ -49,8 +48,9 @@ get_config() {
# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if ! get_config; then
if [ $? -ne 0 ]; then
exit 1
fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
--progress=plain --load -t "${image_name}" -f - .
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns one argument per line: --device, /dev/davinciX, ...
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
printf '%s\n' "--device"
printf '%s\n' "/dev/davinci${dev_idx}"
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
}
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p "${model_cache_dir}"
mkdir -p ${model_cache_dir}
docker run \
"${device_args[@]}" \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v "${model_cache_dir}":/root/.cache/modelscope \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"

View File

@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {
@@ -34,22 +34,19 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'

View File

@@ -1,36 +0,0 @@
#!/bin/bash
set -ex
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
# otherwise they will be cleaned up together with the main "nightly" tags.
TAG_VARIANT="$1"
if [ -n "$TAG_VARIANT" ]; then
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
TAG_NAME="$TAG_VARIANT-nightly"
else
ORIG_TAG_SUFFIX=""
TAG_NAME="nightly"
fi
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest push vllm/vllm-openai:"$TAG_NAME"
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@@ -43,6 +43,7 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
@@ -51,14 +52,13 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \
--trust-remote-code \
--max-model-len 2048 \
--all2all-backend "$BACK" \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,57 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-Lite"
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
vllm serve "$MODEL" \
--max-model-len 2048 \
--offload-group-size 8 \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY
cleanup
SERVER_PID=

View File

@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
vllm serve "$MODEL" \
--enforce-eager \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -18,18 +18,15 @@ wait_for_server() {
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
# Set BACKENDS and platform-specific args based on platform
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
PLATFORM_ARGS=()
fi
cleanup() {
@@ -51,20 +48,19 @@ for BACK in "${BACKENDS[@]}"; do
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,248 +0,0 @@
#!/bin/bash
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
# evaluation against a local vLLM server.
#
# Usage:
# # Run with defaults (gpt-oss-20b, multi_turn)
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Run with gpt-oss-120b and multiple test categories
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# Environment variables (all optional, with defaults):
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
# BFCL_PORT - Server port (default: 8000)
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
# BFCL_EXTRA_ARGS - Additional vLLM server args
set -euo pipefail
# ---- Configuration ----
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
TP_SIZE="${BFCL_TP_SIZE:-1}"
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
PORT="${BFCL_PORT:-8000}"
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
# Set up output directory
if [ -n "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
fi
echo "============================================"
echo "BFCL Tool Call Correctness Evaluation"
echo "============================================"
echo "Model: $MODEL"
echo "Tool parser: $TOOL_CALL_PARSER"
echo "API type: $API_TYPE"
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
echo "Test category: $TEST_CATEGORY"
echo "TP size: $TP_SIZE"
echo "Max model len: $MAX_MODEL_LEN"
echo "Port: $PORT"
echo "Num threads: $NUM_THREADS"
echo "============================================"
# ---- Install bfcl-eval if missing ----
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
echo "Installing bfcl-eval..."
pip install "bfcl-eval>=2025.10.20.1,<2026"
fi
# ---- Cleanup handler ----
SERVER_PID=""
cleanup() {
if [ -n "$SERVER_PID" ]; then
echo "Stopping vLLM server (pid=$SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
# Remove BFCL lock files (created by filelock for thread-safe writes)
rm -rf .file_locks/
if [ -n "${OUTPUT_DIR:-}" ]; then
rm -rf "$OUTPUT_DIR/.file_locks/"
fi
}
trap cleanup EXIT
# ---- Start vLLM server ----
echo "Starting vLLM server..."
SERVE_ARGS=(
"$MODEL"
--port "$PORT"
--enable-auto-tool-choice
--tool-call-parser "$TOOL_CALL_PARSER"
--tensor-parallel-size "$TP_SIZE"
--max-model-len "$MAX_MODEL_LEN"
--enforce-eager
--no-enable-prefix-caching
)
# Append reasoning parser if specified
if [ -n "$REASONING_PARSER" ]; then
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi
# Append any extra args
if [ -n "$EXTRA_ARGS" ]; then
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
fi
echo "Command: vllm serve ${SERVE_ARGS[*]}"
vllm serve "${SERVE_ARGS[@]}" &
SERVER_PID=$!
# ---- Wait for server to be ready ----
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
if [ $SECONDS_WAITED -ge 600 ]; then
echo ""
echo "ERROR: vLLM server failed to start within 600s"
exit 1
fi
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
fi
sleep 2
SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
# ---- Run BFCL evaluation ----
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
# against our local vLLM server.
bfcl_exit_code=0
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
import os
import sys
model = sys.argv[1]
test_category = sys.argv[2]
num_threads = int(sys.argv[3])
port = sys.argv[4]
api_type = sys.argv[5]
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
os.environ["OPENAI_API_KEY"] = "dummy"
os.environ["BFCL_PROJECT_ROOT"] = output_dir
import bfcl_eval.constants.model_config as bfcl_model_config
from bfcl_eval.constants.model_config import ModelConfig
from bfcl_eval.model_handler.api_inference.openai_completion import (
OpenAICompletionsHandler,
)
from bfcl_eval.model_handler.api_inference.openai_response import (
OpenAIResponsesHandler,
)
if api_type == "responses":
handler = OpenAIResponsesHandler
else:
handler = OpenAICompletionsHandler
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
model_name=model,
display_name=f"{model} (FC) (vLLM)",
url=f"https://huggingface.co/{model}",
org="",
license="apache-2.0",
model_handler=handler,
input_price=None,
output_price=None,
is_fc_model=True,
underscore_to_dot=True,
)
from bfcl_eval.__main__ import evaluate, generate
import inspect
import typer
def _get_default_kwargs(function):
kwargs = {}
for k, v in inspect.signature(function).parameters.items():
if v.default is not inspect.Parameter.empty:
default = v.default
if isinstance(default, typer.models.OptionInfo):
default = default.default
kwargs[k] = default
return kwargs
# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs["model"] = [model]
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
gen_kwargs["skip_server_setup"] = True
gen_kwargs["num_threads"] = num_threads
generate(**gen_kwargs)
# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs["model"] = [model]
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
evaluate(**eval_kwargs)
print("=== BFCL evaluation completed successfully ===")
PYEOF
# ---- Upload results to buildkite ----
if command -v buildkite-agent &>/dev/null; then
if [ $bfcl_exit_code -eq 0 ]; then
STYLE="success"
STATUS="PASSED"
else
STYLE="error"
STATUS="FAILED"
fi
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
# $BFCL_PROJECT_ROOT/score/
RESULTS_ROOT="${OUTPUT_DIR:-.}"
if [ -d "$RESULTS_ROOT/result" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
fi
if [ -d "$RESULTS_ROOT/score" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
fi
fi
exit $bfcl_exit_code

View File

@@ -9,11 +9,10 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
# shellcheck source=/dev/null
source "$ENV_FILE"
source $ENV_FILE
remove_docker_container() {
docker rm -f "$CONTAINER_NAME" || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
--env-file "$ENV_FILE" \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
-e MODEL="$MODEL" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name "$CONTAINER_NAME" \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \

View File

@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
vllm serve "$MODEL" \
vllm serve $MODEL \
--seed 42 \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir "$DOWNLOAD_DIR" \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for _ in {1..120}; do
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len "$INPUT_LEN" \
--sonnet-output-len "$OUTPUT_LEN" \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."

View File

@@ -1,227 +0,0 @@
#!/bin/bash
#
# trigger-ci-build.sh
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
#
# Usage: ./trigger-ci-build.sh [options]
#
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
#
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
#
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default configuration
PIPELINE="ci"
DRY_RUN=true
usage() {
cat <<EOF
Usage: $(basename "$0") [options]
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
Options:
--execute Actually trigger the build (default: dry-run)
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
--commit Override commit SHA (default: current HEAD)
--branch Override branch name (default: current branch)
--message Custom build message (default: auto-generated)
--help Show this help message
Prerequisites:
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
- bk configured: bk configure
Examples:
$(basename "$0") # Dry-run, show what would happen
$(basename "$0") --execute # Actually trigger the build
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
EOF
exit 1
}
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[OK]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Parse arguments
COMMIT=""
BRANCH=""
MESSAGE=""
while [[ $# -gt 0 ]]; do
case $1 in
--execute)
DRY_RUN=false
shift
;;
--pipeline)
PIPELINE="$2"
shift 2
;;
--commit)
COMMIT="$2"
shift 2
;;
--branch)
BRANCH="$2"
shift 2
;;
--message)
MESSAGE="$2"
shift 2
;;
--help|-h)
usage
;;
-*)
log_error "Unknown option: $1"
usage
;;
*)
log_error "Unexpected argument: $1"
usage
;;
esac
done
# Check if bk CLI is installed
if ! command -v bk &>/dev/null; then
log_error "Buildkite CLI (bk) is not installed"
echo ""
echo "Install with:"
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
echo ""
echo "Then configure:"
echo " bk configure"
exit 1
fi
# Check if we're in a git repository
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
log_error "Not in a git repository"
exit 1
fi
# Get current commit and branch if not overridden
if [[ -z "$COMMIT" ]]; then
COMMIT=$(git rev-parse HEAD)
fi
if [[ -z "$BRANCH" ]]; then
BRANCH=$(git branch --show-current)
if [[ -z "$BRANCH" ]]; then
# Detached HEAD state - try to get branch from ref
BRANCH=$(git rev-parse --abbrev-ref HEAD)
fi
fi
# Generate default message if not provided
if [[ -z "$MESSAGE" ]]; then
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
MESSAGE="[Manual] ${COMMIT_MSG}"
fi
# Safety check: Verify the commit exists on the remote
log_info "Verifying commit exists on remote..."
git fetch origin --quiet 2>/dev/null || true
# Check if commit is reachable from any remote branch
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
if [[ -z "$REMOTE_BRANCHES" ]]; then
log_error "Commit ${COMMIT} does not exist on any remote branch!"
echo ""
echo "The CI system will fail to checkout this commit."
echo "Please push your changes first:"
echo ""
echo " git push origin ${BRANCH}"
echo ""
exit 1
fi
log_success "Commit found on remote branches:"
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
echo " ... and more"
fi
echo ""
log_info "Pipeline: ${PIPELINE}"
log_info "Branch: ${BRANCH}"
log_info "Commit: ${COMMIT}"
log_info "Message: ${MESSAGE}"
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
echo ""
# Build the command
CMD=(bk build create
-y
-w
-i
--pipeline "${PIPELINE}"
--commit "${COMMIT}"
--branch "${BRANCH}"
--message "${MESSAGE}"
--env "RUN_ALL=1"
--env "NIGHTLY=1"
)
if [[ "$DRY_RUN" == true ]]; then
echo "=========================================="
log_warn "DRY-RUN MODE - No build will be triggered"
echo "=========================================="
echo ""
echo "Command that would be executed:"
echo ""
# Escape single quotes in values for safe shell display
escape_for_shell() {
printf '%s' "$1" | sed "s/'/'\\\\''/g"
}
echo " bk build create \\"
echo " -y \\"
echo " -w \\"
echo " -i \\"
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
echo " --env 'RUN_ALL=1' \\"
echo " --env 'NIGHTLY=1'"
echo ""
echo "=========================================="
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
echo ""
echo " $0 --execute"
echo "=========================================="
exit 0
fi
log_info "Triggering build..."
# Execute the command - bk will print the URL and open browser
"${CMD[@]}"

View File

@@ -72,19 +72,20 @@ obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indices for all existing wheels
# call script to generate indicies for all existing wheels
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
alias_args=()
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
alias_arg=""
fi
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/"
rm -rf "${INDICES_OUTPUT_DIR:?}/*"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@@ -1,73 +0,0 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION"
if [[ -z "$GIT_VERSION" ]]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
# check pypi token
if [[ -z "$PYPI_TOKEN" ]]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
set -x # avoid printing secrets above
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source distribution using setup.py
python setup.py sdist --dist-dir=$DIST_DIR
ls -la $DIST_DIR
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
echo "Found sdist: $SDIST_FILE"
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
echo "Wheels and source distribution uploaded to PyPI"

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl

View File

@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -4,10 +4,8 @@ depends_on:
steps:
- label: V1 attention (H100)
timeout_in_minutes: 30
device: h100
gpu: h100
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
@@ -15,11 +13,9 @@ steps:
- label: V1 attention (B200)
timeout_in_minutes: 30
device: b200
gpu: b200
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this

View File

@@ -17,15 +17,3 @@ steps:
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- label: Attention Benchmarks Smoke Test (B200)
device: b200
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1

View File

@@ -2,210 +2,56 @@ group: Compile
depends_on:
- image-build
steps:
- label: Sequence Parallel Correctness Tests (2 GPUs)
timeout_in_minutes: 50
- label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
num_devices: 2
source_file_dependencies:
- vllm/model_executor/layers/
- vllm/compilation/
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- tests/compile/correctness_e2e/test_sequence_parallel.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: Sequence Parallel Correctness Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: AsyncTP Correctness Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: AsyncTP Correctness Tests (B200)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: b200
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- vllm/compilation/
- vllm/model_executor/layers
- tests/compile/passes/distributed/
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -s -v tests/compile/passes/distributed
- label: Fusion and Compile Unit Tests (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
- nvidia-smi
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_devices=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
# TODO(luka) move to H100 once pass tests run on H100
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Fusion E2E Quick (H100)
timeout_in_minutes: 15
- label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
- label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
- label: Fusion E2E Config Sweep (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: b200
num_devices: 1
gpu: b200
optional: true
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
num_gpus: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
- label: Fusion E2E TP2 (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"

View File

@@ -9,7 +9,6 @@ steps:
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- pytest -v -s cuda/test_platform_no_cuda_init.py
- label: Cudagraph
timeout_in_minutes: 20

View File

@@ -5,7 +5,7 @@ steps:
- label: Distributed Comm Ops
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
@@ -16,9 +16,9 @@ steps:
- pytest -v -s distributed/test_shm_storage.py
- label: Distributed (2 GPUs)
timeout_in_minutes: 60
timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
@@ -47,21 +47,26 @@ steps:
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
@@ -79,27 +84,6 @@ steps:
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_utils
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -107,32 +91,20 @@ steps:
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- label: Distributed Compile + Comm (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
device: h100
num_devices: 8
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
@@ -148,9 +120,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100)
device: a100
gpu: a100
optional: true
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
@@ -161,23 +133,26 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H100)
timeout_in_minutes: 15
device: h100
- label: Distributed Tests (2 GPUs)(H200)
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_devices: 2
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
device: b200
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_devices: 2
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -186,10 +161,8 @@ steps:
- label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
num_nodes: 2
no_plugin: true
optional: true # TODO: revert once infra issue solved
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -198,12 +171,12 @@ steps:
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
@@ -211,45 +184,10 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
timeout_in_minutes: 30
device: a100
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- vllm/v1/worker/kv_connector_model_runner_mixin.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -258,4 +196,4 @@ steps:
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- pytest -v -s distributed/test_pipeline_parallel.py

View File

@@ -4,36 +4,39 @@ depends_on:
steps:
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
device: h100
gpu: h100
optional: true
num_devices: 4
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
device: h100
gpu: h100
optional: true
num_devices: 4
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
device: b200
gpu: b200
optional: true
num_devices: 2
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
timeout_in_minutes: 60
device: h100
- label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30
optional: true
num_devices: 1
soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
- bash .buildkite/scripts/run-prime-rl-test.sh

View File

@@ -1,5 +1,5 @@
group: Engine
depends_on:
depends_on:
- image-build
steps:
- label: Engine
@@ -14,59 +14,13 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: Engine (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/engine/
- tests/v1/engine/
commands:
- pytest -v -s v1/engine/test_preprocess_error_handling.py
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
optional: true
num_devices: 2
- label: V1 e2e + engine
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/v1/e2e
- tests/v1
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
depends_on:
- image-build-amd
- label: V1 e2e (4 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
optional: true
num_devices: 4
source_file_dependencies:
- vllm/
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4
depends_on:
- image-build-amd
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine

View File

@@ -34,26 +34,23 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/rpc
- tests/entrypoints/instrumentator
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- label: Entrypoints Integration (Pooling)
@@ -82,11 +79,6 @@ steps:
- tests/v1
commands:
- pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness
timeout_in_minutes: 30

View File

@@ -14,25 +14,10 @@ steps:
- label: EPLB Execution
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py
- label: Elastic EP Scaling Test
timeout_in_minutes: 20
device: b200
optional: true
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/compilation/
- tests/distributed/
commands:
- pytest -v -s distributed/test_elastic_ep.py
- pytest -v -s distributed/test_eplb_spec_decode.py

View File

@@ -8,17 +8,15 @@ steps:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -45,8 +43,7 @@ steps:
- vllm/envs.py
- vllm/config
commands:
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Mamba Test
@@ -60,8 +57,8 @@ steps:
- label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45
device: h100
num_devices: 1
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
@@ -72,7 +69,7 @@ steps:
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
@@ -80,7 +77,7 @@ steps:
- label: Kernels (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: b200
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
@@ -88,7 +85,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -97,7 +94,7 @@ steps:
- vllm/platforms/cuda.py
commands:
- nvidia-smi
- python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
@@ -117,54 +114,4 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Kernels Helion Test
timeout_in_minutes: 30
device: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
device: h100
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
device: h100
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

View File

@@ -11,22 +11,22 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
# - label: LM Eval Large Models (4 GPUs)(A100)
# device: a100
# optional: true
# num_devices: 4
# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
# source_file_dependencies:
# - csrc/
# - vllm/model_executor/layers/quantization
# commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100)
device: h100
gpu: h100
optional: true
num_devices: 4
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
@@ -37,65 +37,10 @@ steps:
- label: LM Eval Small Models (B200)
timeout_in_minutes: 120
device: b200
gpu: b200
optional: true
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200
optional: true
num_devices: 8
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
device: h100
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
- label: GPQA Eval (GPT-OSS) (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt

View File

@@ -14,7 +14,7 @@ steps:
- label: LoRA TP (Distributed)
timeout_in_minutes: 30
num_devices: 4
num_gpus: 4
source_file_dependencies:
- vllm/lora
- tests/lora

View File

@@ -9,7 +9,6 @@ steps:
- tests/v1
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
@@ -17,8 +16,7 @@ steps:
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
# TODO: create another `optional` test group for slow tests
- pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
@@ -27,19 +25,13 @@ steps:
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: V1 Others (CPU)
depends_on:
- image-build-cpu
depends_on: ~
source_file_dependencies:
- vllm/
- tests/v1
device: cpu
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
@@ -67,20 +59,19 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
# for basic
- python3 basic/offline_inference/chat.py
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py
- python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
- python3 offline_inference/basic/chat.py # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
@@ -91,7 +82,7 @@ steps:
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/v1/tracing
@@ -116,48 +107,53 @@ steps:
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/detokenizer
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s detokenizer
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on:
- image-build-cpu
depends_on: ~
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/test_pooling_params.py
- tests/test_ray_env.py
- tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
device: cpu
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s test_ray_env.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100)
timeout_in_minutes: 25
device: h100
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
@@ -166,18 +162,4 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: Acceptance Length Test (Large Models) # optional
timeout_in_minutes: 25
gpu: h100
optional: true
num_gpus: 1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/model_executor/models/mlp_speculator.py
- tests/v1/spec_decode/test_acceptance_length.py
commands:
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

View File

@@ -9,9 +9,9 @@ steps:
- vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

View File

@@ -1,110 +0,0 @@
group: Model Runner V2
depends_on:
- image-build
steps:
- label: Model Runner V2 Core Tests
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- vllm/v1/core/sched/
- vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/
- tests/v1/entrypoints/llm/test_struct_output_generate.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/core/sched/
- vllm/v1/worker/gpu_worker.py
- examples/offline_inference/
- examples/basic/offline_inference/
- examples/pooling/embed/vision_embedding_offline.py
- examples/others/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pip install tensorizer # for tensorizer test
- python3 basic/offline_inference/chat.py # for basic
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
#- python3 basic/offline_inference/embed.py # TODO
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Model Runner V2 Distributed (2 GPUs)
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/basic_correctness/test_basic_correctness.py
- tests/v1/distributed/test_async_llm_dp.py
- tests/v1/distributed/test_eagle_dp.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/distributed/test_pipeline_parallel.py
#- tests/distributed/test_pp_cudagraph.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
- label: Model Runner V2 Spec Decode
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"

View File

@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -31,27 +33,18 @@ steps:
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/test_terratorch.py
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
depends_on:
- image-build-cpu
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/models/test_utils.py
- tests/models/test_vision.py
device: cpu
no_gpu: true
commands:
- pytest -v -s models/test_utils.py models/test_vision.py
@@ -65,7 +58,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@@ -5,7 +5,7 @@ steps:
- label: Distributed Model Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/

View File

@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -37,7 +40,7 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -52,21 +56,13 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -76,20 +72,17 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/

View File

@@ -2,75 +2,23 @@ group: Models - Multimodal
depends_on:
- image-build
steps:
- label: "Multi-Modal Models (Standard) 1: qwen2"
timeout_in_minutes: 45
- label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 80
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 4: other + whisper"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Processor Test (CPU)
depends_on:
- image-build-cpu
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
- tests/models/registry.py
device: cpu
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
@@ -80,7 +28,6 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/multimodal
- tests/models/registry.py
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
@@ -103,11 +50,6 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Models (Extended) 2
optional: true
@@ -126,3 +68,12 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

View File

@@ -5,7 +5,7 @@ steps:
- label: Plugin Tests (2 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
@@ -15,17 +15,10 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# begin io_processor plugins test
# test generic io_processor plugins functions
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
# test Terratorch io_processor plugins
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# test bge_m3_sparse io_processor plugin
- pip install -e ./plugins/bge_m3_sparse_plugin
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
- pip uninstall bge_m3_sparse_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -36,6 +29,6 @@ steps:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

View File

@@ -3,7 +3,7 @@ depends_on:
- image-build
steps:
- label: PyTorch Compilation Unit Tests
timeout_in_minutes: 10
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
@@ -17,16 +17,8 @@ steps:
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Compilation Passes Unit Tests
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/compile/passes
commands:
- pytest -s -v compile/passes --ignore compile/passes/distributed
- label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 35
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
@@ -38,13 +30,16 @@ steps:
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph
timeout_in_minutes: 30
timeout_in_minutes: 40
source_file_dependencies:
- vllm/
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some

View File

@@ -16,14 +16,14 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: Quantized MoE Test (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: b200
gpu: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py

View File

@@ -1,16 +0,0 @@
group: Ray Compatibility
depends_on:
- image-build
steps:
- label: Ray Dependency Compatibility Check
# Informational only — does not block the pipeline.
# If this fails, it means the PR introduces a dependency that
# conflicts with Ray's dependency constraints.
# See https://github.com/vllm-project/vllm/issues/33599
soft_fail: true
timeout_in_minutes: 10
source_file_dependencies:
- requirements/
- setup.py
commands:
- bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh

View File

@@ -12,10 +12,3 @@ steps:
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s samplers

View File

@@ -1,40 +0,0 @@
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"

View File

@@ -5,7 +5,7 @@ steps:
- label: Weight Loading Multiple GPU # 33min
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
@@ -13,13 +13,13 @@ steps:
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
# - label: Weight Loading Multiple GPU - Large Models # optional
# working_dir: "/vllm-workspace/tests"
# num_devices: 2
# device: a100
# optional: true
# source_file_dependencies:
# - vllm/
# - tests/weight_loading
# commands:
# - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: Weight Loading Multiple GPU - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

24
.github/.bc-linter.yml vendored Normal file
View File

@@ -0,0 +1,24 @@
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version: 1
paths:
# We temporarily disable globally, and will only enable with `annotations.include`
# include:
# - "vllm/v1/attetion/*.py"
# - "vllm/v1/core/*.py"
exclude:
- "**/*.py"
scan:
functions: true # check free functions and methods
classes: true # check classes/dataclasses
public_only: true # ignore names starting with "_" at any level
annotations:
include: # decorators that forceinclude a symbol
- name: "bc_linter_include" # matched by simple name or dotted suffix
propagate_to_members: false # for classes, include methods/inner classes
exclude: # decorators that forceexclude a symbol
- name: "bc_linter_skip" # matched by simple name or dotted suffix
propagate_to_members: true # for classes, exclude methods/inner classes
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]

63
.github/CODEOWNERS vendored
View File

@@ -2,66 +2,43 @@
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
/vllm/lora @jeejeelee
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @heheda12345
# Entrypoints
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
/vllm/entrypoints/mcp @heheda12345
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
/vllm/entrypoints/openai/realtime @njhill
/vllm/entrypoints/openai/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Input/Output Processing
/vllm/sampling_params.py @njhill @NickLucche
/vllm/pooling_params.py @noooop @DarkLight1337
/vllm/tokenizers @DarkLight1337 @njhill
/vllm/renderers @DarkLight1337 @njhill
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson @MatthewBonanni
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/engine @njhill
/vllm/v1/executor @njhill
/vllm/v1/worker @njhill
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
/vllm/v1/offloading @ApostaC
# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon @njhill
/vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/gpu @WoosukKwon
# Test ownership
/.buildkite/lm-eval-harness @mgoin
@@ -77,13 +54,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC @orozery
/tests/v1/kv_offload @ApostaC @orozery
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
/tests/v1/determinism @yewentao256
# Transformers modeling backend
@@ -136,8 +113,8 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/tokenizers/mistral.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -173,7 +150,9 @@ mkdocs.yaml @hmellor
/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler @noooop
# Security guide and policies

28
.github/mergify.yml vendored
View File

@@ -3,7 +3,6 @@ pull_request_rules:
description: Automatically apply documentation label
conditions:
- label != stale
- -closed
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -27,7 +26,7 @@ pull_request_rules:
Hi @{{author}}, the pre-commit checks have failed. Please run:
```bash
uv pip install pre-commit>=4.5.1
uv pip install pre-commit
pre-commit install
pre-commit run --all-files
```
@@ -38,13 +37,15 @@ pull_request_rules:
> [!TIP]
> <details>
> <summary>Is <code>mypy</code> failing?</summary>
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
> <br/>
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
>
> ```bash
> # For mypy (substitute "3.10" with the failing version if needed)
> pre-commit run --hook-stage manual mypy-3.10
> # For markdownlint
> pre-commit run --hook-stage manual markdownlint
> ```
> </details>
@@ -258,7 +259,8 @@ pull_request_rules:
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/structured_outputs/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
@@ -334,7 +336,7 @@ pull_request_rules:
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*
@@ -381,7 +383,7 @@ pull_request_rules:
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/model_executor/model_loader/tensorizer_loader/
actions:
assign:
@@ -412,18 +414,6 @@ pull_request_rules:
remove:
- needs-rebase
- name: label-bug
description: Automatically apply bug label
conditions:
- label != stale
- or:
- title~=(?i)\bbug\b
- title~=(?i)\bbugfix\b
actions:
label:
add:
- bug
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:

29
.github/workflows/bc-lint.yml vendored Normal file
View File

@@ -0,0 +1,29 @@
name: BC Lint
on:
pull_request:
types:
- opened
- synchronize
- reopened
- labeled
- unlabeled
jobs:
bc_lint:
if: github.repository_owner == 'vllm-project'
runs-on: ubuntu-latest
steps:
- name: Run BC Lint Action
uses: pytorch/test-infra/.github/actions/bc-lint@main
with:
repo: ${{ github.event.pull_request.head.repo.full_name }}
base_sha: ${{ github.event.pull_request.base.sha }}
head_sha: ${{ github.event.pull_request.head.sha }}
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
config_dir: .github
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true

Some files were not shown because too many files have changed in this diff Show More