Compare commits
9 Commits
main
...
v0.13.0rc3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f124b56786 | ||
|
|
d78e128b8b | ||
|
|
761b730dcb | ||
|
|
f34eca5f01 | ||
|
|
4cd332f3cf | ||
|
|
16484d394c | ||
|
|
e397bd6592 | ||
|
|
6a88d590bb | ||
|
|
ad8c073131 |
@@ -1,8 +1,7 @@
|
||||
name: vllm_ci
|
||||
job_dirs:
|
||||
- ".buildkite/image_build"
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/hardware_tests"
|
||||
- ".buildkite/image_build"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
name: vllm_intel_ci
|
||||
job_dirs:
|
||||
- ".buildkite/intel_jobs"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
- "requirements/common.txt"
|
||||
- "requirements/xpu.txt"
|
||||
- "requirements/build.txt"
|
||||
- "requirements/test.txt"
|
||||
- "setup.py"
|
||||
- "csrc/"
|
||||
- "cmake/"
|
||||
run_all_exclude_patterns:
|
||||
- "docker/Dockerfile."
|
||||
- "csrc/cpu/"
|
||||
- "csrc/rocm/"
|
||||
- "cmake/hipify.py"
|
||||
- "cmake/cpu_extension.cmake"
|
||||
registries: public.ecr.aws/q9t5s3a7
|
||||
repositories:
|
||||
main: "vllm-ci-test-repo"
|
||||
premerge: "vllm-ci-test-repo"
|
||||
@@ -1,22 +0,0 @@
|
||||
group: Hardware - AMD Build
|
||||
steps:
|
||||
- label: "AMD: :docker: build image"
|
||||
key: image-build-amd
|
||||
depends_on: []
|
||||
device: amd_cpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- >
|
||||
docker build
|
||||
--build-arg max_jobs=16
|
||||
--build-arg REMOTE_VLLM=1
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
|
||||
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
||||
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
-f docker/Dockerfile.rocm
|
||||
--target test
|
||||
--no-cache
|
||||
--progress plain .
|
||||
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
@@ -1,10 +0,0 @@
|
||||
group: Hardware
|
||||
depends_on: ~
|
||||
steps:
|
||||
- label: "Ascend NPU Test"
|
||||
soft_fail: true
|
||||
timeout_in_minutes: 20
|
||||
no_plugin: true
|
||||
device: ascend_npu
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh
|
||||
@@ -1,110 +0,0 @@
|
||||
group: CPU
|
||||
depends_on: []
|
||||
steps:
|
||||
- label: CPU-Kernel Tests
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/
|
||||
- cmake/cpu_extension.cmake
|
||||
- CMakeLists.txt
|
||||
- vllm/_custom_ops.py
|
||||
- tests/kernels/attention/test_cpu_attn.py
|
||||
- tests/kernels/moe/test_cpu_fused_moe.py
|
||||
- tests/kernels/test_onednn.py
|
||||
- tests/kernels/test_awq_int4_to_int8.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py
|
||||
pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py"
|
||||
|
||||
- label: CPU-Compatibility Tests
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- cmake/cpu_extension.cmake
|
||||
- setup.py
|
||||
- vllm/platforms/cpu.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
|
||||
|
||||
- label: CPU-Language Generation and Pooling Model Tests
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/
|
||||
- vllm/
|
||||
- tests/models/language/generation/
|
||||
- tests/models/language/pooling/
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model"
|
||||
|
||||
- label: CPU-Quantization Model Tests
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/
|
||||
- vllm/model_executor/layers/quantization/cpu_wna16.py
|
||||
- vllm/model_executor/layers/quantization/gptq_marlin.py
|
||||
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
|
||||
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
|
||||
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
|
||||
- tests/quantization/test_compressed_tensors.py
|
||||
- tests/quantization/test_cpu_wna16.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
|
||||
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
|
||||
|
||||
- label: CPU-Distributed Tests
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/shm.cpp
|
||||
- vllm/v1/worker/cpu_worker.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- vllm/v1/worker/cpu_model_runner.py
|
||||
- vllm/v1/worker/gpu_model_runner.py
|
||||
- vllm/platforms/cpu.py
|
||||
- vllm/distributed/parallel_state.py
|
||||
- vllm/distributed/device_communicators/cpu_communicator.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
|
||||
|
||||
- label: CPU-Multi-Modal Model Tests %N
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
# - vllm/
|
||||
- vllm/model_executor/layers/rotary_embedding
|
||||
- tests/models/multimodal/generation/
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
|
||||
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
|
||||
parallelism: 2
|
||||
|
||||
- label: "Arm CPU Test"
|
||||
depends_on: []
|
||||
soft_fail: false
|
||||
device: arm_cpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
|
||||
@@ -1,10 +0,0 @@
|
||||
group: Hardware
|
||||
steps:
|
||||
- label: "GH200 Test"
|
||||
soft_fail: true
|
||||
device: gh200
|
||||
no_plugin: true
|
||||
optional: true
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
|
||||
@@ -1,17 +0,0 @@
|
||||
group: Hardware
|
||||
depends_on: ~
|
||||
steps:
|
||||
- label: "Intel HPU Test"
|
||||
soft_fail: true
|
||||
device: intel_hpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
|
||||
|
||||
- label: "Intel GPU Test"
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_gpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
|
||||
@@ -1,255 +1,56 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
set -e
|
||||
|
||||
# replace invalid characters in Docker image tags and truncate to 128 chars
|
||||
clean_docker_tag() {
|
||||
local input="$1"
|
||||
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
|
||||
}
|
||||
|
||||
print_usage_and_exit() {
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
|
||||
exit 1
|
||||
}
|
||||
|
||||
print_instance_info() {
|
||||
echo ""
|
||||
echo "=== Debug: Instance Information ==="
|
||||
# Get IMDSv2 token
|
||||
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
|
||||
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
|
||||
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
|
||||
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
|
||||
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
|
||||
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
|
||||
echo "AMI ID: ${AMI_ID}"
|
||||
echo "Instance Type: ${INSTANCE_TYPE}"
|
||||
echo "Instance ID: ${INSTANCE_ID}"
|
||||
echo "AZ: ${AZ}"
|
||||
else
|
||||
echo "Not running on EC2 or IMDS not available"
|
||||
fi
|
||||
# Check for warm cache AMI (marker file baked into custom AMI)
|
||||
if [[ -f /etc/vllm-ami-info ]]; then
|
||||
echo "Cache: warm (custom vLLM AMI)"
|
||||
cat /etc/vllm-ami-info
|
||||
else
|
||||
echo "Cache: cold (standard AMI)"
|
||||
fi
|
||||
echo "==================================="
|
||||
echo ""
|
||||
}
|
||||
|
||||
setup_buildx_builder() {
|
||||
echo "--- :buildkite: Setting up buildx builder"
|
||||
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
|
||||
# Custom AMI with standalone buildkitd - use remote driver for warm cache
|
||||
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
|
||||
echo "Using remote driver to connect to buildkitd (warm cache available)"
|
||||
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
|
||||
echo "Using existing baked-vllm-builder"
|
||||
docker buildx use baked-vllm-builder
|
||||
else
|
||||
echo "Creating baked-vllm-builder with remote driver"
|
||||
docker buildx create \
|
||||
--name baked-vllm-builder \
|
||||
--driver remote \
|
||||
--use \
|
||||
"unix://${BUILDKIT_SOCKET}"
|
||||
fi
|
||||
docker buildx inspect --bootstrap
|
||||
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
|
||||
# Existing builder available
|
||||
echo "Using existing builder: ${BUILDER_NAME}"
|
||||
docker buildx use "${BUILDER_NAME}"
|
||||
docker buildx inspect --bootstrap
|
||||
else
|
||||
# No local buildkitd, no existing builder - create new docker-container builder
|
||||
echo "No local buildkitd found, using docker-container driver"
|
||||
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
fi
|
||||
|
||||
# builder info
|
||||
echo "Active builder:"
|
||||
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
|
||||
}
|
||||
|
||||
check_and_skip_if_image_exists() {
|
||||
if [[ -n "${IMAGE_TAG:-}" ]]; then
|
||||
echo "--- :mag: Checking if image exists"
|
||||
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
|
||||
echo "Image already exists: ${IMAGE_TAG}"
|
||||
echo "Skipping build"
|
||||
exit 0
|
||||
fi
|
||||
echo "Image not found, proceeding with build"
|
||||
fi
|
||||
}
|
||||
|
||||
ecr_login() {
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
}
|
||||
|
||||
prepare_cache_tags() {
|
||||
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
|
||||
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
|
||||
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
|
||||
|
||||
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
||||
cache="${MAIN_CACHE_ECR}:latest"
|
||||
else
|
||||
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
|
||||
cache="${TEST_CACHE_ECR}:${clean_branch}"
|
||||
fi
|
||||
CACHE_TO="$cache"
|
||||
CACHE_FROM="$cache"
|
||||
CACHE_FROM_BASE_BRANCH="$cache"
|
||||
else
|
||||
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
|
||||
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
|
||||
else
|
||||
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
|
||||
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
|
||||
fi
|
||||
fi
|
||||
|
||||
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
|
||||
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
|
||||
}
|
||||
|
||||
resolve_parent_commit() {
|
||||
if [[ -z "${PARENT_COMMIT:-}" ]]; then
|
||||
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
|
||||
if [[ -n "${PARENT_COMMIT}" ]]; then
|
||||
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
|
||||
export PARENT_COMMIT
|
||||
else
|
||||
echo "Could not determine parent commit (may be first commit in repo)"
|
||||
fi
|
||||
else
|
||||
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
|
||||
fi
|
||||
}
|
||||
|
||||
print_bake_config() {
|
||||
echo "--- :page_facing_up: Resolved bake configuration"
|
||||
# Write to a temp directory to avoid polluting the repo root (which is the
|
||||
# Docker build context). Files left in the repo root get COPY'd into the
|
||||
# image and can cause duplicate artifact uploads from downstream steps.
|
||||
local bake_tmp
|
||||
bake_tmp="$(mktemp -d)"
|
||||
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
|
||||
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
|
||||
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
|
||||
echo "--- :arrow_down: Uploading bake config to Buildkite"
|
||||
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
|
||||
}
|
||||
|
||||
#################################
|
||||
# Main Script #
|
||||
#################################
|
||||
print_instance_info
|
||||
|
||||
if [[ $# -lt 5 ]]; then
|
||||
print_usage_and_exit
|
||||
if [[ $# -lt 8 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# input args
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
BRANCH=$4
|
||||
IMAGE_TAG=$5
|
||||
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
|
||||
VLLM_USE_PRECOMPILED=$5
|
||||
VLLM_MERGE_BASE_COMMIT=$6
|
||||
CACHE_FROM=$7
|
||||
CACHE_TO=$8
|
||||
|
||||
# build config
|
||||
TARGET="test-ci"
|
||||
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
|
||||
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
|
||||
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
|
||||
CI_HCL_PATH="/tmp/ci.hcl"
|
||||
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
prepare_cache_tags
|
||||
ecr_login
|
||||
# docker buildx
|
||||
docker buildx create --name vllm-builder --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
docker buildx ls
|
||||
|
||||
# Environment info (for docs and human readers)
|
||||
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
|
||||
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
|
||||
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
|
||||
#
|
||||
# Build configuration (exported as environment variables for bake):
|
||||
export BUILDKITE_COMMIT
|
||||
export PARENT_COMMIT
|
||||
export IMAGE_TAG
|
||||
export IMAGE_TAG_LATEST
|
||||
export CACHE_FROM
|
||||
export CACHE_FROM_BASE_BRANCH
|
||||
export CACHE_FROM_MAIN
|
||||
export CACHE_TO
|
||||
|
||||
# print args
|
||||
echo "--- :mag: Arguments"
|
||||
echo "REGISTRY: ${REGISTRY}"
|
||||
echo "REPO: ${REPO}"
|
||||
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
|
||||
echo "BRANCH: ${BRANCH}"
|
||||
echo "IMAGE_TAG: ${IMAGE_TAG}"
|
||||
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
|
||||
|
||||
# print build configuration
|
||||
echo "--- :mag: Build configuration"
|
||||
echo "TARGET: ${TARGET}"
|
||||
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||
echo "BUILDER_NAME: ${BUILDER_NAME}"
|
||||
echo "CI_HCL_URL: ${CI_HCL_URL}"
|
||||
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
|
||||
|
||||
echo "--- :mag: Cache tags"
|
||||
echo "CACHE_TO: ${CACHE_TO}"
|
||||
echo "CACHE_FROM: ${CACHE_FROM}"
|
||||
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
|
||||
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
|
||||
|
||||
check_and_skip_if_image_exists
|
||||
|
||||
echo "--- :docker: Setting up Docker buildx bake"
|
||||
echo "Target: ${TARGET}"
|
||||
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||
echo "CI HCL path: ${CI_HCL_PATH}"
|
||||
|
||||
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
|
||||
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
|
||||
echo "Make sure you're running from the vLLM repository root"
|
||||
exit 1
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "--- :arrow_down: Downloading ci.hcl"
|
||||
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
|
||||
echo "Downloaded to ${CI_HCL_PATH}"
|
||||
|
||||
if [[ ! -f "${CI_HCL_PATH}" ]]; then
|
||||
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
|
||||
exit 1
|
||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
||||
else
|
||||
merge_base_commit_build_args=""
|
||||
fi
|
||||
|
||||
setup_buildx_builder
|
||||
|
||||
resolve_parent_commit
|
||||
export PARENT_COMMIT
|
||||
|
||||
print_bake_config
|
||||
|
||||
echo "--- :docker: Building ${TARGET}"
|
||||
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
|
||||
|
||||
echo "--- :white_check_mark: Build complete"
|
||||
# build
|
||||
docker buildx build --file docker/Dockerfile \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
||||
${merge_base_commit_build_args} \
|
||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
||||
--push \
|
||||
--target test \
|
||||
--progress plain .
|
||||
|
||||
@@ -3,9 +3,8 @@ steps:
|
||||
- label: ":docker: Build image"
|
||||
key: image-build
|
||||
depends_on: []
|
||||
timeout_in_minutes: 600
|
||||
commands:
|
||||
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
|
||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
@@ -41,7 +40,7 @@ steps:
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
|
||||
- label: ":docker: Build CPU arm64 image"
|
||||
key: cpu-arm64-image-build
|
||||
depends_on: []
|
||||
|
||||
@@ -11,10 +11,10 @@ REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
@@ -24,11 +24,13 @@ fi
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--build-arg VLLM_CPU_X86=true \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||
--build-arg VLLM_CPU_AMXBF16=true \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
|
||||
@@ -11,10 +11,10 @@ REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
@@ -24,10 +24,10 @@ fi
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
|
||||
@@ -11,10 +11,10 @@ REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
@@ -25,10 +25,10 @@ fi
|
||||
docker build \
|
||||
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
|
||||
--progress plain \
|
||||
https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
# push
|
||||
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# skip build if image already exists
|
||||
if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build \
|
||||
--file docker/Dockerfile.xpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu
|
||||
@@ -1,64 +0,0 @@
|
||||
group: Intel
|
||||
steps:
|
||||
- label: ":docker: Build XPU image"
|
||||
soft_fail: true
|
||||
depends_on: []
|
||||
key: image-build-xpu
|
||||
commands:
|
||||
- bash -lc '.buildkite/image_build/image_build_xpu.sh "public.ecr.aws/q9t5s3a7" "vllm-ci-test-repo" "$BUILDKITE_COMMIT"'
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
- label: "XPU example Test"
|
||||
depends_on:
|
||||
- image-build-xpu
|
||||
timeout_in_minutes: 30
|
||||
device: intel_gpu
|
||||
no_plugin: true
|
||||
env:
|
||||
REGISTRY: "public.ecr.aws/q9t5s3a7"
|
||||
REPO: "vllm-ci-test-repo"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/intel_jobs/test-intel.yaml
|
||||
commands:
|
||||
- >-
|
||||
bash .buildkite/scripts/hardware_ci/run-intel-test.sh
|
||||
'pip install tblib==3.1.0 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN &&
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 &&
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel'
|
||||
- label: "XPU V1 test"
|
||||
depends_on:
|
||||
- image-build-xpu
|
||||
timeout_in_minutes: 30
|
||||
device: intel_gpu
|
||||
no_plugin: true
|
||||
env:
|
||||
REGISTRY: "public.ecr.aws/q9t5s3a7"
|
||||
REPO: "vllm-ci-test-repo"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/intel_jobs/test-intel.yaml
|
||||
commands:
|
||||
- >-
|
||||
bash .buildkite/scripts/hardware_ci/run-intel-test.sh
|
||||
'cd tests &&
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py &&
|
||||
pytest -v -s v1/engine --ignore=v1/engine/test_output_processor.py &&
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py -k "not test_topk_only and not test_topp_only and not test_topk_and_topp" &&
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py &&
|
||||
pytest -v -s v1/structured_output &&
|
||||
pytest -v -s v1/test_serial_utils.py &&
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py'
|
||||
@@ -1,9 +1,6 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
|
||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
required_gpu_arch:
|
||||
- gfx942
|
||||
- gfx950
|
||||
tasks:
|
||||
- name: "mmlu_pro"
|
||||
metrics:
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.695
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.447
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
max_model_len: 262144
|
||||
enforce_eager: false
|
||||
apply_chat_template: true
|
||||
fewshot_as_multiturn: true
|
||||
trust_remote_code: true
|
||||
@@ -1,19 +0,0 @@
|
||||
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.7142
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.4579
|
||||
env_vars:
|
||||
VLLM_USE_FLASHINFER_MOE_FP8: "1"
|
||||
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
max_model_len: 262144
|
||||
kv_cache_dtype: fp8
|
||||
enforce_eager: false
|
||||
apply_chat_template: true
|
||||
fewshot_as_multiturn: true
|
||||
trust_remote_code: true
|
||||
@@ -1,9 +1,6 @@
|
||||
# For vllm script, with -t option (tensor parallel size)
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
|
||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||
required_gpu_arch:
|
||||
- gfx942
|
||||
- gfx950
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
|
||||
required_gpu_arch:
|
||||
- gfx942
|
||||
- gfx950
|
||||
tasks:
|
||||
- name: "mmlu_pro"
|
||||
metrics:
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
|
||||
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.6353
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.637
|
||||
limit: null
|
||||
num_fewshot: null
|
||||
@@ -1,2 +1 @@
|
||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
||||
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
||||
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
|
||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||
Qwen2-57B-A14-Instruct.yaml
|
||||
DeepSeek-V2-Lite-Chat.yaml
|
||||
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
Qwen2.5-1.5B-Instruct.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
||||
@@ -2,7 +2,7 @@
|
||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install "lm-eval[api]>=0.4.11"
|
||||
# pip install lm-eval==0.4.9
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
|
||||
--tasks chartqa \
|
||||
--batch_size auto \
|
||||
--apply_chat_template \
|
||||
--limit "$LIMIT"
|
||||
--limit $LIMIT
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install "lm-eval[api]>=0.4.11"
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install "lm-eval[api]>=0.4.11"
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install "lm-eval[api]>=0.4.11"
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
@@ -20,11 +20,14 @@ usage() {
|
||||
echo
|
||||
}
|
||||
|
||||
while getopts "m:l:f:t:" OPT; do
|
||||
while getopts "m:b:l:f:t:" OPT; do
|
||||
case ${OPT} in
|
||||
m )
|
||||
MODEL="$OPTARG"
|
||||
;;
|
||||
b )
|
||||
BATCH_SIZE="$OPTARG"
|
||||
;;
|
||||
l )
|
||||
LIMIT="$OPTARG"
|
||||
;;
|
||||
|
||||
@@ -13,11 +13,9 @@ import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DEFAULT_RTOL = 0.08
|
||||
|
||||
|
||||
@@ -62,12 +60,8 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
f"add_bos_token=true,"
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len},"
|
||||
"allow_deprecated_quantization=True,"
|
||||
)
|
||||
|
||||
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
|
||||
model_args += "attention_backend=TRITON_ATTN"
|
||||
|
||||
env_vars = eval_config.get("env_vars", None)
|
||||
with scoped_env_vars(env_vars):
|
||||
results = lm_eval.simple_evaluate(
|
||||
@@ -90,40 +84,9 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
return results
|
||||
|
||||
|
||||
def _check_rocm_gpu_arch_requirement(eval_config):
|
||||
"""Skip the test if the model requires a ROCm GPU arch not present.
|
||||
|
||||
Model YAML configs can specify::
|
||||
|
||||
required_gpu_arch:
|
||||
- gfx942
|
||||
- gfx950
|
||||
|
||||
The check only applies on ROCm. On other platforms (e.g. CUDA) the
|
||||
field is ignored so that shared config files work for both NVIDIA and
|
||||
AMD CI pipelines.
|
||||
"""
|
||||
required_archs = eval_config.get("required_gpu_arch")
|
||||
if not required_archs:
|
||||
return
|
||||
|
||||
if not current_platform.is_rocm():
|
||||
return
|
||||
|
||||
from vllm.platforms.rocm import _GCN_ARCH # noqa: E402
|
||||
|
||||
if not any(arch in _GCN_ARCH for arch in required_archs):
|
||||
pytest.skip(
|
||||
f"Model requires GPU arch {required_archs}, "
|
||||
f"but detected arch is '{_GCN_ARCH}'"
|
||||
)
|
||||
|
||||
|
||||
def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
|
||||
|
||||
_check_rocm_gpu_arch_requirement(eval_config)
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
rtol = eval_config.get("rtol", DEFAULT_RTOL)
|
||||
@@ -138,8 +101,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
f"ground_truth={ground_truth:.3f} | "
|
||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||
)
|
||||
|
||||
min_acceptable = ground_truth * (1 - rtol)
|
||||
success = success and measured_value >= min_acceptable
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||
|
||||
assert success
|
||||
|
||||
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
|
||||
|
||||
## Performance benchmark quick overview
|
||||
|
||||
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
|
||||
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
|
||||
|
||||
**Benchmarking Duration**: about 1hr.
|
||||
|
||||
@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
|
||||
Runtime environment variables:
|
||||
|
||||
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
|
||||
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
|
||||
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
|
||||
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
|
||||
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
|
||||
@@ -34,9 +34,8 @@ Runtime environment variables:
|
||||
|
||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
||||
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
||||
> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
|
||||
> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
|
||||
|
||||
For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
|
||||
>
|
||||
### Latency test
|
||||
|
||||
Here is an example of one test inside `latency-tests.json`:
|
||||
@@ -83,6 +82,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -175,6 +175,19 @@ If you do not see the table, please wait till the benchmark finish running.
|
||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
||||
|
||||
#### Performance Results Comparison
|
||||
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
|
||||
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
|
||||
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
|
||||
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
|
||||
|
||||
Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
|
||||
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
|
||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
||||
|
||||
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
|
||||
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
|
||||
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
|
||||
|
||||
A comparison diagram will be generated below the table.
|
||||
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
|
||||
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -393,7 +393,7 @@ if __name__ == "__main__":
|
||||
with open(results_folder / md_file, "w") as f:
|
||||
results = read_markdown(
|
||||
"../.buildkite/performance-benchmarks/"
|
||||
"performance-benchmarks-descriptions.md"
|
||||
+ "performance-benchmarks-descriptions.md"
|
||||
)
|
||||
results = results.format(
|
||||
latency_tests_markdown_table=latency_md_table,
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script should be run inside the CI process
|
||||
# This script assumes that we are already inside the vllm/ directory
|
||||
# Benchmarking results will be available inside vllm/benchmarks/results/
|
||||
|
||||
@@ -7,26 +9,14 @@
|
||||
set -x
|
||||
set -o pipefail
|
||||
|
||||
# Environment-driven debug controls (like ON_CPU=1)
|
||||
DRY_RUN="${DRY_RUN:-0}"
|
||||
MODEL_FILTER="${MODEL_FILTER:-}"
|
||||
DTYPE_FILTER="${DTYPE_FILTER:-}"
|
||||
|
||||
# Adaptive search controls
|
||||
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
|
||||
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
|
||||
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
|
||||
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
|
||||
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
|
||||
|
||||
check_gpus() {
|
||||
if command -v nvidia-smi; then
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
|
||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
||||
elif command -v hl-smi; then
|
||||
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
|
||||
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
|
||||
fi
|
||||
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
@@ -35,9 +25,9 @@ check_gpus() {
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
declare -g arch_suffix=''
|
||||
|
||||
|
||||
if command -v nvidia-smi; then
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
elif command -v amd-smi; then
|
||||
@@ -54,16 +44,12 @@ check_cpus() {
|
||||
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
||||
if [[ $numa_count -gt 0 ]]; then
|
||||
echo "NUMA found."
|
||||
echo "$numa_count"
|
||||
echo $numa_count
|
||||
else
|
||||
echo "Need at least 1 NUMA to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
|
||||
declare -g gpu_type="arm64-cpu"
|
||||
else
|
||||
declare -g gpu_type="cpu"
|
||||
fi
|
||||
declare -g gpu_type="cpu"
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
@@ -122,12 +108,13 @@ json2envs() {
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
local timeout_val="1200"
|
||||
timeout "$timeout_val" bash -c '
|
||||
until curl -sf http://localhost:8000/v1/models >/dev/null; do
|
||||
until curl -X POST localhost:8000/v1/completions; do
|
||||
sleep 1
|
||||
done
|
||||
'
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
kill_processes_launched_by_current_bash() {
|
||||
@@ -190,318 +177,19 @@ upload_to_buildkite() {
|
||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
# -------------------------------
|
||||
# Adaptive concurrency helpers
|
||||
# -------------------------------
|
||||
result_json_path_for_serving() {
|
||||
local test_name=$1
|
||||
local qps=$2
|
||||
local max_concurrency=$3
|
||||
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
|
||||
}
|
||||
run_latency_tests() {
|
||||
# run latency tests using `vllm bench latency` command
|
||||
# $1: a json file specifying latency test cases
|
||||
|
||||
extract_metric_ms() {
|
||||
local metric_name=$1
|
||||
local json_file=$2
|
||||
local latency_test_file
|
||||
latency_test_file=$1
|
||||
|
||||
[[ -f "$json_file" ]] || return 0
|
||||
|
||||
if [[ "$metric_name" == "ttft" ]]; then
|
||||
jq -r '
|
||||
[
|
||||
.ttft_ms.p99?,
|
||||
.metrics.ttft_ms.p99?,
|
||||
.ttft.p99?,
|
||||
.metrics.ttft.p99?,
|
||||
.p99_ttft_ms?,
|
||||
.ttft_ms.mean?,
|
||||
.metrics.ttft_ms.mean?,
|
||||
.ttft.mean?,
|
||||
.metrics.ttft.mean?,
|
||||
.mean_ttft_ms?
|
||||
] | map(select(. != null)) | .[0] // empty
|
||||
' "$json_file"
|
||||
else
|
||||
jq -r '
|
||||
[
|
||||
.tpot_ms.p99?,
|
||||
.metrics.tpot_ms.p99?,
|
||||
.tpot.p99?,
|
||||
.metrics.tpot.p99?,
|
||||
.p99_tpot_ms?,
|
||||
.itl_ms.p99?,
|
||||
.metrics.itl_ms.p99?,
|
||||
.inter_token_latency_ms.p99?,
|
||||
.tpot_ms.mean?,
|
||||
.metrics.tpot_ms.mean?,
|
||||
.tpot.mean?,
|
||||
.metrics.tpot.mean?,
|
||||
.itl_ms.mean?,
|
||||
.metrics.itl_ms.mean?,
|
||||
.mean_tpot_ms?,
|
||||
.mean_itl_ms?
|
||||
] | map(select(. != null)) | .[0] // empty
|
||||
' "$json_file"
|
||||
fi
|
||||
}
|
||||
|
||||
evaluate_sla_from_json() {
|
||||
local json_file=$1
|
||||
local ttft
|
||||
local tpot
|
||||
local pass
|
||||
|
||||
[[ -f "$json_file" ]] || return 2
|
||||
|
||||
ttft=$(extract_metric_ms ttft "$json_file")
|
||||
tpot=$(extract_metric_ms tpot "$json_file")
|
||||
|
||||
[[ -n "$ttft" && -n "$tpot" ]] || return 2
|
||||
|
||||
pass=$(jq -n \
|
||||
--argjson ttft "$ttft" \
|
||||
--argjson tpot "$tpot" \
|
||||
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
|
||||
|
||||
[[ "$pass" == "true" ]]
|
||||
}
|
||||
|
||||
write_adaptive_summary_json() {
|
||||
local summary_file=$1
|
||||
local test_name=$2
|
||||
local qps=$3
|
||||
local static_last_pass=$4
|
||||
local static_first_fail=$5
|
||||
local final_last_pass=$6
|
||||
local final_first_fail=$7
|
||||
|
||||
jq -n \
|
||||
--arg test_name "$test_name" \
|
||||
--arg qps "$qps" \
|
||||
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||
--arg static_last_pass "${static_last_pass:-}" \
|
||||
--arg static_first_fail "${static_first_fail:-}" \
|
||||
--arg final_last_pass "${final_last_pass:-}" \
|
||||
--arg final_first_fail "${final_first_fail:-}" \
|
||||
'{
|
||||
test_name: $test_name,
|
||||
qps: $qps,
|
||||
sla_ttft_ms: $sla_ttft,
|
||||
sla_tpot_ms: $sla_tpot,
|
||||
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
|
||||
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
|
||||
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
|
||||
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
|
||||
}' > "$summary_file"
|
||||
}
|
||||
|
||||
run_single_serving_probe() {
|
||||
local test_name=$1
|
||||
local qps=$2
|
||||
local max_concurrency=$3
|
||||
local tp=$4
|
||||
local compilation_config_mode=$5
|
||||
local optimization_level=$6
|
||||
local client_args_effective=$7
|
||||
local client_remote_args=$8
|
||||
local server_command=$9
|
||||
|
||||
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||
local result_json
|
||||
local num_prompts_arg=""
|
||||
local client_command
|
||||
|
||||
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
|
||||
|
||||
if [[ -f "$result_json" ]]; then
|
||||
evaluate_sla_from_json "$result_json"
|
||||
return $?
|
||||
fi
|
||||
|
||||
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
|
||||
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
|
||||
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
|
||||
num_prompts_arg="--num-prompts $num_prompts"
|
||||
fi
|
||||
|
||||
client_command="vllm bench serve \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--max-concurrency $max_concurrency \
|
||||
$num_prompts_arg \
|
||||
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
|
||||
$client_args_effective $client_remote_args "
|
||||
|
||||
echo "Adaptive probe: $client_command"
|
||||
|
||||
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||
bash -c "$client_command"
|
||||
fi
|
||||
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
adaptive_search: true
|
||||
}')
|
||||
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
evaluate_sla_from_json "$result_json"
|
||||
}
|
||||
|
||||
adaptive_refine_from_static_results() {
|
||||
local test_name=$1
|
||||
local qps=$2
|
||||
local max_concurrency_list_raw=$3
|
||||
local tp=$4
|
||||
local compilation_config_mode=$5
|
||||
local optimization_level=$6
|
||||
local client_args_effective=$7
|
||||
local client_remote_args=$8
|
||||
local server_command=$9
|
||||
|
||||
local sorted_points
|
||||
local point
|
||||
local rc
|
||||
local static_last_pass=""
|
||||
local static_first_fail=""
|
||||
local largest_static=""
|
||||
local step_hint=1
|
||||
local previous_point=""
|
||||
local low
|
||||
local high
|
||||
local mid
|
||||
local probes=0
|
||||
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
|
||||
|
||||
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
|
||||
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
|
||||
|
||||
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
|
||||
[[ -n "$sorted_points" ]] || return 0
|
||||
|
||||
while read -r point; do
|
||||
[[ -z "$point" ]] && continue
|
||||
largest_static="$point"
|
||||
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
static_last_pass="$point"
|
||||
elif (( rc == 1 )); then
|
||||
if [[ -n "$static_last_pass" ]]; then
|
||||
static_first_fail="$point"
|
||||
break
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -n "$previous_point" ]]; then
|
||||
step_hint=$(( point - previous_point ))
|
||||
if (( step_hint < 1 )); then step_hint=1; fi
|
||||
fi
|
||||
previous_point="$point"
|
||||
done <<< "$sorted_points"
|
||||
|
||||
if [[ -z "$static_last_pass" ]]; then
|
||||
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -n "$static_first_fail" ]]; then
|
||||
low=$static_last_pass
|
||||
high=$static_first_fail
|
||||
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||
mid=$(( (low + high) / 2 ))
|
||||
probes=$(( probes + 1 ))
|
||||
run_single_serving_probe \
|
||||
"$test_name" "$qps" "$mid" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
low=$mid
|
||||
elif (( rc == 1 )); then
|
||||
high=$mid
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
|
||||
return 0
|
||||
fi
|
||||
|
||||
low=$largest_static
|
||||
high=""
|
||||
while (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||
point=$(( low + step_hint ))
|
||||
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
|
||||
point=$ADAPTIVE_MAX_CONCURRENCY
|
||||
fi
|
||||
(( point > low )) || break
|
||||
probes=$(( probes + 1 ))
|
||||
run_single_serving_probe \
|
||||
"$test_name" "$qps" "$point" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
low=$point
|
||||
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
|
||||
step_hint=$(( step_hint * 2 ))
|
||||
if (( step_hint < 1 )); then step_hint=1; fi
|
||||
elif (( rc == 1 )); then
|
||||
high=$point
|
||||
break
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ -n "$high" ]]; then
|
||||
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||
mid=$(( (low + high) / 2 ))
|
||||
probes=$(( probes + 1 ))
|
||||
run_single_serving_probe \
|
||||
"$test_name" "$qps" "$mid" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
low=$mid
|
||||
elif (( rc == 1 )); then
|
||||
high=$mid
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
|
||||
}
|
||||
|
||||
run_benchmark_tests() {
|
||||
# run benchmark tests using `vllm bench <test_type>` command
|
||||
# $1: test type (latency or throughput)
|
||||
# $2: a json file specifying test cases
|
||||
|
||||
local test_type=$1
|
||||
local test_file=$2
|
||||
|
||||
# Iterate over tests
|
||||
jq -c '.[]' "$test_file" | while read -r params; do
|
||||
# Iterate over latency tests
|
||||
jq -c '.[]' "$latency_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
|
||||
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
|
||||
if [[ ! "$test_name" =~ ^latency_ ]]; then
|
||||
echo "In latency-test.json, test_name must start with \"latency_\"."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -512,15 +200,15 @@ run_benchmark_tests() {
|
||||
fi
|
||||
|
||||
# get arguments
|
||||
bench_params=$(echo "$params" | jq -r '.parameters')
|
||||
bench_args=$(json2args "$bench_params")
|
||||
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
bench_envs=$(json2envs "$bench_environment_variables")
|
||||
latency_params=$(echo "$params" | jq -r '.parameters')
|
||||
latency_args=$(json2args "$latency_params")
|
||||
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
latency_envs=$(json2envs "$latency_environment_variables")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ "$ON_CPU" == "1" ]]; then
|
||||
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
|
||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ]; then
|
||||
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
@@ -533,42 +221,118 @@ run_benchmark_tests() {
|
||||
fi
|
||||
fi
|
||||
|
||||
bench_command=" $bench_envs vllm bench $test_type \
|
||||
latency_command=" $latency_envs vllm bench latency \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$bench_args"
|
||||
$latency_args"
|
||||
|
||||
echo "Running test case $test_name"
|
||||
echo "${test_type^} command: $bench_command"
|
||||
echo "Latency command: $latency_command"
|
||||
|
||||
# recording benchmarking command and GPU command
|
||||
# recoding benchmarking command ang GPU command
|
||||
jq_output=$(jq -n \
|
||||
--arg command "$bench_command" \
|
||||
--arg latency "$latency_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg test_type "$test_type" \
|
||||
'{
|
||||
($test_type + "_command"): $command,
|
||||
latency_command: $latency,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||
|
||||
# run the benchmark
|
||||
eval "$bench_command"
|
||||
eval "$latency_command"
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
done
|
||||
}
|
||||
|
||||
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
|
||||
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
|
||||
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
|
||||
run_throughput_tests() {
|
||||
# run throughput tests using `vllm bench throughput`
|
||||
# $1: a json file specifying throughput test cases
|
||||
|
||||
merge_serving_tests_stream() {
|
||||
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
|
||||
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
|
||||
local serving_test_file="$1"
|
||||
# shellcheck disable=SC2016
|
||||
local merged='
|
||||
local throughput_test_file
|
||||
throughput_test_file=$1
|
||||
|
||||
# Iterate over throughput tests
|
||||
jq -c '.[]' "$throughput_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^throughput_ ]]; then
|
||||
echo "In throughput-test.json, test_name must start with \"throughput_\"."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# get arguments
|
||||
throughput_params=$(echo "$params" | jq -r '.parameters')
|
||||
throughput_args=$(json2args "$throughput_params")
|
||||
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
throughput_envs=$(json2envs "$throughput_environment_variables")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ]; then
|
||||
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
throughput_command=" $throughput_envs vllm bench throughput \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$throughput_args"
|
||||
|
||||
echo "Running test case $test_name"
|
||||
echo "Throughput command: $throughput_command"
|
||||
# recoding benchmarking command ang GPU command
|
||||
jq_output=$(jq -n \
|
||||
--arg command "$throughput_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
throughput_command: $command,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||
|
||||
# run the benchmark
|
||||
eval "$throughput_command"
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
done
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `vllm bench serve` command
|
||||
# $1: a json file specifying serving test cases
|
||||
#
|
||||
# Supported JSON formats:
|
||||
# 1) Plain format: top-level array
|
||||
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
#
|
||||
# 2) Default parameters field + plain format tests
|
||||
# {
|
||||
# "defaults": { ... },
|
||||
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
# }
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '
|
||||
if type == "array" then
|
||||
# Plain format: test cases array
|
||||
.[]
|
||||
@@ -590,50 +354,7 @@ merge_serving_tests_stream() {
|
||||
else
|
||||
error("Unsupported serving test file format: must be array or object with .tests")
|
||||
end
|
||||
'
|
||||
|
||||
jq -c "$merged" "$serving_test_file" | \
|
||||
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
|
||||
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
|
||||
select((($model|length)==0)
|
||||
or ((.server_parameters.model // "") == $model)
|
||||
or ((.client_parameters.model // "") == $model))
|
||||
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
|
||||
'
|
||||
else
|
||||
cat
|
||||
fi
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `vllm bench serve` command
|
||||
# $1: a json file specifying serving test cases
|
||||
#
|
||||
# Supported JSON formats:
|
||||
# 1) Plain format: top-level array
|
||||
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
#
|
||||
# 2) Default parameters field + plain format tests
|
||||
# {
|
||||
# "defaults": { ... },
|
||||
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
# }
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# In dry-run mode, if filters are provided but no tests match, fail fast.
|
||||
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
|
||||
local count
|
||||
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
|
||||
if [[ "$count" -eq 0 ]]; then
|
||||
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
# Iterate over serving tests (merged + optional filtered stream)
|
||||
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
|
||||
' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||
@@ -652,48 +373,10 @@ run_serving_tests() {
|
||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||
|
||||
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
|
||||
server_model=$(echo "$server_params" | jq -r '.model // empty')
|
||||
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
|
||||
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
|
||||
exit 1
|
||||
fi
|
||||
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
|
||||
server_args=$(json2args "$server_params_no_model")
|
||||
|
||||
server_args=$(json2args "$server_params")
|
||||
server_envs=$(json2envs "$server_envs")
|
||||
client_args=$(json2args "$client_params")
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Option 1: Dynamic num-prompts scaling based on max_concurrency
|
||||
#
|
||||
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
|
||||
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
|
||||
#
|
||||
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
|
||||
# unchanged (i.e., whatever is in serving-tests-*.json).
|
||||
# ------------------------------------------------------------
|
||||
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
|
||||
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
|
||||
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
|
||||
|
||||
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
|
||||
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
|
||||
# Handles: --num-prompts 123 and --num-prompts=123
|
||||
client_args_no_np="$(
|
||||
printf ' %s ' "$client_args" \
|
||||
| sed -E \
|
||||
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
|
||||
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
|
||||
)"
|
||||
# normalize whitespace
|
||||
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
|
||||
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
|
||||
client_args_effective="$client_args_no_np"
|
||||
else
|
||||
client_args_effective="$client_args"
|
||||
fi
|
||||
# qps_list
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
@@ -710,8 +393,8 @@ run_serving_tests() {
|
||||
|
||||
# check if there is enough resources to run the test
|
||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ "$ON_CPU" == "1" ]]; then
|
||||
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
|
||||
if [ "$ON_CPU" == "1" ]; then
|
||||
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
@@ -725,13 +408,14 @@ run_serving_tests() {
|
||||
fi
|
||||
|
||||
# check if server model and client model is aligned
|
||||
server_model=$(echo "$server_params" | jq -r '.model')
|
||||
client_model=$(echo "$client_params" | jq -r '.model')
|
||||
if [[ $server_model != "$client_model" ]]; then
|
||||
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
server_command="$server_envs vllm serve $server_model \
|
||||
server_command="$server_envs vllm serve \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
@@ -739,7 +423,7 @@ run_serving_tests() {
|
||||
echo "Server command: $server_command"
|
||||
# support remote vllm server
|
||||
client_remote_args=""
|
||||
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
|
||||
if [[ -z "${REMOTE_HOST}" ]]; then
|
||||
bash -c "$server_command" &
|
||||
server_pid=$!
|
||||
# wait until the server is alive
|
||||
@@ -750,9 +434,6 @@ run_serving_tests() {
|
||||
echo ""
|
||||
echo "vLLM failed to start within the timeout period."
|
||||
fi
|
||||
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
|
||||
# dry-run: don't start server
|
||||
echo "Dry Run."
|
||||
else
|
||||
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
|
||||
if [[ ${REMOTE_PORT} ]]; then
|
||||
@@ -762,48 +443,34 @@ run_serving_tests() {
|
||||
fi
|
||||
fi
|
||||
|
||||
# save the compilation mode and optimization level on the serving results
|
||||
# whenever they are set
|
||||
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
|
||||
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
# iterate over different max_concurrency
|
||||
for max_concurrency in $max_concurrency_list; do
|
||||
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
|
||||
echo " new test name $new_test_name"
|
||||
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
|
||||
num_prompts_arg=""
|
||||
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
|
||||
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
|
||||
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
|
||||
num_prompts_arg="--num-prompts $num_prompts"
|
||||
fi
|
||||
# pass the tensor parallel size, the compilation mode, and the optimization
|
||||
# level to the client so that they can be used on the benchmark dashboard
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
client_command="vllm bench serve \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--max-concurrency $max_concurrency \
|
||||
$num_prompts_arg \
|
||||
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
|
||||
$client_args_effective $client_remote_args "
|
||||
--metadata "tensor_parallel_size=$tp" \
|
||||
$client_args $client_remote_args "
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||
bash -c "$client_command"
|
||||
fi
|
||||
bash -c "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
@@ -818,39 +485,25 @@ run_serving_tests() {
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
adaptive_refine_from_static_results \
|
||||
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
done
|
||||
|
||||
# clean up
|
||||
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||
kill -9 "$server_pid"
|
||||
kill_gpu_processes
|
||||
fi
|
||||
kill -9 $server_pid
|
||||
kill_gpu_processes
|
||||
done
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
local ARCH
|
||||
ARCH=''
|
||||
if [[ "$ON_CPU" == "1" ]]; then
|
||||
check_cpus
|
||||
ARCH="-$gpu_type"
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
check_cpus
|
||||
ARCH='-cpu'
|
||||
else
|
||||
check_gpus
|
||||
ARCH="$arch_suffix"
|
||||
fi
|
||||
|
||||
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
|
||||
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||
check_hf_token
|
||||
else
|
||||
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
|
||||
fi
|
||||
check_hf_token
|
||||
|
||||
# dependencies
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
@@ -871,24 +524,17 @@ main() {
|
||||
|
||||
# dump vllm info via vllm collect-env
|
||||
env_output=$(vllm collect-env)
|
||||
|
||||
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
||||
|
||||
# benchmarking
|
||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
|
||||
|
||||
if [[ "${DRY_RUN:-0}" == "1" ]]; then
|
||||
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
||||
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
||||
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
|
||||
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
|
||||
|
||||
# postprocess benchmarking results
|
||||
pip install tabulate pandas
|
||||
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
|
||||
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
|
||||
|
||||
upload_to_buildkite
|
||||
}
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "latency_llama8B_tp1",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -51,56 +51,5 @@
|
||||
"max-model-len": 256,
|
||||
"async-scheduling": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_deepseek_r1",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"tensor_parallel_size": 8,
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
"dtype": "bfloat16"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"tensor_parallel_size": 8,
|
||||
"max-model-len": 512,
|
||||
"max-num-seqs": 128,
|
||||
"async-scheduling": "",
|
||||
"gpu-memory-utilization": 0.95,
|
||||
"enable_expert_parallel": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_qwen3_8b",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 128,
|
||||
"dtype": "bfloat16",
|
||||
"async-scheduling": ""
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [
|
||||
12,
|
||||
16,
|
||||
24,
|
||||
32,
|
||||
64,
|
||||
128,
|
||||
200
|
||||
],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"ignore-eos": "",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
|
||||
},
|
||||
"server_parameters": {
|
||||
"dtype": "bfloat16",
|
||||
"model": "openai/whisper-large-v3-turbo"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "openai/whisper-large-v3-turbo",
|
||||
"backend": "openai-audio",
|
||||
"endpoint": "/v1/audio/transcriptions",
|
||||
"dataset_name": "hf",
|
||||
"dataset_path": "openslr/librispeech_asr",
|
||||
"hf_subset": "clean",
|
||||
"hf_split": "test",
|
||||
"no_stream": "",
|
||||
"no_oversample": "",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,41 +0,0 @@
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [
|
||||
32,
|
||||
64,
|
||||
128
|
||||
],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"dtype": "bfloat16",
|
||||
"model": "jinaai/jina-embeddings-v3",
|
||||
"trust_remote_code": ""
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "jinaai/jina-embeddings-v3",
|
||||
"backend": "openai-embeddings",
|
||||
"endpoint": "/v1/embeddings",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,356 +0,0 @@
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"ignore-eos": "",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -19,14 +19,15 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"ignore-eos": "",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
@@ -73,6 +74,17 @@
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
@@ -95,6 +107,17 @@
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
@@ -118,25 +141,105 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_2048",
|
||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_2048",
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -21,7 +22,6 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
@@ -37,6 +37,7 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -48,7 +49,6 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
@@ -64,6 +64,7 @@
|
||||
"server_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -75,88 +76,6 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_deepseek_r1",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"tensor_parallel_size": 8,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 200,
|
||||
"async-scheduling": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"tensor_parallel_size": 8,
|
||||
"disable_log_stats": "",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 128,
|
||||
"async-scheduling": "",
|
||||
"enable_expert_parallel": "",
|
||||
"max-num-batched-tokens": 4096
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen3_8b",
|
||||
"qps_list": [1, 4, 10, "inf"],
|
||||
"server_environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"disable_log_stats": "",
|
||||
"async-scheduling": ""
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen-3-8B",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -13,7 +14,6 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
@@ -23,6 +23,7 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -31,7 +32,6 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
@@ -41,6 +41,7 @@
|
||||
"server_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -49,7 +50,6 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
@@ -59,6 +59,7 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"speculative_config": {
|
||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
@@ -70,7 +71,6 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"temperature": 0,
|
||||
"num_prompts": 200
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp1",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -57,67 +57,5 @@
|
||||
"max-num-seqs": 512,
|
||||
"async-scheduling": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_deepseek_r1",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"tensor_parallel_size": 8,
|
||||
"load_format": "dummy",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"dataset_name": "sharegpt",
|
||||
"num_prompts": 1000,
|
||||
"backend": "vllm",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 384,
|
||||
"async-scheduling": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"tensor_parallel_size": 8,
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"dataset_name": "sharegpt",
|
||||
"num_prompts": 1000,
|
||||
"backend": "vllm",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 512,
|
||||
"async-scheduling": "",
|
||||
"enable_expert_parallel": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_qwen3_8b",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "Qwen/Qwen-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"dataset_name": "sharegpt",
|
||||
"num_prompts": 1000,
|
||||
"max-num-seqs": 512,
|
||||
"backend": "vllm",
|
||||
"async-scheduling": ""
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -1,694 +1,194 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# Build release images (12.9)
|
||||
- label: "Build release image (x86)"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image (arm64)"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
- label: "Create multi-arch manifest"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- input: "Provide Release version here"
|
||||
id: input-release-version
|
||||
fields:
|
||||
- text: "What is the release version?"
|
||||
key: release-version
|
||||
|
||||
- group: "Build Python wheels"
|
||||
key: "build-wheels"
|
||||
steps:
|
||||
- label: "Build wheel - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - aarch64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - x86_64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Generate and upload wheel indices"
|
||||
depends_on: "build-wheels"
|
||||
allow_dependency_failure: true
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "bash .buildkite/scripts/generate-and-upload-nightly-index.sh"
|
||||
|
||||
- group: "Build release Docker images"
|
||||
key: "build-release-images"
|
||||
steps:
|
||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86-ubuntu2404
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64-ubuntu2404
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86-cuda-13-0-ubuntu2404
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64-cuda-13-0-ubuntu2404
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
|
||||
|
||||
- block: "Build release image for x86_64 CPU"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - CPU"
|
||||
depends_on:
|
||||
- block-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image for arm64 CPU"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - arm64 - CPU"
|
||||
depends_on:
|
||||
- block-arm64-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: arm64_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- group: "Publish release images"
|
||||
key: "publish-release-images"
|
||||
steps:
|
||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow - CUDA 12.9"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
||||
depends_on:
|
||||
- build-release-image-x86-cuda-13-0
|
||||
- build-release-image-arm64-cuda-13-0
|
||||
id: create-multi-arch-manifest-cuda-13-0
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
|
||||
depends_on:
|
||||
- build-release-image-x86-ubuntu2404
|
||||
- build-release-image-arm64-ubuntu2404
|
||||
id: create-multi-arch-manifest-ubuntu2404
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
|
||||
depends_on:
|
||||
- build-release-image-x86-cuda-13-0-ubuntu2404
|
||||
- build-release-image-arm64-cuda-13-0-ubuntu2404
|
||||
id: create-multi-arch-manifest-cuda-13-0-ubuntu2404
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest-cuda-13-0
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- group: "Publish wheels"
|
||||
key: "publish-wheels"
|
||||
steps:
|
||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||
key: block-upload-release-wheels
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- build-wheels
|
||||
|
||||
- label: "Upload release wheels to PyPI"
|
||||
depends_on:
|
||||
- block-upload-release-wheels
|
||||
id: upload-release-wheels
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||
|
||||
# =============================================================================
|
||||
# ROCm Release Pipeline (x86_64 only)
|
||||
# =============================================================================
|
||||
#
|
||||
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
|
||||
# To build a specific version, trigger the build from that branch/tag.
|
||||
#
|
||||
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
||||
- label: ":rocm: Build ROCm Base Image & Wheels"
|
||||
id: build-rocm-base-wheels
|
||||
- block: "Build CPU release image"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish CPU release image"
|
||||
depends_on: block-cpu-release-image-build
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Generate cache key
|
||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
ECR_CACHE_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:$${CACHE_KEY}-rocm-base"
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Base Build Configuration"
|
||||
echo "========================================"
|
||||
echo " CACHE_KEY: $${CACHE_KEY}"
|
||||
echo " ECR_CACHE_TAG: $${ECR_CACHE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
IMAGE_EXISTS=false
|
||||
WHEELS_EXIST=false
|
||||
|
||||
# Check ECR for Docker image
|
||||
|
||||
if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then
|
||||
IMAGE_EXISTS=true
|
||||
echo "ECR image cache HIT"
|
||||
fi
|
||||
|
||||
# Check S3 for wheels
|
||||
WHEEL_CACHE_STATUS=$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
||||
if [ "$${WHEEL_CACHE_STATUS}" = "hit" ]; then
|
||||
WHEELS_EXIST=true
|
||||
echo "S3 wheels cache HIT"
|
||||
fi
|
||||
|
||||
|
||||
# Scenario 1: Both cached (best case)
|
||||
if [ "$${IMAGE_EXISTS}" = "true" ] && [ "$${WHEELS_EXIST}" = "true" ]; then
|
||||
echo ""
|
||||
echo "FULL CACHE HIT - Reusing both image and wheels"
|
||||
echo ""
|
||||
|
||||
# Download wheels
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
||||
|
||||
# Save ECR tag for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}"
|
||||
|
||||
# Scenario 2: Full rebuild needed
|
||||
else
|
||||
echo ""
|
||||
echo " CACHE MISS - Building from scratch..."
|
||||
echo ""
|
||||
|
||||
# Build full base image and push to ECR
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag "$${ECR_CACHE_TAG}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--push \
|
||||
.
|
||||
|
||||
# Build wheel extraction stage
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
||||
--target debs_wheel_release \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Extract and upload wheels
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
cid=$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${cid}
|
||||
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
||||
|
||||
# Cache base docker image to ECR
|
||||
docker push "$${ECR_CACHE_TAG}"
|
||||
|
||||
buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}"
|
||||
|
||||
echo ""
|
||||
echo " Build complete - Image and wheels cached"
|
||||
fi
|
||||
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-base-wheels/*.whl"
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||
- label: ":python: Build vLLM ROCm Wheel - x86_64"
|
||||
id: build-rocm-vllm-wheel
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
- block: "Build arm64 CPU release image"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish arm64 CPU release image"
|
||||
depends_on: block-arm64-cpu-release-image-build
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
timeout_in_minutes: 180
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# Download artifacts and prepare Docker image
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
|
||||
# This fixes version detection when tags are moved/force-pushed
|
||||
echo "Fetching latest tags from origin..."
|
||||
git fetch --tags --force origin
|
||||
|
||||
# Log tag information for debugging version detection
|
||||
echo "========================================"
|
||||
echo "Git Tag Verification"
|
||||
echo "========================================"
|
||||
echo "Current HEAD: $(git rev-parse HEAD)"
|
||||
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
|
||||
echo ""
|
||||
echo "Recent tags (pointing to commits near HEAD):"
|
||||
git tag -l --sort=-creatordate | head -5
|
||||
echo "setuptools_scm version detection:"
|
||||
pip install -q setuptools_scm 2>/dev/null || true
|
||||
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
|
||||
echo "========================================"
|
||||
|
||||
# Download wheel artifacts from current build
|
||||
echo "Downloading wheel artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
|
||||
# Get ECR image tag from metadata (set by build-rocm-base-wheels)
|
||||
ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')"
|
||||
if [ -z "$${ECR_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: rocm-base-image-tag metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Pull base Docker image from ECR
|
||||
docker pull "$${ECR_IMAGE_TAG}"
|
||||
|
||||
echo "Loaded base image: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Prepare base wheels for Docker build context
|
||||
mkdir -p docker/context/base-wheels
|
||||
touch docker/context/base-wheels/.keep
|
||||
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
|
||||
echo "Base wheels for vLLM build:"
|
||||
ls -lh docker/context/base-wheels/
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM wheel with:"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo " BASE_IMAGE: $${ECR_IMAGE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--file docker/Dockerfile.rocm \
|
||||
--target export_vllm_wheel_release \
|
||||
--output type=local,dest=rocm-dist \
|
||||
--build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \
|
||||
--build-arg REMOTE_VLLM=0 \
|
||||
--build-arg GIT_REPO_CHECK=1 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
.
|
||||
echo "Built vLLM wheel:"
|
||||
ls -lh rocm-dist/*.whl
|
||||
# Copy wheel to artifacts directory
|
||||
mkdir -p artifacts/rocm-vllm-wheel
|
||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
||||
echo "Final vLLM wheel:"
|
||||
ls -lh artifacts/rocm-vllm-wheel/
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-vllm-wheel/*.whl"
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 3: Upload Wheels to S3
|
||||
- label: ":s3: Upload ROCm Wheels to S3"
|
||||
id: upload-rocm-wheels
|
||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- step: build-rocm-vllm-wheel
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
# Download all wheel artifacts and run upload
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Download artifacts from current build
|
||||
echo "Downloading artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
|
||||
|
||||
# Run upload script
|
||||
bash .buildkite/scripts/upload-rocm-wheels.sh
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 4: Annotate ROCm Wheel Release
|
||||
- label: ":memo: Annotate ROCm wheel release"
|
||||
id: annotate-rocm-release
|
||||
depends_on:
|
||||
- upload-rocm-wheels
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
|
||||
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
|
||||
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
|
||||
- block: "Generate Root Index for ROCm Wheels for Release"
|
||||
key: block-generate-root-index-rocm-wheels
|
||||
depends_on: upload-rocm-wheels
|
||||
|
||||
- label: ":package: Generate Root Index for ROCm Wheels for Release"
|
||||
depends_on: block-generate-root-index-rocm-wheels
|
||||
id: generate-root-index-rocm-wheels
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
VARIANT: "rocm721"
|
||||
|
||||
# ROCm Job 6: Build ROCm Release Docker Image
|
||||
- label: ":docker: Build release image - x86_64 - ROCm"
|
||||
id: build-rocm-release-image
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Get ECR image tag from metadata (set by build-rocm-base-wheels)
|
||||
ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')"
|
||||
if [ -z "$${ECR_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: rocm-base-image-tag metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Pull base Docker image from ECR
|
||||
docker pull "$${ECR_IMAGE_TAG}"
|
||||
|
||||
echo "Loaded base image: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Pass the base image ECR tag to downstream steps (nightly publish)
|
||||
buildkite-agent meta-data set "rocm-base-ecr-tag" "$${ECR_IMAGE_TAG}"
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM ROCm release image with:"
|
||||
echo " BASE_IMAGE: $${ECR_IMAGE_TAG}"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM ROCm release image using cached base
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
|
||||
--target vllm-openai \
|
||||
--progress plain \
|
||||
-f docker/Dockerfile.rocm .
|
||||
|
||||
# Push to ECR
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
echo ""
|
||||
echo " Successfully built and pushed ROCm release image"
|
||||
echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||
echo ""
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
- label: "Publish nightly ROCm image to DockerHub"
|
||||
depends_on:
|
||||
- build-rocm-release-image
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds-rocm.sh"
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest push vllm/vllm-openai:nightly"
|
||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh nightly- vllm/vllm-openai-rocm"
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh base-nightly- vllm/vllm-openai-rocm"
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
|
||||
@@ -8,41 +8,30 @@ if [ -z "${RELEASE_VERSION}" ]; then
|
||||
RELEASE_VERSION="1.0.0.dev"
|
||||
fi
|
||||
|
||||
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
|
||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel (by commit):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
(Optional) For CUDA 13.0:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
|
||||
(Optional) For CPU:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download the wheel (by version):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
# Download images:
|
||||
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
|
||||
|
||||
# Tag and push images:
|
||||
|
||||
## CUDA
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
@@ -50,70 +39,16 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai:latest-x86_64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
docker push vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker push vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
|
||||
## ROCm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
## CPU
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker push vllm/vllm-openai-cpu:latest-arm64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
|
||||
# Create multi-arch manifest:
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest-cu130
|
||||
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker manifest push vllm/vllm-openai:latest-cu130
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
||||
|
||||
docker manifest rm vllm/vllm-openai-cpu:latest || true
|
||||
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker manifest push vllm/vllm-openai-cpu:latest
|
||||
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
||||
EOF
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Generate Buildkite annotation for ROCm wheel release
|
||||
set -ex
|
||||
|
||||
# Extract build configuration from Dockerfile.rocm_base (single source of truth)
|
||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
|
||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||
PYTHON_VERSION=$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//')
|
||||
PYTORCH_ROCM_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')
|
||||
|
||||
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
|
||||
if [ -z "${RELEASE_VERSION}" ]; then
|
||||
RELEASE_VERSION="1.0.0.dev"
|
||||
fi
|
||||
|
||||
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
|
||||
# S3 URLs
|
||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
|
||||
|
||||
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
|
||||
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
|
||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||
## ROCm Wheel and Docker Image Releases
|
||||
### Build Configuration
|
||||
| Setting | Value |
|
||||
|---------|-------|
|
||||
| **ROCm Version** | ${ROCM_VERSION} |
|
||||
| **Python Version** | ${PYTHON_VERSION} |
|
||||
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
|
||||
| **Branch** | \`${BUILDKITE_BRANCH}\` |
|
||||
| **Commit** | \`${BUILDKITE_COMMIT}\` |
|
||||
|
||||
### :package: Installation
|
||||
|
||||
**Install from this build (by commit):**
|
||||
|
||||
\`\`\`bash
|
||||
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
|
||||
# Example for ROCm ${ROCM_VERSION}:
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
**Install from nightly (if published):**
|
||||
|
||||
\`\`\`bash
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
### :floppy_disk: Download Wheels Directly
|
||||
|
||||
\`\`\`bash
|
||||
# List all ROCm wheels
|
||||
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
|
||||
# Download specific wheels
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
|
||||
\`\`\`
|
||||
|
||||
### :gear: Included Packages
|
||||
- **vllm**: vLLM with ROCm support
|
||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||
- **triton**: Triton
|
||||
- **triton-kernels**: Triton kernels
|
||||
- **torchvision**: TorchVision for ROCm PyTorch
|
||||
- **torchaudio**: Torchaudio for ROCm PyTorch
|
||||
- **amdsmi**: AMD SMI Python bindings
|
||||
- **amd_aiter**: Aiter for ROCm
|
||||
- **flash-attn**: Flash Attention for ROCm
|
||||
|
||||
### :warning: Notes
|
||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||
- Platform: Linux x86_64 only
|
||||
|
||||
### :package: Docker Image Release
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
|
||||
EOF
|
||||
@@ -1,131 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Cache helper for ROCm base wheels
|
||||
#
|
||||
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
|
||||
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
|
||||
#
|
||||
# Usage:
|
||||
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
|
||||
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
|
||||
# cache-rocm-base-wheels.sh download - Download wheels from cache
|
||||
# cache-rocm-base-wheels.sh key - Output the cache key
|
||||
#
|
||||
# Environment variables:
|
||||
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
|
||||
# so changes to ROCm version are captured by the Dockerfile hash.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
DOCKERFILE="docker/Dockerfile.rocm_base"
|
||||
CACHE_PREFIX="rocm/cache"
|
||||
|
||||
# Generate hash from Dockerfile content + build args
|
||||
generate_cache_key() {
|
||||
# Include Dockerfile content
|
||||
if [[ ! -f "$DOCKERFILE" ]]; then
|
||||
echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
|
||||
|
||||
echo "${dockerfile_hash}"
|
||||
}
|
||||
|
||||
CACHE_KEY=$(generate_cache_key)
|
||||
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
|
||||
|
||||
case "${1:-}" in
|
||||
check)
|
||||
echo "Checking cache for key: ${CACHE_KEY}" >&2
|
||||
echo "Cache path: ${CACHE_PATH}" >&2
|
||||
|
||||
# Check if cache exists by listing objects
|
||||
# We look for at least one .whl file
|
||||
echo "Running: aws s3 ls ${CACHE_PATH}" >&2
|
||||
S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
|
||||
echo "S3 ls output:" >&2
|
||||
echo "$S3_OUTPUT" | head -5 >&2
|
||||
|
||||
if echo "$S3_OUTPUT" | grep -q "\.whl"; then
|
||||
echo "hit"
|
||||
else
|
||||
echo "miss"
|
||||
fi
|
||||
;;
|
||||
|
||||
upload)
|
||||
echo "========================================"
|
||||
echo "Uploading wheels to cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
|
||||
echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Uploading $WHEEL_COUNT wheels..."
|
||||
aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
|
||||
|
||||
echo ""
|
||||
echo "Cache upload complete!"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
download)
|
||||
echo "========================================"
|
||||
echo "Downloading wheels from cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
|
||||
# Use sync with include/exclude to only download .whl files
|
||||
aws s3 sync "${CACHE_PATH}" artifacts/rocm-base-wheels/ \
|
||||
--exclude "*" \
|
||||
--include "*.whl"
|
||||
|
||||
echo ""
|
||||
echo "Downloaded wheels:"
|
||||
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
|
||||
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||
echo ""
|
||||
echo "Total: $WHEEL_COUNT wheels"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
key)
|
||||
echo "${CACHE_KEY}"
|
||||
;;
|
||||
|
||||
path)
|
||||
echo "${CACHE_PATH}"
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Usage: $0 {check|upload|download|key|path}" >&2
|
||||
echo "" >&2
|
||||
echo "Commands:" >&2
|
||||
echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
|
||||
echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
|
||||
echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
|
||||
echo " key - Output the cache key" >&2
|
||||
echo " path - Output the full S3 cache path" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -1,235 +0,0 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Check if Ray LLM can generate lock files that are compatible with this
|
||||
# version of vllm. Downloads Ray's requirement files and runs a full
|
||||
# dependency resolution with the installed vllm's constraints to see if
|
||||
# a valid lock file can be produced.
|
||||
#
|
||||
# See: https://github.com/vllm-project/vllm/issues/33599
|
||||
|
||||
set -eo pipefail
|
||||
|
||||
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
|
||||
|
||||
WORK_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$WORK_DIR"' EXIT
|
||||
|
||||
# ── Detect PyTorch index URL ─────────────────────────────────────────────
|
||||
|
||||
if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
|
||||
ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
|
||||
CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
|
||||
if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
|
||||
TORCH_INDEX_URL="${CANDIDATE_URL}"
|
||||
else
|
||||
echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
|
||||
echo ">>> Falling back to default PyPI (resolution may be incomplete)"
|
||||
TORCH_INDEX_URL=""
|
||||
fi
|
||||
else
|
||||
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
|
||||
fi
|
||||
echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
|
||||
|
||||
# Fetch all Ray requirement files used in the LLM depset pipeline
|
||||
echo ">>> Fetching Ray requirement files"
|
||||
RAY_FILES=(
|
||||
"requirements.txt"
|
||||
"requirements/cloud-requirements.txt"
|
||||
"requirements/base-test-requirements.txt"
|
||||
"requirements/llm/llm-requirements.txt"
|
||||
"requirements/llm/llm-test-requirements.txt"
|
||||
)
|
||||
for FILE in "${RAY_FILES[@]}"; do
|
||||
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
|
||||
echo " ${FILE}"
|
||||
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
|
||||
done
|
||||
|
||||
# Extract installed vllm deps
|
||||
echo ">>> Extracting installed vllm dependency constraints"
|
||||
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
|
||||
"""Write out the installed vllm's dependencies as pip constraint lines.
|
||||
|
||||
Ray uses vllm[audio], so audio-extra deps are included with their extra
|
||||
markers stripped. The resolver cannot evaluate extra markers for a
|
||||
package that is not itself being resolved from an index, so we activate
|
||||
them manually here.
|
||||
"""
|
||||
import importlib.metadata
|
||||
import re
|
||||
import sys
|
||||
|
||||
out_path = sys.argv[1]
|
||||
raw_reqs = importlib.metadata.requires("vllm") or []
|
||||
|
||||
# Ray uses vllm[audio] – activate that extra.
|
||||
ACTIVE_EXTRAS = {"audio"}
|
||||
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
|
||||
|
||||
lines = []
|
||||
for r in raw_reqs:
|
||||
if ";" not in r:
|
||||
# Unconditional dep — always include.
|
||||
lines.append(r.strip())
|
||||
continue
|
||||
|
||||
req_part, _, marker_part = r.partition(";")
|
||||
marker_part = marker_part.strip()
|
||||
|
||||
extra_matches = EXTRA_RE.findall(marker_part)
|
||||
if not extra_matches:
|
||||
# Non-extra marker (python_version, etc.) — keep as-is.
|
||||
lines.append(r.strip())
|
||||
continue
|
||||
|
||||
if not ACTIVE_EXTRAS.intersection(extra_matches):
|
||||
continue # Skip inactive extras (tensorizer, bench, …).
|
||||
|
||||
# Strip the extra== conditions but keep any remaining markers
|
||||
# (e.g. python_version).
|
||||
cleaned = EXTRA_RE.sub("", marker_part)
|
||||
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
|
||||
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
|
||||
|
||||
if cleaned:
|
||||
lines.append(f"{req_part.strip()} ; {cleaned}")
|
||||
else:
|
||||
lines.append(req_part.strip())
|
||||
|
||||
with open(out_path, "w") as f:
|
||||
for line in lines:
|
||||
f.write(line + "\n")
|
||||
|
||||
print(f"Wrote {len(lines)} constraints to {out_path}")
|
||||
PYEOF
|
||||
|
||||
echo ">>> Installed vllm deps (first 20 lines):"
|
||||
head -20 "${WORK_DIR}/vllm-constraints.txt"
|
||||
|
||||
# Remove Ray's vllm pin — the installed vllm's transitive deps
|
||||
# (written above) replace it in the resolution. vllm itself cannot
|
||||
# be resolved from PyPI for in-development versions, so we test
|
||||
# whether Ray's requirements can coexist with vllm's dependency
|
||||
# constraints instead.
|
||||
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
|
||||
|
||||
# Install uv if needed
|
||||
if ! command -v uv &>/dev/null; then
|
||||
echo ">>> Installing uv"
|
||||
pip install uv -q
|
||||
fi
|
||||
|
||||
# Resolve: given vllm's constraints, can Ray compile a lock file?
|
||||
#
|
||||
# vllm's dependency constraints are the fixed side — Ray is flexible and
|
||||
# can regenerate its lock files. We pass vllm's constraints via -c so
|
||||
# the resolver treats them as non-negotiable bounds, then check whether
|
||||
# Ray's own requirements can still be satisfied within those bounds.
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo ">>> Resolving: Can Ray generate compatible lock files?"
|
||||
echo "============================================================"
|
||||
|
||||
EXTRA_INDEX_ARGS=()
|
||||
if [[ -n "${TORCH_INDEX_URL}" ]]; then
|
||||
EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
|
||||
fi
|
||||
|
||||
set +e
|
||||
uv pip compile \
|
||||
"${WORK_DIR}/requirements.txt" \
|
||||
"${WORK_DIR}/cloud-requirements.txt" \
|
||||
"${WORK_DIR}/base-test-requirements.txt" \
|
||||
"${WORK_DIR}/llm-requirements.txt" \
|
||||
"${WORK_DIR}/llm-test-requirements.txt" \
|
||||
-c "${WORK_DIR}/vllm-constraints.txt" \
|
||||
--python-version 3.12 \
|
||||
--python-platform x86_64-manylinux_2_31 \
|
||||
"${EXTRA_INDEX_ARGS[@]}" \
|
||||
--index-strategy unsafe-best-match \
|
||||
--unsafe-package setuptools \
|
||||
--unsafe-package ray \
|
||||
--no-header \
|
||||
-o "${WORK_DIR}/resolved.txt" \
|
||||
2>&1
|
||||
EXIT_CODE=$?
|
||||
set -e
|
||||
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
if [ $EXIT_CODE -eq 0 ]; then
|
||||
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
|
||||
echo ""
|
||||
echo "Key resolved versions:"
|
||||
grep -E '^(protobuf|torch|numpy|transformers)==' \
|
||||
"${WORK_DIR}/resolved.txt" | sort || true
|
||||
echo "=========================================="
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
|
||||
echo "This means a fundamental dependency conflict exists that Ray"
|
||||
echo "cannot resolve by regenerating its lock files."
|
||||
echo "See: https://github.com/vllm-project/vllm/issues/33599"
|
||||
echo "=========================================="
|
||||
|
||||
# Buildkite annotation
|
||||
if [ -f /usr/bin/buildkite-agent ]; then
|
||||
buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
|
||||
### :warning: Ray Dependency Compatibility Warning
|
||||
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
|
||||
Ray would not be able to regenerate its lock files to accommodate this vllm version.
|
||||
|
||||
Please check the **Ray Dependency Compatibility Check** step logs for details.
|
||||
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
|
||||
EOF
|
||||
fi
|
||||
|
||||
# Notify Slack if webhook is configured and PR/branch are valid.
|
||||
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
||||
PR="${BUILDKITE_PULL_REQUEST:-}"
|
||||
BRANCH="${BUILDKITE_BRANCH:-}"
|
||||
|
||||
# Skip notification if PR is invalid or branch is empty
|
||||
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
|
||||
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
|
||||
else
|
||||
echo ">>> Sending Slack notification"
|
||||
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
||||
# shellcheck disable=SC2016
|
||||
PAYLOAD=$(python3 -c '
|
||||
import json, os, sys
|
||||
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
|
||||
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
|
||||
url = os.getenv("BUILDKITE_BUILD_URL", "#")
|
||||
data = {
|
||||
"text": ":warning: Ray Dependency Compatibility Check Failed",
|
||||
"blocks": [{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": (
|
||||
"*:warning: Ray Dependency Compatibility Check Failed*\n"
|
||||
f"PR #{pr} on branch `{branch}` introduces dependencies "
|
||||
f"that cannot be resolved with Ray'\''s requirements.\n"
|
||||
f"<{url}|View Build>"
|
||||
),
|
||||
},
|
||||
}],
|
||||
}
|
||||
print(json.dumps(data))
|
||||
')
|
||||
|
||||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
|
||||
-H 'Content-type: application/json' \
|
||||
-d "$PAYLOAD")
|
||||
echo " Slack webhook response: $HTTP_CODE"
|
||||
fi
|
||||
else
|
||||
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
||||
fi
|
||||
|
||||
exit 1
|
||||
@@ -1,242 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# cherry-pick-from-milestone.sh
|
||||
# Find commits from a GitHub milestone that are missing from the current branch
|
||||
# and output them in chronological order for cherry-picking.
|
||||
#
|
||||
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") <milestone> [options]
|
||||
|
||||
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
|
||||
|
||||
Arguments:
|
||||
milestone The GitHub milestone name (e.g., v0.14.0)
|
||||
|
||||
Options:
|
||||
--dry-run Show the cherry-pick commands without executing (default)
|
||||
--execute Actually execute the cherry-picks
|
||||
--main-branch Specify the main branch name (default: main)
|
||||
--help Show this help message
|
||||
|
||||
Examples:
|
||||
$(basename "$0") v0.14.0
|
||||
$(basename "$0") v0.14.0 --dry-run
|
||||
$(basename "$0") v0.14.0 --execute
|
||||
$(basename "$0") v0.14.0 --main-branch master
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Default values
|
||||
MILESTONE=""
|
||||
DRY_RUN=true
|
||||
MAIN_BRANCH="main"
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--main-branch)
|
||||
MAIN_BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
MILESTONE="$1"
|
||||
else
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate milestone argument
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
log_error "Milestone is required"
|
||||
usage
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if gh CLI is available
|
||||
if ! command -v gh &>/dev/null; then
|
||||
log_error "GitHub CLI (gh) is not installed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if authenticated with gh
|
||||
if ! gh auth status &>/dev/null; then
|
||||
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CURRENT_BRANCH=$(git branch --show-current)
|
||||
log_info "Current branch: ${CURRENT_BRANCH}"
|
||||
log_info "Main branch: ${MAIN_BRANCH}"
|
||||
log_info "Milestone: ${MILESTONE}"
|
||||
echo ""
|
||||
|
||||
# Fetch latest from remote
|
||||
log_info "Fetching latest from remote..."
|
||||
git fetch origin "$MAIN_BRANCH" --quiet
|
||||
|
||||
# Get merged PRs from the milestone, sorted by merge date
|
||||
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
||||
|
||||
# Store PR data in a temp file
|
||||
PR_DATA=$(mktemp)
|
||||
trap 'rm -f "$PR_DATA"' EXIT
|
||||
|
||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||
--limit 1000 \
|
||||
--json number,title,mergeCommit,mergedAt \
|
||||
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
|
||||
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
|
||||
log_error "This could be due to:"
|
||||
log_error " - Milestone does not exist"
|
||||
log_error " - Network/authentication issues"
|
||||
log_error " - Invalid milestone name format"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -s "$PR_DATA" ]]; then
|
||||
log_warn "No merged PRs found for milestone '${MILESTONE}'"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
TOTAL_PRS=$(wc -l < "$PR_DATA")
|
||||
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
|
||||
echo ""
|
||||
|
||||
# Find commits that are missing from current branch
|
||||
MISSING_COMMITS=()
|
||||
MISSING_INFO=()
|
||||
|
||||
while IFS=$'\t' read -r sha pr_number title; do
|
||||
# Skip if SHA is empty or null
|
||||
if [[ -z "$sha" || "$sha" == "null" ]]; then
|
||||
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Check if this commit is already in the current branch
|
||||
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
|
||||
log_success "PR #${pr_number} already in branch: ${title:0:60}"
|
||||
else
|
||||
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
|
||||
MISSING_COMMITS+=("$sha")
|
||||
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
|
||||
fi
|
||||
done < "$PR_DATA"
|
||||
|
||||
echo ""
|
||||
|
||||
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
|
||||
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
|
||||
echo ""
|
||||
|
||||
# Output the cherry-pick commands
|
||||
echo "=========================================="
|
||||
echo "Cherry-pick commands (in chronological order):"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
for info in "${MISSING_INFO[@]}"; do
|
||||
echo "# $info"
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "# Run these commands to cherry-pick all missing commits:"
|
||||
echo "git cherry-pick ${MISSING_COMMITS[*]}"
|
||||
echo ""
|
||||
|
||||
# Or one by one
|
||||
echo "# Or cherry-pick one at a time:"
|
||||
for sha in "${MISSING_COMMITS[@]}"; do
|
||||
echo "git cherry-pick $sha"
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Execute if requested
|
||||
if [[ "$DRY_RUN" == false ]]; then
|
||||
echo "=========================================="
|
||||
log_info "Executing cherry-picks..."
|
||||
echo "=========================================="
|
||||
|
||||
for i in "${!MISSING_COMMITS[@]}"; do
|
||||
sha="${MISSING_COMMITS[$i]}"
|
||||
info="${MISSING_INFO[$i]}"
|
||||
|
||||
echo ""
|
||||
log_info "Cherry-picking: $info"
|
||||
|
||||
if git cherry-pick "$sha"; then
|
||||
log_success "Successfully cherry-picked $sha"
|
||||
else
|
||||
log_error "Failed to cherry-pick $sha"
|
||||
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
log_success "All cherry-picks completed successfully!"
|
||||
else
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
|
||||
echo "Run with --execute to perform the cherry-picks"
|
||||
echo "=========================================="
|
||||
fi
|
||||
@@ -3,20 +3,10 @@
|
||||
set -ex
|
||||
|
||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||
# This script uses DockerHub API to list and delete old tags with specified prefix
|
||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX] [REPO]
|
||||
# Example: cleanup-nightly-builds.sh "nightly-"
|
||||
# Example: cleanup-nightly-builds.sh "cu130-nightly-"
|
||||
# Example: cleanup-nightly-builds.sh "nightly-" "vllm/vllm-openai-rocm"
|
||||
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
||||
|
||||
# Get tag prefix and repo from arguments
|
||||
TAG_PREFIX="${1:-nightly-}"
|
||||
REPO="${2:-vllm/vllm-openai}"
|
||||
|
||||
echo "Cleaning up tags with prefix: $TAG_PREFIX in repository: $REPO"
|
||||
|
||||
# DockerHub API endpoint for the repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/${REPO}/tags"
|
||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||
|
||||
# Get DockerHub credentials from environment
|
||||
if [ -z "$DOCKERHUB_TOKEN" ]; then
|
||||
@@ -55,7 +45,7 @@ get_all_tags() {
|
||||
set -x
|
||||
|
||||
# Get both last_updated timestamp and tag name, separated by |
|
||||
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
|
||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||
|
||||
if [ -z "$tags" ]; then
|
||||
break
|
||||
@@ -73,7 +63,7 @@ delete_tag() {
|
||||
local tag_name="$1"
|
||||
echo "Deleting tag: $tag_name"
|
||||
|
||||
local delete_url="https://hub.docker.com/v2/repositories/${REPO}/tags/$tag_name"
|
||||
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
||||
set +x
|
||||
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
|
||||
set -x
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Generate and upload wheel indices for all wheels in the commit directory.
|
||||
# This script should run once after all wheels have been built and uploaded.
|
||||
|
||||
# ======== setup ========
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
INDICES_OUTPUT_DIR="indices"
|
||||
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
|
||||
PYTHON="${PYTHON_PROG:-python3}" # try to read from env var, otherwise use python3
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
# detect if python3.12+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# use new python from docker
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ======== generate and upload indices ========
|
||||
|
||||
# list all wheels in the commit directory
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
obj_json="objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# call script to generate indices for all existing wheels
|
||||
# these indices have relative paths that work as long as they are next to the wheel directory in s3
|
||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||
alias_args=()
|
||||
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
|
||||
fi
|
||||
|
||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
|
||||
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# copy to /nightly/ only if it is on the main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||
echo "Uploading indices to overwrite /nightly/"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
|
||||
fi
|
||||
|
||||
# detect version from any wheel in the commit directory
|
||||
# download the first wheel we find to extract version metadata
|
||||
first_wheel_key=$($PYTHON -c "import json; obj=json.load(open('$obj_json')); print(next((c['Key'] for c in obj.get('Contents', []) if c['Key'].endswith('.whl')), ''))")
|
||||
if [[ -z "$first_wheel_key" ]]; then
|
||||
echo "Error: No wheels found in $S3_COMMIT_PREFIX"
|
||||
exit 1
|
||||
fi
|
||||
first_wheel=$(basename "$first_wheel_key")
|
||||
aws s3 cp "s3://$BUCKET/${first_wheel_key}" "/tmp/${first_wheel}"
|
||||
version=$(unzip -p "/tmp/${first_wheel}" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
rm -f "/tmp/${first_wheel}"
|
||||
echo "Version in wheel: $version"
|
||||
pure_version="${version%%+*}"
|
||||
echo "Pure version (without variant): $pure_version"
|
||||
|
||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
if [[ "$version" != *"dev"* ]]; then
|
||||
echo "Re-generating indices for /$pure_version/"
|
||||
rm -rf "${INDICES_OUTPUT_DIR:?}"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||
fi
|
||||
@@ -16,18 +16,6 @@ from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
def normalize_package_name(name: str) -> str:
|
||||
"""
|
||||
Normalize package name according to PEP 503.
|
||||
https://peps.python.org/pep-0503/#normalized-names
|
||||
|
||||
Replace runs of underscores, hyphens, and periods with a single hyphen,
|
||||
and lowercase the result.
|
||||
"""
|
||||
return re.sub(r"[-_.]+", "-", name).lower()
|
||||
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
@@ -90,13 +78,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
version = version.removesuffix("." + variant)
|
||||
else:
|
||||
if "+" in version:
|
||||
version_part, suffix = version.split("+", 1)
|
||||
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
|
||||
# Git hashes and other suffixes are NOT variants
|
||||
if suffix.startswith(("rocm", "cu", "cpu")):
|
||||
variant = suffix
|
||||
version = version_part
|
||||
# Otherwise keep the full version string (variant stays None)
|
||||
version, variant = version.split("+")
|
||||
|
||||
return WheelFileInfo(
|
||||
package_name=package_name,
|
||||
@@ -112,7 +94,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
|
||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||
"""
|
||||
Generate project list HTML content linking to each project & variant subdirectory.
|
||||
Generate project list HTML content linking to each project & variant sub-directory.
|
||||
"""
|
||||
href_tags = []
|
||||
for name in sorted(subdir_names):
|
||||
@@ -168,23 +150,23 @@ def generate_index_and_metadata(
|
||||
comment (str | None): Optional comment to include in the generated HTML files.
|
||||
|
||||
First, parse all wheel files to extract metadata.
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
||||
The index for the default variant (if any) is generated in the root index directory.
|
||||
|
||||
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
||||
is purely a copy of the corresponding variant index, with only the links adjusted.
|
||||
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
||||
|
||||
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
|
||||
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
|
||||
as the default variant index, but the links are adjusted accordingly.
|
||||
|
||||
Index directory structure:
|
||||
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
||||
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
|
||||
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
|
||||
vllm/
|
||||
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
||||
metadata.json # machine-readable metadata for all wheels in this package
|
||||
cpu/ # cpu variant subdirectory
|
||||
cpu/ # cpu variant sub-directory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
@@ -194,7 +176,7 @@ def generate_index_and_metadata(
|
||||
vllm/
|
||||
index.html
|
||||
metadata.json
|
||||
cu130/ # cu130 variant subdirectory
|
||||
cu130/ # cu130 variant sub-directory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
@@ -224,26 +206,6 @@ def generate_index_and_metadata(
|
||||
print("No wheel files found, skipping index generation.")
|
||||
return
|
||||
|
||||
# For ROCm builds: inherit variant from vllm wheel
|
||||
# All ROCm wheels should share the same variant as vllm
|
||||
rocm_variant = None
|
||||
for file in parsed_files:
|
||||
if (
|
||||
file.package_name == "vllm"
|
||||
and file.variant
|
||||
and file.variant.startswith("rocm")
|
||||
):
|
||||
rocm_variant = file.variant
|
||||
print(f"Detected ROCm variant from vllm: {rocm_variant}")
|
||||
break
|
||||
|
||||
# Apply ROCm variant to all wheels without a variant
|
||||
if rocm_variant:
|
||||
for file in parsed_files:
|
||||
if file.variant is None:
|
||||
file.variant = rocm_variant
|
||||
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
|
||||
|
||||
# Group by variant
|
||||
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
||||
for file in parsed_files:
|
||||
@@ -294,8 +256,8 @@ def generate_index_and_metadata(
|
||||
|
||||
variant_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# gather all package names in this variant (normalized per PEP 503)
|
||||
packages = set(normalize_package_name(f.package_name) for f in files)
|
||||
# gather all package names in this variant
|
||||
packages = set(f.package_name for f in files)
|
||||
if variant == "default":
|
||||
# these packages should also appear in the "project list"
|
||||
# generate after all variants are processed
|
||||
@@ -307,10 +269,8 @@ def generate_index_and_metadata(
|
||||
f.write(project_list_str)
|
||||
|
||||
for package in packages:
|
||||
# filter files belonging to this package only (compare normalized names)
|
||||
package_files = [
|
||||
f for f in files if normalize_package_name(f.package_name) == package
|
||||
]
|
||||
# filter files belonging to this package only
|
||||
package_files = [f for f in files if f.package_name == package]
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
@@ -331,7 +291,6 @@ if __name__ == "__main__":
|
||||
"""
|
||||
Arguments:
|
||||
--version <version> : version string for the current build (e.g., commit hash)
|
||||
--wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
|
||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
||||
--output-dir <output_directory> : directory to store generated index files
|
||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
||||
@@ -359,12 +318,6 @@ if __name__ == "__main__":
|
||||
required=True,
|
||||
help="Directory to store generated index files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--wheel-dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Directory containing wheel files (default to be same as `version`)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alias-to-default",
|
||||
type=str,
|
||||
@@ -381,13 +334,8 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
version = args.version
|
||||
# Allow rocm/ prefix, reject other slashes and all backslashes
|
||||
if "\\" in version:
|
||||
raise ValueError("Version string must not contain backslashes.")
|
||||
if "/" in version and not version.startswith("rocm/"):
|
||||
raise ValueError(
|
||||
"Version string must not contain slashes (except for 'rocm/' prefix)."
|
||||
)
|
||||
if "/" in version or "\\" in version:
|
||||
raise ValueError("Version string must not contain slashes.")
|
||||
current_objects_path = Path(args.current_objects)
|
||||
output_dir = Path(args.output_dir)
|
||||
if not output_dir.exists():
|
||||
@@ -424,7 +372,7 @@ if __name__ == "__main__":
|
||||
|
||||
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
||||
|
||||
# keep only "official" files for a non-nightly version (specified by cli args)
|
||||
# keep only "official" files for a non-nightly version (specifed by cli args)
|
||||
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
|
||||
if PY_VERSION_RE.match(version):
|
||||
# upload-wheels.sh ensures no "dev" is in args.version
|
||||
@@ -436,25 +384,9 @@ if __name__ == "__main__":
|
||||
print("Nightly version detected, keeping all wheel files.")
|
||||
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{wheel_dir}/<wheel files>
|
||||
# s3://vllm-wheels/{version}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
#
|
||||
# For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
|
||||
# - rocm/{commit}/ (same as wheels)
|
||||
# - rocm/nightly/
|
||||
# - rocm/{version}/
|
||||
# All these are under the "rocm/" prefix, so relative paths should be
|
||||
# relative to "rocm/", not the bucket root.
|
||||
if args.wheel_dir:
|
||||
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
|
||||
wheel_dir = args.wheel_dir.strip().rstrip("/")
|
||||
elif version.startswith("rocm/"):
|
||||
# For rocm/commit, wheel_base_dir should be just the commit part
|
||||
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
|
||||
wheel_dir = version.split("/", 1)[1]
|
||||
else:
|
||||
wheel_dir = version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir
|
||||
wheel_base_dir = Path(output_dir).parent / version
|
||||
index_base_dir = Path(output_dir)
|
||||
|
||||
generate_index_and_metadata(
|
||||
|
||||
@@ -1,40 +1,25 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs tests inside the corresponding ROCm docker container.
|
||||
# It handles both single-node and multi-node test configurations.
|
||||
#
|
||||
# Multi-node detection: Instead of matching on fragile group names, we detect
|
||||
# multi-node jobs structurally by looking for the bracket command syntax
|
||||
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
|
||||
#
|
||||
###############################################################################
|
||||
# QUOTING / COMMAND PASSING
|
||||
#
|
||||
# Passing commands as positional arguments ($*) is fragile when the command
|
||||
# string itself contains double quotes, e.g.:
|
||||
#
|
||||
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
|
||||
#
|
||||
# The outer shell resolves the nested quotes *before* this script runs, so
|
||||
# the script receives mangled input it cannot fully recover.
|
||||
#
|
||||
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
|
||||
#
|
||||
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
|
||||
# bash run-amd-test.sh
|
||||
#
|
||||
# Single-quoted assignment preserves all inner double quotes verbatim.
|
||||
# The $* path is kept for backward compatibility but callers should migrate.
|
||||
###############################################################################
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
set -o pipefail
|
||||
|
||||
# Export Python path
|
||||
export PYTHONPATH=".."
|
||||
|
||||
###############################################################################
|
||||
# Helper Functions
|
||||
###############################################################################
|
||||
# Print ROCm version
|
||||
echo "--- Confirming Clean Initial State"
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# cleanup older docker images
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
@@ -43,12 +28,15 @@ cleanup_docker() {
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
@@ -56,443 +44,193 @@ cleanup_docker() {
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup_network() {
|
||||
local max_nodes=${NUM_NODES:-2}
|
||||
for node in $(seq 0 $((max_nodes - 1))); do
|
||||
if docker ps -a -q -f name="node${node}" | grep -q .; then
|
||||
docker stop "node${node}" || true
|
||||
fi
|
||||
done
|
||||
if docker network ls | grep -q docker-net; then
|
||||
docker network rm docker-net || true
|
||||
fi
|
||||
}
|
||||
|
||||
is_multi_node() {
|
||||
local cmds="$1"
|
||||
# Primary signal: NUM_NODES environment variable set by the pipeline
|
||||
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
|
||||
return 0
|
||||
fi
|
||||
# Fallback: detect the bracket syntax structurally
|
||||
# Pattern: [...] && [...] (per-node command arrays)
|
||||
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
handle_pytest_exit() {
|
||||
local exit_code=$1
|
||||
if [ "$exit_code" -eq 5 ]; then
|
||||
echo "Pytest exit code 5 (no tests collected) - treating as success."
|
||||
exit 0
|
||||
fi
|
||||
exit "$exit_code"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
# Pytest marker/keyword re-quoting
|
||||
#
|
||||
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
|
||||
# quotes around multi-word pytest -m/-k expressions get stripped:
|
||||
# pytest -v -s -m 'not cpu_test' v1/core
|
||||
# becomes:
|
||||
# pytest -v -s -m not cpu_test v1/core
|
||||
#
|
||||
# pytest then interprets "cpu_test" as a file path, not part of the marker.
|
||||
#
|
||||
# This function detects unquoted expressions after -m/-k and re-quotes them
|
||||
# by collecting tokens until a recognizable boundary is reached:
|
||||
# - test path (contains '/')
|
||||
# - test file (ends with '.py')
|
||||
# - another pytest flag (--xxx or -x single-char flags)
|
||||
# - command separator (&& || ; |)
|
||||
# - environment variable assignment (FOO=bar)
|
||||
#
|
||||
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
|
||||
# unquoted since they have no spaces and work fine.
|
||||
#
|
||||
# Already-quoted expressions (containing literal single quotes) are passed
|
||||
# through untouched to avoid double-quoting values injected by
|
||||
# apply_rocm_test_overrides.
|
||||
#
|
||||
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
|
||||
# double-quotes stripped by the calling shell (see header comment).
|
||||
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
|
||||
###############################################################################
|
||||
re_quote_pytest_markers() {
|
||||
local input="$1"
|
||||
local output=""
|
||||
local collecting=false
|
||||
local marker_buf=""
|
||||
|
||||
# Strip backslash-newline continuations, then flatten remaining newlines
|
||||
local flat="${input//$'\\\n'/ }"
|
||||
flat="${flat//$'\n'/ }"
|
||||
|
||||
# Disable globbing to prevent *.py etc. from expanding during read -ra
|
||||
local restore_glob
|
||||
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
|
||||
set -o noglob
|
||||
local -a words
|
||||
read -ra words <<< "$flat"
|
||||
eval "$restore_glob"
|
||||
|
||||
for word in "${words[@]}"; do
|
||||
if $collecting; then
|
||||
# If the token we're about to collect already contains a literal
|
||||
# single quote, the expression was already quoted upstream.
|
||||
# Flush and stop collecting.
|
||||
if [[ "$word" == *"'"* ]]; then
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
# Should not normally happen (partial buf + quote), flush raw
|
||||
output+="${marker_buf} "
|
||||
marker_buf=""
|
||||
fi
|
||||
output+="${word} "
|
||||
collecting=false
|
||||
continue
|
||||
fi
|
||||
|
||||
local is_boundary=false
|
||||
case "$word" in
|
||||
# Line-continuation artifact
|
||||
"\\")
|
||||
is_boundary=true ;;
|
||||
# Command separators
|
||||
"&&"|"||"|";"|"|")
|
||||
is_boundary=true ;;
|
||||
# Long flags (--ignore, --shard-id, etc.)
|
||||
--*)
|
||||
is_boundary=true ;;
|
||||
# Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
|
||||
# like "not" which don't start with "-". Also skip -k/-m which
|
||||
# would start a new marker (handled below).
|
||||
-[a-zA-Z])
|
||||
is_boundary=true ;;
|
||||
# Test path (contains /)
|
||||
*/*)
|
||||
is_boundary=true ;;
|
||||
# Test file (ends with .py, possibly with ::method)
|
||||
*.py|*.py::*)
|
||||
is_boundary=true ;;
|
||||
# Environment variable assignment preceding a command (FOO=bar)
|
||||
*=*)
|
||||
# Only treat as boundary if it looks like VAR=value, not
|
||||
# pytest filter expressions like num_gpus=2 inside markers
|
||||
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
|
||||
is_boundary=true
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
if $is_boundary; then
|
||||
# Strip surrounding double quotes if present (from upstream
|
||||
# single-to-double conversion); without this, wrapping below
|
||||
# would produce '"expr"' with literal double-quote characters.
|
||||
if [[ "$marker_buf" == '"'*'"' ]]; then
|
||||
marker_buf="${marker_buf#\"}"
|
||||
marker_buf="${marker_buf%\"}"
|
||||
fi
|
||||
# Flush the collected marker expression
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}' "
|
||||
else
|
||||
output+="${marker_buf} "
|
||||
fi
|
||||
collecting=false
|
||||
marker_buf=""
|
||||
# Check if this boundary word itself starts a new -m/-k
|
||||
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
# Drop stray backslash tokens silently
|
||||
elif [[ "$word" == "\\" ]]; then
|
||||
:
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
else
|
||||
# Accumulate into marker buffer
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
marker_buf+=" ${word}"
|
||||
else
|
||||
marker_buf="${word}"
|
||||
fi
|
||||
fi
|
||||
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
marker_buf=""
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
done
|
||||
|
||||
# Flush any trailing marker expression (marker at end of command)
|
||||
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||
# Strip surrounding double quotes (see mid-stream flush comment)
|
||||
if [[ "$marker_buf" == '"'*'"' ]]; then
|
||||
marker_buf="${marker_buf#\"}"
|
||||
marker_buf="${marker_buf%\"}"
|
||||
fi
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}'"
|
||||
else
|
||||
output+="${marker_buf}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "${output% }"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
# ROCm-specific pytest command rewrites
|
||||
#
|
||||
# These apply ignore flags and environment overrides for tests that are not
|
||||
# yet supported or behave differently on ROCm hardware. Kept as a single
|
||||
# function so new exclusions are easy to add in one place.
|
||||
###############################################################################
|
||||
|
||||
apply_rocm_test_overrides() {
|
||||
local cmds="$1"
|
||||
|
||||
# --- Model registry filter ---
|
||||
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
# --- LoRA: disable custom paged attention ---
|
||||
if [[ $cmds == *"pytest -v -s lora"* ]]; then
|
||||
cmds=${cmds//"pytest -v -s lora"/"pytest -v -s lora"}
|
||||
fi
|
||||
|
||||
# --- Kernel ignores ---
|
||||
if [[ $cmds == *" kernels/core"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||
--ignore=kernels/core/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/attention"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/attention/test_attention_selector.py \
|
||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/attention/test_flash_attn.py \
|
||||
--ignore=kernels/attention/test_flashinfer.py \
|
||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/attention/test_mha_attn.py \
|
||||
--ignore=kernels/attention/test_lightning_attn.py \
|
||||
--ignore=kernels/attention/test_attention.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/quantization"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/quantization/test_int8_quant.py \
|
||||
--ignore=kernels/quantization/test_machete_mm.py \
|
||||
--ignore=kernels/quantization/test_block_fp8.py \
|
||||
--ignore=kernels/quantization/test_block_int8.py \
|
||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/mamba"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/moe"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/moe/test_moe.py \
|
||||
--ignore=kernels/moe/test_cutlass_moe.py"
|
||||
fi
|
||||
|
||||
# --- Entrypoint ignores ---
|
||||
if [[ $cmds == *" entrypoints/openai "* ]]; then
|
||||
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/chat_completion/test_audio.py \
|
||||
--ignore=entrypoints/openai/completion/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/models/test_models.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
|
||||
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" entrypoints/serve"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=entrypoints/serve/lora/test_lora_adapters.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" entrypoints/llm "* ]]; then
|
||||
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
# Clean up escaped newlines from --ignore appends
|
||||
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
|
||||
|
||||
echo "$cmds"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
# Main
|
||||
###############################################################################
|
||||
|
||||
# --- GPU initialization ---
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# --- Docker housekeeping ---
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
# --- Pull test image ---
|
||||
echo "--- Resetting GPUs"
|
||||
|
||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# --- Prepare commands ---
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
# ---- Command source selection ----
|
||||
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
|
||||
# Fall back to $* for backward compatibility, but warn that inner
|
||||
# double-quotes will have been stripped by the calling shell.
|
||||
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
|
||||
commands="${VLLM_TEST_COMMANDS}"
|
||||
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
|
||||
else
|
||||
commands="$*"
|
||||
if [[ -z "$commands" ]]; then
|
||||
echo "Error: No test commands provided." >&2
|
||||
echo "Usage:" >&2
|
||||
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
|
||||
echo " Legacy: bash $0 \"commands here\"" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "Commands sourced from positional args (legacy mode)"
|
||||
echo "WARNING: Inner double-quotes in the command string may have been"
|
||||
echo " stripped by the calling shell. If you see syntax errors, switch to:"
|
||||
echo " export VLLM_TEST_COMMANDS='your commands here'"
|
||||
echo " bash $0"
|
||||
commands=$@
|
||||
echo "Commands:$commands"
|
||||
|
||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||
|
||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
echo "Raw commands: $commands"
|
||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
|
||||
|
||||
# Fix quoting before ROCm overrides (so overrides see correct structure)
|
||||
commands=$(re_quote_pytest_markers "$commands")
|
||||
echo "After re-quoting: $commands"
|
||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||
fi
|
||||
|
||||
commands=$(apply_rocm_test_overrides "$commands")
|
||||
echo "Final commands: $commands"
|
||||
#ignore certain kernels tests
|
||||
if [[ $commands == *" kernels/core"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||
--ignore=kernels/core/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/attention"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/attention/test_attention_selector.py \
|
||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/attention/test_flash_attn.py \
|
||||
--ignore=kernels/attention/test_flashinfer.py \
|
||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/attention/test_mha_attn.py \
|
||||
--ignore=kernels/attention/test_lightning_attn.py \
|
||||
--ignore=kernels/attention/test_attention.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/quantization/test_int8_quant.py \
|
||||
--ignore=kernels/quantization/test_machete_mm.py \
|
||||
--ignore=kernels/quantization/test_block_fp8.py \
|
||||
--ignore=kernels/quantization/test_block_int8.py \
|
||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/moe"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/moe/test_moe.py \
|
||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/openai tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_sleep.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/llm tests
|
||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
||||
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# Verify GPU access
|
||||
# Test that we're launching on the machine that has
|
||||
# proper access to GPUs
|
||||
render_gid=$(getent group render | cut -d: -f3)
|
||||
if [[ -z "$render_gid" ]]; then
|
||||
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- RDMA device passthrough (conditional) ---
|
||||
# If the host has RDMA devices, pass them through so tests like
|
||||
# test_moriio_connector can access ibverbs. On hosts without RDMA
|
||||
# hardware the tests will gracefully skip via _rdma_available().
|
||||
RDMA_FLAGS=""
|
||||
if [ -d /dev/infiniband ]; then
|
||||
echo "RDMA devices detected on host, enabling passthrough"
|
||||
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
|
||||
else
|
||||
echo "No RDMA devices found on host, RDMA tests will be skipped"
|
||||
fi
|
||||
|
||||
# --- Route: multi-node vs single-node ---
|
||||
if is_multi_node "$commands"; then
|
||||
echo "--- Multi-node job detected"
|
||||
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
|
||||
|
||||
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
|
||||
# BASH_REMATCH[1] = prefix (everything before first bracket)
|
||||
# BASH_REMATCH[2] = comma-separated node0 commands
|
||||
# BASH_REMATCH[3] = comma-separated node1 commands
|
||||
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
|
||||
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
|
||||
echo "PREFIX: ${prefix}"
|
||||
|
||||
export composite_command="(command rocm-smi || true)"
|
||||
saved_IFS=$IFS
|
||||
IFS=','
|
||||
read -ra node0 <<< "${BASH_REMATCH[2]}"
|
||||
read -ra node1 <<< "${BASH_REMATCH[3]}"
|
||||
IFS=$saved_IFS
|
||||
|
||||
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
|
||||
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
#wait for all processes to finish and collect exit codes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit "${st}"
|
||||
fi
|
||||
|
||||
for i in "${!node0[@]}"; do
|
||||
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
|
||||
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
|
||||
|
||||
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
|
||||
echo "COMMANDS: ${step_cmd}"
|
||||
composite_command="${composite_command} && ${step_cmd}"
|
||||
done
|
||||
|
||||
/bin/bash -c "${composite_command}"
|
||||
exit_code=$?
|
||||
cleanup_network
|
||||
handle_pytest_exit "$exit_code"
|
||||
else
|
||||
echo "Multi-node job detected but failed to parse bracket command syntax."
|
||||
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
|
||||
echo "Got: $commands"
|
||||
cleanup_network
|
||||
exit 111
|
||||
fi
|
||||
done
|
||||
else
|
||||
echo "--- Single-node job"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
$RDMA_FLAGS \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-e BUILDKITE_PARALLEL_JOB \
|
||||
-e BUILDKITE_PARALLEL_JOB_COUNT \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
-e "PYTORCH_ROCM_ARCH=" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
|
||||
exit_code=$?
|
||||
handle_pytest_exit "$exit_code"
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
fi
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -euox pipefail
|
||||
|
||||
export VLLM_CPU_KVCACHE_SPACE=1
|
||||
export VLLM_CPU_CI_ENV=1
|
||||
# Reduce sub-processes for acceleration
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
export VLLM_ENABLE_V1_MULTIPROCESSING=0
|
||||
|
||||
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
|
||||
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
|
||||
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
|
||||
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
|
||||
mkdir -p sde
|
||||
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
|
||||
|
||||
wait_for_pid_and_check_log() {
|
||||
local pid="$1"
|
||||
local log_file="$2"
|
||||
local exit_status
|
||||
|
||||
if [ -z "$pid" ] || [ -z "$log_file" ]; then
|
||||
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "Waiting for process $pid to finish..."
|
||||
|
||||
# Use the 'wait' command to pause the script until the specific PID exits.
|
||||
# The 'wait' command's own exit status will be that of the waited-for process.
|
||||
if wait "$pid"; then
|
||||
exit_status=$?
|
||||
echo "Process $pid finished with exit status $exit_status (Success)."
|
||||
else
|
||||
exit_status=$?
|
||||
echo "Process $pid finished with exit status $exit_status (Failure)."
|
||||
fi
|
||||
|
||||
if [ "$exit_status" -ne 0 ]; then
|
||||
echo "Process exited with a non-zero status."
|
||||
echo "--- Last few lines of log file: $log_file ---"
|
||||
tail -n 50 "$log_file"
|
||||
echo "---------------------------------------------"
|
||||
return 1 # Indicate failure based on exit status
|
||||
fi
|
||||
|
||||
echo "No errors detected in log file and process exited successfully."
|
||||
return 0
|
||||
}
|
||||
|
||||
# Test Sky Lake (AVX512F)
|
||||
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
|
||||
PID_TEST_0=$!
|
||||
|
||||
# Test Cascade Lake (AVX512F + VNNI)
|
||||
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
|
||||
PID_TEST_1=$!
|
||||
|
||||
# Test Cooper Lake (AVX512F + VNNI + BF16)
|
||||
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
|
||||
PID_TEST_2=$!
|
||||
|
||||
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
|
||||
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
|
||||
wait_for_pid_and_check_log $PID_TEST_2 test_2.log
|
||||
@@ -1,44 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -euox pipefail
|
||||
export VLLM_CPU_CI_ENV=0
|
||||
export VLLM_CPU_KVCACHE_SPACE=1 # avoid OOM
|
||||
|
||||
echo "--- PP+TP"
|
||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 --max-model-len=4096 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--result-dir ./test_results \
|
||||
--result-filename tp_pp.json \
|
||||
--save-result \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||
failed_req=$(jq '.failed' ./test_results/tp_pp.json)
|
||||
if [ "$failed_req" -ne 0 ]; then
|
||||
echo "Some requests were failed!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
#echo "--- DP+TP"
|
||||
#vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
|
||||
#server_pid=$!
|
||||
#timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||
#vllm bench serve \
|
||||
# --backend vllm \
|
||||
# --dataset-name random \
|
||||
# --model meta-llama/Llama-3.2-3B-Instruct \
|
||||
# --num-prompts 20 \
|
||||
# --result-dir ./test_results \
|
||||
# --result-filename dp_pp.json \
|
||||
# --save-result \
|
||||
# --endpoint /v1/completions
|
||||
#kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||
#failed_req=$(jq '.failed' ./test_results/dp_pp.json)
|
||||
#if [ "$failed_req" -ne 0 ]; then
|
||||
# echo "Some requests were failed!"
|
||||
# exit 1
|
||||
#fi
|
||||
@@ -5,8 +5,8 @@
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-0-31}
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-31}
|
||||
CORE_RANGE=${CORE_RANGE:-0-16}
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=16
|
||||
|
||||
@@ -34,18 +34,13 @@ function cpu_tests() {
|
||||
# offline inference
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run model tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
|
||||
|
||||
# Run quantized model tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
|
||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -xve
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
@@ -43,7 +43,7 @@ function cpu_tests() {
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
|
||||
@@ -2,19 +2,118 @@
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -euox pipefail
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
# used for TP/PP E2E test
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
IMAGE_NAME="cpu-test-$NUMA_NODE"
|
||||
TIMEOUT_VAL=$1
|
||||
TEST_COMMAND=$2
|
||||
|
||||
# building the docker image
|
||||
echo "--- :docker: Building Docker image"
|
||||
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
|
||||
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# list packages
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
# Note: disable until supports V1
|
||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -x -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
||||
|
||||
# Run AWQ/GPTQ test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_cpu_wna16.py"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/lora/test_qwen2vl.py"
|
||||
|
||||
# online serving: tp+pp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
|
||||
# online serving: tp+dp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
export -f cpu_tests
|
||||
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||
|
||||
@@ -25,5 +25,5 @@ remove_docker_container
|
||||
|
||||
# Run the image and test offline inference
|
||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
|
||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
||||
'
|
||||
|
||||
@@ -1,49 +1,21 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script builds the HPU docker image and runs the offline inference inside the container.
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
#
|
||||
# vllm-gaudi compatibility pinning:
|
||||
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
|
||||
# When upstream vllm changes its API, the plugin may break before it has been updated.
|
||||
# To handle this, the vllm-gaudi repository maintains a file:
|
||||
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
|
||||
# The first line of that file controls what version of vllm is used inside the Docker image:
|
||||
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
|
||||
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
|
||||
# the test to a known-compatible baseline.
|
||||
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
|
||||
set -exuo pipefail
|
||||
|
||||
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
|
||||
VLLM_COMMUNITY_COMMIT=$(curl -s \
|
||||
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
|
||||
| head -1 | tr -d '\n')
|
||||
|
||||
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
|
||||
|
||||
# Try building the docker image
|
||||
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
|
||||
cat <<EOF | docker build -t "${image_name}" -f - .
|
||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
||||
FROM gaudi-base-image:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
|
||||
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
|
||||
# the current checkout (the Buildkite CI commit) is used unchanged.
|
||||
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
|
||||
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
|
||||
fi
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
ENV no_proxy=localhost,127.0.0.1
|
||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||
|
||||
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
|
||||
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
|
||||
RUN VLLM_TARGET_DEVICE=empty pip install .
|
||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
# install development dependencies (for testing)
|
||||
@@ -64,20 +36,15 @@ EOF
|
||||
# functions, while other platforms only need one remove_docker_container
|
||||
# function.
|
||||
EXITCODE=1
|
||||
remove_docker_containers() { docker rm -f "${container_name}" || true; }
|
||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||
remove_docker_containers
|
||||
|
||||
echo "Running HPU plugin v1 test"
|
||||
docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
||||
-e HABANA_VISIBLE_DEVICES=all \
|
||||
-e VLLM_SKIP_WARMUP=true \
|
||||
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
|
||||
-e PT_HPU_LAZY_MODE=1 \
|
||||
"${image_name}" \
|
||||
/bin/bash -c '
|
||||
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
'
|
||||
hpu-plugin-v1-test-env \
|
||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
||||
|
||||
EXITCODE=$?
|
||||
if [ $EXITCODE -eq 0 ]; then
|
||||
|
||||
@@ -1,292 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs tests inside the Intel XPU docker container.
|
||||
# It mirrors the structure of run-amd-test.sh while keeping Intel-specific
|
||||
# container setup and allowing commands to be sourced from YAML or env.
|
||||
#
|
||||
# Command sources (in priority order):
|
||||
# 1) VLLM_TEST_COMMANDS env var (preferred, preserves quoting)
|
||||
# 2) Positional args (legacy)
|
||||
# 3) One or more YAML files with a commands list (test-area style)
|
||||
###############################################################################
|
||||
set -o pipefail
|
||||
|
||||
DRY_RUN=${DRY_RUN:-0}
|
||||
if [[ "${1:-}" == "--dry-run" ]]; then
|
||||
DRY_RUN=1
|
||||
shift
|
||||
fi
|
||||
|
||||
# Export Python path
|
||||
export PYTHONPATH=".."
|
||||
|
||||
###############################################################################
|
||||
# Helper Functions
|
||||
###############################################################################
|
||||
|
||||
cleanup_docker() {
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
docker image prune -f
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
|
||||
re_quote_pytest_markers() {
|
||||
local input="$1"
|
||||
local output=""
|
||||
local collecting=false
|
||||
local marker_buf=""
|
||||
|
||||
local flat="${input//$'\n'/ }"
|
||||
local restore_glob
|
||||
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
|
||||
set -o noglob
|
||||
local -a words
|
||||
read -ra words <<< "$flat"
|
||||
eval "$restore_glob"
|
||||
|
||||
for word in "${words[@]}"; do
|
||||
if $collecting; then
|
||||
if [[ "$word" == *"'"* ]]; then
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
output+="${marker_buf} "
|
||||
marker_buf=""
|
||||
fi
|
||||
output+="${word} "
|
||||
collecting=false
|
||||
continue
|
||||
fi
|
||||
|
||||
local is_boundary=false
|
||||
case "$word" in
|
||||
"&&"|"||"|";"|"|")
|
||||
is_boundary=true ;;
|
||||
--*)
|
||||
is_boundary=true ;;
|
||||
-[a-zA-Z])
|
||||
is_boundary=true ;;
|
||||
*/*)
|
||||
is_boundary=true ;;
|
||||
*.py|*.py::*)
|
||||
is_boundary=true ;;
|
||||
*=*)
|
||||
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
|
||||
is_boundary=true
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
if $is_boundary; then
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}' "
|
||||
else
|
||||
output+="${marker_buf} "
|
||||
fi
|
||||
collecting=false
|
||||
marker_buf=""
|
||||
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
else
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
marker_buf+=" ${word}"
|
||||
else
|
||||
marker_buf="${word}"
|
||||
fi
|
||||
fi
|
||||
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
marker_buf=""
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
done
|
||||
|
||||
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}'"
|
||||
else
|
||||
output+="${marker_buf}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "${output% }"
|
||||
}
|
||||
|
||||
apply_intel_test_overrides() {
|
||||
local cmds="$1"
|
||||
# Placeholder for Intel-specific exclusions/overrides.
|
||||
echo "$cmds"
|
||||
}
|
||||
|
||||
is_yaml_file() {
|
||||
local p="$1"
|
||||
[[ -f "$p" && "$p" == *.yaml ]]
|
||||
}
|
||||
|
||||
extract_yaml_commands() {
|
||||
local yaml_path="$1"
|
||||
awk '
|
||||
$1 == "commands:" { in_cmds=1; next }
|
||||
in_cmds && $0 ~ /^[[:space:]]*-[[:space:]]/ {
|
||||
sub(/^[[:space:]]*-[[:space:]]/, "");
|
||||
print;
|
||||
next
|
||||
}
|
||||
in_cmds && $0 ~ /^[^[:space:]]/ { exit }
|
||||
' "$yaml_path"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
# Main
|
||||
###############################################################################
|
||||
|
||||
default_image_name="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-xpu"
|
||||
#default_image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-xpu"
|
||||
image_name="${IMAGE_TAG_XPU:-${default_image_name}}"
|
||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
# ---- Command source selection ----
|
||||
commands=""
|
||||
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
|
||||
commands="${VLLM_TEST_COMMANDS}"
|
||||
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
|
||||
elif [[ $# -gt 0 ]]; then
|
||||
all_yaml=true
|
||||
for arg in "$@"; do
|
||||
if ! is_yaml_file "$arg"; then
|
||||
all_yaml=false
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if $all_yaml; then
|
||||
for yaml in "$@"; do
|
||||
mapfile -t COMMANDS < <(extract_yaml_commands "$yaml")
|
||||
if [[ ${#COMMANDS[@]} -eq 0 ]]; then
|
||||
echo "Error: No commands found in ${yaml}" >&2
|
||||
exit 1
|
||||
fi
|
||||
for cmd in "${COMMANDS[@]}"; do
|
||||
if [[ -z "$commands" ]]; then
|
||||
commands="${cmd}"
|
||||
else
|
||||
commands+=" && ${cmd}"
|
||||
fi
|
||||
done
|
||||
done
|
||||
echo "Commands sourced from YAML files: $*"
|
||||
else
|
||||
commands="$*"
|
||||
echo "Commands sourced from positional args (legacy mode)"
|
||||
fi
|
||||
else
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
DEFAULT_YAML="${SCRIPT_DIR}/intel-test.yaml"
|
||||
if [[ ! -f "${DEFAULT_YAML}" ]]; then
|
||||
echo "Error: YAML file not found: ${DEFAULT_YAML}" >&2
|
||||
exit 1
|
||||
fi
|
||||
mapfile -t COMMANDS < <(extract_yaml_commands "${DEFAULT_YAML}")
|
||||
if [[ ${#COMMANDS[@]} -eq 0 ]]; then
|
||||
echo "Error: No commands found in ${DEFAULT_YAML}" >&2
|
||||
exit 1
|
||||
fi
|
||||
for cmd in "${COMMANDS[@]}"; do
|
||||
if [[ -z "$commands" ]]; then
|
||||
commands="${cmd}"
|
||||
else
|
||||
commands+=" && ${cmd}"
|
||||
fi
|
||||
done
|
||||
echo "Commands sourced from default YAML: ${DEFAULT_YAML}"
|
||||
fi
|
||||
|
||||
if [[ -z "$commands" ]]; then
|
||||
echo "Error: No test commands provided." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Raw commands: $commands"
|
||||
commands=$(re_quote_pytest_markers "$commands")
|
||||
echo "After re-quoting: $commands"
|
||||
commands=$(apply_intel_test_overrides "$commands")
|
||||
echo "Final commands: $commands"
|
||||
|
||||
# Dry-run mode prints final commands and exits before Docker.
|
||||
if [[ "$DRY_RUN" == "1" ]]; then
|
||||
echo "DRY_RUN=1 set, skipping Docker execution."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Docker housekeeping ---
|
||||
cleanup_docker
|
||||
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# --- Build or pull test image ---
|
||||
IMAGE="${IMAGE_TAG_XPU:-${image_name}}"
|
||||
|
||||
echo "Using image: ${IMAGE}"
|
||||
|
||||
if docker image inspect "${IMAGE}" >/dev/null 2>&1; then
|
||||
echo "Image already exists locally, skipping pull"
|
||||
else
|
||||
echo "Image not found locally, waiting for lock..."
|
||||
|
||||
flock /tmp/docker-pull.lock bash -c "
|
||||
if docker image inspect '${IMAGE}' >/dev/null 2>&1; then
|
||||
echo 'Image already pulled by another runner'
|
||||
else
|
||||
echo 'Pulling image...'
|
||||
timeout 900 docker pull '${IMAGE}'
|
||||
fi
|
||||
"
|
||||
|
||||
echo "Pull step completed"
|
||||
fi
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || true
|
||||
docker image rm -f "${image_name}" || true
|
||||
docker system prune -f || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# --- Single-node job ---
|
||||
|
||||
if [[ -z "${ZE_AFFINITY_MASK:-}" ]]; then
|
||||
echo "Warning: ZE_AFFINITY_MASK is not set. Proceeding without device affinity." >&2
|
||||
fi
|
||||
|
||||
docker run \
|
||||
--device /dev/dri:/dev/dri \
|
||||
--net=host \
|
||||
--ipc=host \
|
||||
--privileged \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
--entrypoint="" \
|
||||
-e "HF_TOKEN=${HF_TOKEN:-}" \
|
||||
-e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-}" \
|
||||
-e "CMDS=${commands}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
bash -c 'set -e; echo "ZE_AFFINITY_MASK is ${ZE_AFFINITY_MASK:-}"; eval "$CMDS"'
|
||||
@@ -41,7 +41,6 @@ get_config() {
|
||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
||||
exit 1
|
||||
fi
|
||||
# shellcheck source=/dev/null
|
||||
source "${TEST_RUN_CONFIG_FILE}"
|
||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
||||
return 0
|
||||
@@ -49,8 +48,9 @@ get_config() {
|
||||
|
||||
# get test running configuration.
|
||||
fetch_vllm_test_cfg
|
||||
get_config
|
||||
# Check if the function call was successful. If not, exit the script.
|
||||
if ! get_config; then
|
||||
if [ $? -ne 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
||||
echo "agent_idx: ${agent_idx}"
|
||||
builder_name="cachebuilder${agent_idx}"
|
||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
||||
mkdir -p "${builder_cache_dir}"
|
||||
mkdir -p ${builder_cache_dir}
|
||||
|
||||
# Try building the docker image
|
||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
|
||||
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
|
||||
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
|
||||
--progress=plain --load -t "${image_name}" -f - .
|
||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
||||
--progress=plain --load -t ${image_name} -f - .
|
||||
FROM ${BASE_IMAGE_NAME}
|
||||
|
||||
# Define environments
|
||||
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
|
||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
||||
|
||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
|
||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
||||
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
||||
# returns one argument per line: --device, /dev/davinciX, ...
|
||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
||||
parse_and_gen_devices() {
|
||||
local input="$1"
|
||||
local index cards_num
|
||||
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
local devices=""
|
||||
local i=0
|
||||
while (( i < cards_num )); do
|
||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
||||
printf '%s\n' "--device"
|
||||
printf '%s\n' "/dev/davinci${dev_idx}"
|
||||
devices="$devices --device /dev/davinci${dev_idx}"
|
||||
((i++))
|
||||
done
|
||||
|
||||
# trim leading space
|
||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
||||
# Output devices: assigned to the caller variable
|
||||
printf '%s' "$devices"
|
||||
}
|
||||
|
||||
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
||||
|
||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
||||
# the hardware plugin vllm-ascend.
|
||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
||||
mkdir -p "${model_cache_dir}"
|
||||
mkdir -p ${model_cache_dir}
|
||||
docker run \
|
||||
"${device_args[@]}" \
|
||||
${devices} \
|
||||
--device /dev/davinci_manager \
|
||||
--device /dev/devmm_svm \
|
||||
--device /dev/hisi_hdc \
|
||||
@@ -177,7 +182,7 @@ docker run \
|
||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||
-v "${model_cache_dir}":/root/.cache/modelscope \
|
||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
||||
--entrypoint="" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
|
||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
|
||||
@@ -127,7 +127,7 @@ run_and_track_test() {
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 1 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 2 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 3 "test_lora.py" \
|
||||
|
||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
|
||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
@@ -33,23 +33,20 @@ docker run \
|
||||
bash -c '
|
||||
set -e
|
||||
echo $ZE_AFFINITY_MASK
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
python3 examples/basic/offline_inference/generate.py --model OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc --block-size 64 --enforce-eager --max-model-len 8192
|
||||
pip install tblib==3.1.0
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
cd tests
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||
pytest -v -s v1/core
|
||||
pytest -v -s v1/engine
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
@@ -1,62 +0,0 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Push ROCm nightly base image and nightly image from ECR
|
||||
# to Docker Hub as vllm/vllm-openai-rocm:base-nightly and vllm/vllm-openai-rocm:nightly
|
||||
# and vllm/vllm-openai-rocm:base-nightly-<commit> and vllm/vllm-openai-rocm:nightly-<commit>.
|
||||
# Run when NIGHTLY=1 after build-rocm-release-image has pushed to ECR.
|
||||
#
|
||||
# Local testing (no push to Docker Hub):
|
||||
# BUILDKITE_COMMIT=<commit-with-rocm-image-in-ecr> DRY_RUN=1 bash .buildkite/scripts/push-nightly-builds-rocm.sh
|
||||
# Requires: AWS CLI configured (for ECR public login), Docker. For full run: Docker Hub login.
|
||||
|
||||
set -ex
|
||||
|
||||
# Use BUILDKITE_COMMIT from env (required; set to a commit that has ROCm image in ECR for local test)
|
||||
BUILDKITE_COMMIT="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA that has the ROCm image in ECR (e.g. from a previous release pipeline run)}"
|
||||
DRY_RUN="${DRY_RUN:-0}"
|
||||
|
||||
# Get the base image ECR tag (set by build-rocm-release-image pipeline step)
|
||||
BASE_ORIG_TAG="$(buildkite-agent meta-data get rocm-base-ecr-tag 2>/dev/null || echo "")"
|
||||
if [ -z "$BASE_ORIG_TAG" ]; then
|
||||
echo "WARNING: rocm-base-ecr-tag metadata not found, falling back to commit-based tag"
|
||||
BASE_ORIG_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"
|
||||
fi
|
||||
|
||||
ORIG_TAG="${BUILDKITE_COMMIT}-rocm"
|
||||
BASE_TAG_NAME="base-nightly"
|
||||
TAG_NAME="nightly"
|
||||
BASE_TAG_NAME_COMMIT="base-nightly-${BUILDKITE_COMMIT}"
|
||||
TAG_NAME_COMMIT="nightly-${BUILDKITE_COMMIT}"
|
||||
|
||||
echo "Pushing ROCm base image from ECR: $BASE_ORIG_TAG"
|
||||
echo "Pushing ROCm release image from ECR tag: $ORIG_TAG to Docker Hub as $TAG_NAME and $TAG_NAME_COMMIT"
|
||||
[[ "$DRY_RUN" == "1" ]] && echo "[DRY_RUN] Skipping push to Docker Hub"
|
||||
|
||||
# Login to ECR and pull the image built by build-rocm-release-image
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
docker pull "$BASE_ORIG_TAG"
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG"
|
||||
|
||||
# Tag for Docker Hub (base-nightly and base-nightly-<commit>, nightly and nightly-<commit>)
|
||||
docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
|
||||
docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME"
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"
|
||||
|
||||
if [[ "$DRY_RUN" == "1" ]]; then
|
||||
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
|
||||
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
|
||||
echo "[DRY_RUN] Local tags created. Exiting without push."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Push to Docker Hub (docker-login plugin runs before this step in CI)
|
||||
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
|
||||
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
|
||||
docker push vllm/vllm-openai-rocm:"$TAG_NAME"
|
||||
docker push vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"
|
||||
|
||||
echo "Pushed vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
|
||||
echo "Pushed vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
|
||||
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
|
||||
# otherwise they will be cleaned up together with the main "nightly" tags.
|
||||
|
||||
TAG_VARIANT="$1"
|
||||
if [ -n "$TAG_VARIANT" ]; then
|
||||
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
|
||||
TAG_NAME="$TAG_VARIANT-nightly"
|
||||
else
|
||||
ORIG_TAG_SUFFIX=""
|
||||
TAG_NAME="nightly"
|
||||
fi
|
||||
|
||||
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
|
||||
|
||||
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
|
||||
|
||||
# pull original arch-dependent images from AWS ECR Public
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
|
||||
# tag arch-dependent images
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||
# push arch-dependent images to DockerHub
|
||||
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||
# push arch-independent manifest to DockerHub
|
||||
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:"$TAG_NAME"
|
||||
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
|
||||
@@ -2,17 +2,6 @@
|
||||
|
||||
set -euox pipefail
|
||||
|
||||
# To detect ROCm
|
||||
# Check multiple indicators:
|
||||
if [ -e /dev/kfd ] || \
|
||||
[ -d /opt/rocm ] || \
|
||||
command -v rocm-smi &> /dev/null || \
|
||||
[ -n "${ROCM_HOME:-}" ]; then
|
||||
IS_ROCM=1
|
||||
else
|
||||
IS_ROCM=0
|
||||
fi
|
||||
|
||||
if [[ $# -lt 4 ]]; then
|
||||
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
exit 1
|
||||
@@ -37,18 +26,13 @@ for command in "${COMMANDS[@]}"; do
|
||||
echo "$command"
|
||||
done
|
||||
|
||||
|
||||
start_network() {
|
||||
docker network create --subnet=192.168.10.0/24 docker-net
|
||||
}
|
||||
|
||||
start_nodes() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
if [ "$IS_ROCM" -eq 1 ]; then
|
||||
GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
|
||||
else
|
||||
GPU_DEVICES='--gpus "device='
|
||||
fi
|
||||
GPU_DEVICES='"device='
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
@@ -56,9 +40,7 @@ start_nodes() {
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
if [ "$IS_ROCM" -eq 0 ]; then
|
||||
GPU_DEVICES+='"'
|
||||
fi
|
||||
GPU_DEVICES+='"'
|
||||
|
||||
# start the container in detached mode
|
||||
# things to note:
|
||||
@@ -67,7 +49,7 @@ start_nodes() {
|
||||
# 3. map the huggingface cache directory to the container
|
||||
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||
# starting from 192.168.10.11)
|
||||
docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
|
||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
|
||||
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
|
||||
/bin/bash -c "tail -f /dev/null"
|
||||
|
||||
64
.buildkite/scripts/run-prime-rl-test.sh
Executable file
64
.buildkite/scripts/run-prime-rl-test.sh
Executable file
@@ -0,0 +1,64 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Setup script for Prime-RL integration tests
|
||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
||||
|
||||
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
|
||||
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Setting up Prime-RL integration test environment..."
|
||||
|
||||
# Clean up any existing Prime-RL directory
|
||||
if [ -d "${PRIME_RL_DIR}" ]; then
|
||||
echo "Removing existing Prime-RL directory..."
|
||||
rm -rf "${PRIME_RL_DIR}"
|
||||
fi
|
||||
|
||||
# Install UV if not available
|
||||
if ! command -v uv &> /dev/null; then
|
||||
echo "Installing UV package manager..."
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
source $HOME/.local/bin/env
|
||||
fi
|
||||
|
||||
# Clone Prime-RL repository at specific branch for reproducible tests
|
||||
PRIME_RL_BRANCH="integ-vllm-main"
|
||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
|
||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
|
||||
cd "${PRIME_RL_DIR}"
|
||||
|
||||
echo "Setting up UV project environment..."
|
||||
export UV_PROJECT_ENVIRONMENT=/usr/local
|
||||
ln -s /usr/bin/python3 /usr/local/bin/python
|
||||
|
||||
# Remove vllm pin from pyproject.toml
|
||||
echo "Removing vllm pin from pyproject.toml..."
|
||||
sed -i '/vllm==/d' pyproject.toml
|
||||
|
||||
# Sync Prime-RL dependencies
|
||||
echo "Installing Prime-RL dependencies..."
|
||||
uv sync --inexact && uv sync --inexact --all-extras
|
||||
|
||||
# Verify installation
|
||||
echo "Verifying installations..."
|
||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
|
||||
|
||||
echo "Prime-RL integration test environment setup complete!"
|
||||
|
||||
echo "Running Prime-RL integration tests..."
|
||||
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
|
||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
|
||||
|
||||
echo "Prime-RL integration tests completed!"
|
||||
@@ -43,6 +43,7 @@ trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
@@ -51,14 +52,13 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--enable-eplb \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--all2all-backend "$BACK" \
|
||||
--port "$PORT" &
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server "$PORT"
|
||||
wait_for_server $PORT
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
# Nightly e2e test for prefetch offloading with a MoE model.
|
||||
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
|
||||
# and validates GSM8K accuracy matches baseline (no offloading).
|
||||
#
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
#
|
||||
# Environment variables:
|
||||
# ATTENTION_BACKEND - attention backend to use (e.g., FLASH_ATTN,
|
||||
# ROCM_ATTN, FLASHINFER). If unset, uses vllm default.
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8030}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout 600 bash -c '
|
||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||
sleep 1
|
||||
done'
|
||||
}
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-Lite"
|
||||
|
||||
# ── Build optional vllm serve flags ─────────────────────────────────────
|
||||
|
||||
EXTRA_ARGS=()
|
||||
if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
|
||||
echo "Using attention backend: ${ATTENTION_BACKEND}"
|
||||
EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" 2>/dev/null || true
|
||||
for _ in {1..20}; do
|
||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
vllm serve "$MODEL" \
|
||||
--max-model-len 2048 \
|
||||
--offload-group-size 8 \
|
||||
--offload-num-in-group 2 \
|
||||
--offload-prefetch-step 1 \
|
||||
--offload-params w13_weight w2_weight \
|
||||
--port "$PORT" \
|
||||
${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
|
||||
SERVER_PID=$!
|
||||
wait_for_server "$PORT"
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
|
||||
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
|
||||
PY
|
||||
|
||||
cleanup
|
||||
SERVER_PID=
|
||||
@@ -44,23 +44,23 @@ trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--enable-eplb \
|
||||
--all2all-backend "$BACK" \
|
||||
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
||||
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
|
||||
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
|
||||
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
||||
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
||||
--enable-expert-parallel \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--port "$PORT" &
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server "$PORT"
|
||||
wait_for_server $PORT
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
|
||||
@@ -18,18 +18,15 @@ wait_for_server() {
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS and platform-specific args based on platform
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
|
||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
PLATFORM_ARGS=()
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
@@ -46,25 +43,24 @@ trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 4 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--all2all-backend "$BACK" \
|
||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
"${PLATFORM_ARGS[@]}" \
|
||||
--port "$PORT" &
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server "$PORT"
|
||||
wait_for_server $PORT
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
|
||||
@@ -1,248 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
|
||||
# evaluation against a local vLLM server.
|
||||
#
|
||||
# Usage:
|
||||
# # Run with defaults (gpt-oss-20b, multi_turn)
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||
#
|
||||
# # Run with gpt-oss-120b and multiple test categories
|
||||
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
|
||||
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||
#
|
||||
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
|
||||
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
|
||||
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||
#
|
||||
# Environment variables (all optional, with defaults):
|
||||
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
|
||||
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
|
||||
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
|
||||
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
|
||||
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
|
||||
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
|
||||
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
|
||||
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
|
||||
# BFCL_PORT - Server port (default: 8000)
|
||||
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
|
||||
# BFCL_EXTRA_ARGS - Additional vLLM server args
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ---- Configuration ----
|
||||
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
|
||||
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
|
||||
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
|
||||
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
|
||||
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
|
||||
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
|
||||
TP_SIZE="${BFCL_TP_SIZE:-1}"
|
||||
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
|
||||
PORT="${BFCL_PORT:-8000}"
|
||||
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
|
||||
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
|
||||
|
||||
# Set up output directory
|
||||
if [ -n "$OUTPUT_DIR" ]; then
|
||||
mkdir -p "$OUTPUT_DIR"
|
||||
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
|
||||
fi
|
||||
|
||||
echo "============================================"
|
||||
echo "BFCL Tool Call Correctness Evaluation"
|
||||
echo "============================================"
|
||||
echo "Model: $MODEL"
|
||||
echo "Tool parser: $TOOL_CALL_PARSER"
|
||||
echo "API type: $API_TYPE"
|
||||
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
|
||||
echo "Test category: $TEST_CATEGORY"
|
||||
echo "TP size: $TP_SIZE"
|
||||
echo "Max model len: $MAX_MODEL_LEN"
|
||||
echo "Port: $PORT"
|
||||
echo "Num threads: $NUM_THREADS"
|
||||
echo "============================================"
|
||||
|
||||
# ---- Install bfcl-eval if missing ----
|
||||
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
|
||||
echo "Installing bfcl-eval..."
|
||||
pip install "bfcl-eval>=2025.10.20.1,<2026"
|
||||
fi
|
||||
|
||||
# ---- Cleanup handler ----
|
||||
SERVER_PID=""
|
||||
cleanup() {
|
||||
if [ -n "$SERVER_PID" ]; then
|
||||
echo "Stopping vLLM server (pid=$SERVER_PID)..."
|
||||
kill "$SERVER_PID" 2>/dev/null || true
|
||||
wait "$SERVER_PID" 2>/dev/null || true
|
||||
fi
|
||||
# Remove BFCL lock files (created by filelock for thread-safe writes)
|
||||
rm -rf .file_locks/
|
||||
if [ -n "${OUTPUT_DIR:-}" ]; then
|
||||
rm -rf "$OUTPUT_DIR/.file_locks/"
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# ---- Start vLLM server ----
|
||||
echo "Starting vLLM server..."
|
||||
|
||||
SERVE_ARGS=(
|
||||
"$MODEL"
|
||||
--port "$PORT"
|
||||
--enable-auto-tool-choice
|
||||
--tool-call-parser "$TOOL_CALL_PARSER"
|
||||
--tensor-parallel-size "$TP_SIZE"
|
||||
--max-model-len "$MAX_MODEL_LEN"
|
||||
--enforce-eager
|
||||
--no-enable-prefix-caching
|
||||
)
|
||||
|
||||
# Append reasoning parser if specified
|
||||
if [ -n "$REASONING_PARSER" ]; then
|
||||
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
|
||||
fi
|
||||
|
||||
# Append any extra args
|
||||
if [ -n "$EXTRA_ARGS" ]; then
|
||||
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
|
||||
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
|
||||
fi
|
||||
|
||||
echo "Command: vllm serve ${SERVE_ARGS[*]}"
|
||||
vllm serve "${SERVE_ARGS[@]}" &
|
||||
SERVER_PID=$!
|
||||
|
||||
# ---- Wait for server to be ready ----
|
||||
echo "Waiting for vLLM server to start (timeout: 600s)..."
|
||||
SECONDS_WAITED=0
|
||||
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
|
||||
if [ $SECONDS_WAITED -ge 600 ]; then
|
||||
echo ""
|
||||
echo "ERROR: vLLM server failed to start within 600s"
|
||||
exit 1
|
||||
fi
|
||||
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
|
||||
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
|
||||
fi
|
||||
sleep 2
|
||||
SECONDS_WAITED=$((SECONDS_WAITED + 2))
|
||||
done
|
||||
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
|
||||
|
||||
# ---- Run BFCL evaluation ----
|
||||
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
|
||||
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
|
||||
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
|
||||
# against our local vLLM server.
|
||||
bfcl_exit_code=0
|
||||
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
|
||||
import os
|
||||
import sys
|
||||
|
||||
model = sys.argv[1]
|
||||
test_category = sys.argv[2]
|
||||
num_threads = int(sys.argv[3])
|
||||
port = sys.argv[4]
|
||||
api_type = sys.argv[5]
|
||||
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
|
||||
|
||||
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
|
||||
os.environ["OPENAI_API_KEY"] = "dummy"
|
||||
os.environ["BFCL_PROJECT_ROOT"] = output_dir
|
||||
|
||||
import bfcl_eval.constants.model_config as bfcl_model_config
|
||||
from bfcl_eval.constants.model_config import ModelConfig
|
||||
from bfcl_eval.model_handler.api_inference.openai_completion import (
|
||||
OpenAICompletionsHandler,
|
||||
)
|
||||
from bfcl_eval.model_handler.api_inference.openai_response import (
|
||||
OpenAIResponsesHandler,
|
||||
)
|
||||
|
||||
if api_type == "responses":
|
||||
handler = OpenAIResponsesHandler
|
||||
else:
|
||||
handler = OpenAICompletionsHandler
|
||||
|
||||
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
|
||||
model_name=model,
|
||||
display_name=f"{model} (FC) (vLLM)",
|
||||
url=f"https://huggingface.co/{model}",
|
||||
org="",
|
||||
license="apache-2.0",
|
||||
model_handler=handler,
|
||||
input_price=None,
|
||||
output_price=None,
|
||||
is_fc_model=True,
|
||||
underscore_to_dot=True,
|
||||
)
|
||||
|
||||
from bfcl_eval.__main__ import evaluate, generate
|
||||
import inspect
|
||||
import typer
|
||||
|
||||
|
||||
def _get_default_kwargs(function):
|
||||
kwargs = {}
|
||||
for k, v in inspect.signature(function).parameters.items():
|
||||
if v.default is not inspect.Parameter.empty:
|
||||
default = v.default
|
||||
if isinstance(default, typer.models.OptionInfo):
|
||||
default = default.default
|
||||
kwargs[k] = default
|
||||
return kwargs
|
||||
|
||||
|
||||
# ---- generate ----
|
||||
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
|
||||
gen_kwargs = _get_default_kwargs(generate)
|
||||
gen_kwargs["model"] = [model]
|
||||
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
|
||||
gen_kwargs["skip_server_setup"] = True
|
||||
gen_kwargs["num_threads"] = num_threads
|
||||
generate(**gen_kwargs)
|
||||
|
||||
# ---- evaluate ----
|
||||
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
|
||||
eval_kwargs = _get_default_kwargs(evaluate)
|
||||
eval_kwargs["model"] = [model]
|
||||
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
|
||||
evaluate(**eval_kwargs)
|
||||
|
||||
print("=== BFCL evaluation completed successfully ===")
|
||||
PYEOF
|
||||
|
||||
# ---- Upload results to buildkite ----
|
||||
if command -v buildkite-agent &>/dev/null; then
|
||||
if [ $bfcl_exit_code -eq 0 ]; then
|
||||
STYLE="success"
|
||||
STATUS="PASSED"
|
||||
else
|
||||
STYLE="error"
|
||||
STATUS="FAILED"
|
||||
fi
|
||||
|
||||
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
|
||||
### BFCL Tool Call Correctness - ${STATUS}
|
||||
- **Model:** \`${MODEL}\`
|
||||
- **Parser:** \`${TOOL_CALL_PARSER}\`
|
||||
- **API type:** \`${API_TYPE}\`
|
||||
- **Test category:** \`${TEST_CATEGORY}\`
|
||||
EOF
|
||||
|
||||
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
|
||||
# $BFCL_PROJECT_ROOT/score/
|
||||
RESULTS_ROOT="${OUTPUT_DIR:-.}"
|
||||
if [ -d "$RESULTS_ROOT/result" ]; then
|
||||
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
|
||||
fi
|
||||
if [ -d "$RESULTS_ROOT/score" ]; then
|
||||
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
|
||||
fi
|
||||
fi
|
||||
|
||||
exit $bfcl_exit_code
|
||||
@@ -9,11 +9,10 @@ ENV_FILE=$1
|
||||
|
||||
# For testing on local vm, use `set -a` to export all variables
|
||||
source /etc/environment
|
||||
# shellcheck source=/dev/null
|
||||
source "$ENV_FILE"
|
||||
source $ENV_FILE
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "$CONTAINER_NAME" || true;
|
||||
docker rm -f $CONTAINER_NAME || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
@@ -42,13 +41,13 @@ echo
|
||||
echo "starting docker...$CONTAINER_NAME"
|
||||
echo
|
||||
docker run \
|
||||
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
|
||||
--env-file "$ENV_FILE" \
|
||||
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
||||
--env-file $ENV_FILE \
|
||||
-e HF_TOKEN="$HF_TOKEN" \
|
||||
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
|
||||
-e MODEL="$MODEL" \
|
||||
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
||||
-e MODEL=$MODEL \
|
||||
-e WORKSPACE=/workspace \
|
||||
--name "$CONTAINER_NAME" \
|
||||
--name $CONTAINER_NAME \
|
||||
-d \
|
||||
--privileged \
|
||||
--network host \
|
||||
|
||||
@@ -42,21 +42,21 @@ echo "lanching vllm..."
|
||||
echo "logging to $VLLM_LOG"
|
||||
echo
|
||||
|
||||
vllm serve "$MODEL" \
|
||||
vllm serve $MODEL \
|
||||
--seed 42 \
|
||||
--max-num-seqs "$MAX_NUM_SEQS" \
|
||||
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
|
||||
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
|
||||
--max-num-seqs $MAX_NUM_SEQS \
|
||||
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
||||
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
||||
--no-enable-prefix-caching \
|
||||
--download_dir "$DOWNLOAD_DIR" \
|
||||
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
|
||||
--download_dir $DOWNLOAD_DIR \
|
||||
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
|
||||
|
||||
|
||||
echo "wait for 20 minutes.."
|
||||
echo
|
||||
# sleep 1200
|
||||
# wait for 10 minutes...
|
||||
for _ in {1..120}; do
|
||||
for i in {1..120}; do
|
||||
# TODO: detect other type of errors.
|
||||
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
||||
echo "Detected RuntimeError, exiting."
|
||||
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
|
||||
echo
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model "$MODEL" \
|
||||
--model $MODEL \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path benchmarks/sonnet_4x.txt \
|
||||
--sonnet-input-len "$INPUT_LEN" \
|
||||
--sonnet-output-len "$OUTPUT_LEN" \
|
||||
--sonnet-input-len $INPUT_LEN \
|
||||
--sonnet-output-len $OUTPUT_LEN \
|
||||
--ignore-eos > "$BM_LOG"
|
||||
|
||||
echo "completed..."
|
||||
|
||||
@@ -1,227 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# trigger-ci-build.sh
|
||||
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
|
||||
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
#
|
||||
# Usage: ./trigger-ci-build.sh [options]
|
||||
#
|
||||
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
|
||||
#
|
||||
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default configuration
|
||||
PIPELINE="ci"
|
||||
DRY_RUN=true
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") [options]
|
||||
|
||||
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
|
||||
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
|
||||
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
|
||||
Options:
|
||||
--execute Actually trigger the build (default: dry-run)
|
||||
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
|
||||
--commit Override commit SHA (default: current HEAD)
|
||||
--branch Override branch name (default: current branch)
|
||||
--message Custom build message (default: auto-generated)
|
||||
--help Show this help message
|
||||
|
||||
Prerequisites:
|
||||
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
|
||||
- bk configured: bk configure
|
||||
|
||||
Examples:
|
||||
$(basename "$0") # Dry-run, show what would happen
|
||||
$(basename "$0") --execute # Actually trigger the build
|
||||
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Parse arguments
|
||||
COMMIT=""
|
||||
BRANCH=""
|
||||
MESSAGE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--pipeline)
|
||||
PIPELINE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--commit)
|
||||
COMMIT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--branch)
|
||||
BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--message)
|
||||
MESSAGE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Check if bk CLI is installed
|
||||
if ! command -v bk &>/dev/null; then
|
||||
log_error "Buildkite CLI (bk) is not installed"
|
||||
echo ""
|
||||
echo "Install with:"
|
||||
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
|
||||
echo ""
|
||||
echo "Then configure:"
|
||||
echo " bk configure"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get current commit and branch if not overridden
|
||||
if [[ -z "$COMMIT" ]]; then
|
||||
COMMIT=$(git rev-parse HEAD)
|
||||
fi
|
||||
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
BRANCH=$(git branch --show-current)
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
# Detached HEAD state - try to get branch from ref
|
||||
BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
fi
|
||||
fi
|
||||
|
||||
# Generate default message if not provided
|
||||
if [[ -z "$MESSAGE" ]]; then
|
||||
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
|
||||
MESSAGE="[Manual] ${COMMIT_MSG}"
|
||||
fi
|
||||
|
||||
# Safety check: Verify the commit exists on the remote
|
||||
log_info "Verifying commit exists on remote..."
|
||||
git fetch origin --quiet 2>/dev/null || true
|
||||
|
||||
# Check if commit is reachable from any remote branch
|
||||
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
|
||||
if [[ -z "$REMOTE_BRANCHES" ]]; then
|
||||
log_error "Commit ${COMMIT} does not exist on any remote branch!"
|
||||
echo ""
|
||||
echo "The CI system will fail to checkout this commit."
|
||||
echo "Please push your changes first:"
|
||||
echo ""
|
||||
echo " git push origin ${BRANCH}"
|
||||
echo ""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "Commit found on remote branches:"
|
||||
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
|
||||
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
|
||||
echo " ... and more"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
log_info "Pipeline: ${PIPELINE}"
|
||||
log_info "Branch: ${BRANCH}"
|
||||
log_info "Commit: ${COMMIT}"
|
||||
log_info "Message: ${MESSAGE}"
|
||||
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
|
||||
echo ""
|
||||
|
||||
# Build the command
|
||||
CMD=(bk build create
|
||||
-y
|
||||
-w
|
||||
-i
|
||||
--pipeline "${PIPELINE}"
|
||||
--commit "${COMMIT}"
|
||||
--branch "${BRANCH}"
|
||||
--message "${MESSAGE}"
|
||||
--env "RUN_ALL=1"
|
||||
--env "NIGHTLY=1"
|
||||
)
|
||||
|
||||
if [[ "$DRY_RUN" == true ]]; then
|
||||
echo "=========================================="
|
||||
log_warn "DRY-RUN MODE - No build will be triggered"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "Command that would be executed:"
|
||||
echo ""
|
||||
# Escape single quotes in values for safe shell display
|
||||
escape_for_shell() {
|
||||
printf '%s' "$1" | sed "s/'/'\\\\''/g"
|
||||
}
|
||||
echo " bk build create \\"
|
||||
echo " -y \\"
|
||||
echo " -w \\"
|
||||
echo " -i \\"
|
||||
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
|
||||
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
|
||||
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
|
||||
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
|
||||
echo " --env 'RUN_ALL=1' \\"
|
||||
echo " --env 'NIGHTLY=1'"
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
|
||||
echo ""
|
||||
echo " $0 --execute"
|
||||
echo "=========================================="
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Triggering build..."
|
||||
|
||||
# Execute the command - bk will print the URL and open browser
|
||||
"${CMD[@]}"
|
||||
@@ -1,46 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Upload a single wheel to S3 (rename linux -> manylinux).
|
||||
# Index generation is handled separately by generate-and-upload-nightly-index.sh.
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
# ========= collect, rename & upload the wheel ==========
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
# Check that exactly one wheel is found
|
||||
if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||
exit 1
|
||||
fi
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# we also accept params as manylinux tag
|
||||
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
|
||||
manylinux_version="${1:-manylinux_2_31}"
|
||||
|
||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
|
||||
if [[ "$wheel" != *"linux"* ]]; then
|
||||
echo "Error: Wheel filename does not contain 'linux': $wheel"
|
||||
exit 1
|
||||
fi
|
||||
new_wheel="${wheel/linux/$manylinux_version}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
echo "Renamed wheel to: $wheel"
|
||||
|
||||
# Extract the version from the wheel
|
||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $version"
|
||||
|
||||
# copy wheel to its own bucket
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
|
||||
echo "Wheel uploaded. Index generation is handled by a separate step."
|
||||
@@ -1,73 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
|
||||
|
||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||
|
||||
if [[ -z "$GIT_VERSION" ]]; then
|
||||
echo "[FATAL] Not on a git tag, cannot create release."
|
||||
exit 1
|
||||
else
|
||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||
fi
|
||||
# sanity check for version mismatch
|
||||
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
|
||||
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
|
||||
echo "[WARNING] Force release and ignore version mismatch"
|
||||
else
|
||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||
|
||||
# check pypi token
|
||||
if [[ -z "$PYPI_TOKEN" ]]; then
|
||||
echo "[FATAL] PYPI_TOKEN is not set."
|
||||
exit 1
|
||||
else
|
||||
export TWINE_USERNAME="__token__"
|
||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||
fi
|
||||
|
||||
set -x # avoid printing secrets above
|
||||
|
||||
# install twine from pypi
|
||||
python3 -m venv /tmp/vllm-release-env
|
||||
source /tmp/vllm-release-env/bin/activate
|
||||
pip install twine
|
||||
python3 -m twine --version
|
||||
|
||||
# copy release wheels to local directory
|
||||
DIST_DIR=/tmp/vllm-release-dist
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
echo "Copying wheels to local directory"
|
||||
mkdir -p $DIST_DIR
|
||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||
echo "Wheels copied to local directory"
|
||||
# generate source distribution using setup.py
|
||||
python setup.py sdist --dist-dir=$DIST_DIR
|
||||
ls -la $DIST_DIR
|
||||
|
||||
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
|
||||
echo "Found sdist: $SDIST_FILE"
|
||||
|
||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||
echo "No default variant wheels found, quitting..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||
echo "Wheels and source distribution uploaded to PyPI"
|
||||
@@ -1,151 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Upload ROCm wheels to S3 with proper index generation
|
||||
#
|
||||
# Required environment variables:
|
||||
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
|
||||
# S3_BUCKET (default: vllm-wheels)
|
||||
#
|
||||
# S3 path structure:
|
||||
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
|
||||
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
|
||||
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== Configuration ========
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
INDICES_OUTPUT_DIR="rocm-indices"
|
||||
PYTHON="${PYTHON_PROG:-python3}"
|
||||
|
||||
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
|
||||
MANYLINUX_VERSION="manylinux_2_35"
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Configuration"
|
||||
echo "========================================"
|
||||
echo "S3 Bucket: $BUCKET"
|
||||
echo "S3 Path: $ROCM_SUBPATH"
|
||||
echo "Commit: $BUILDKITE_COMMIT"
|
||||
echo "Branch: $BUILDKITE_BRANCH"
|
||||
echo "========================================"
|
||||
|
||||
# ======== Part 0: Setup Python ========
|
||||
|
||||
# Detect if python3.12+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# Use new python from docker
|
||||
# Use --user to ensure files are created with correct ownership (not root)
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ======== Part 1: Collect and prepare wheels ========
|
||||
|
||||
# Collect all wheels
|
||||
mkdir -p all-rocm-wheels
|
||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
|
||||
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||
|
||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||
echo "ERROR: No wheels found to upload!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Rename linux to manylinux in wheel filenames
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
|
||||
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "Wheels to upload:"
|
||||
ls -lh all-rocm-wheels/
|
||||
|
||||
# ======== Part 2: Upload wheels to S3 ========
|
||||
|
||||
echo ""
|
||||
echo "Uploading wheels to $S3_COMMIT_PREFIX"
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
done
|
||||
|
||||
# ======== Part 3: Generate and upload indices ========
|
||||
|
||||
# List existing wheels in commit directory
|
||||
echo ""
|
||||
echo "Generating indices..."
|
||||
obj_json="rocm-objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# Use the existing generate-nightly-index.py
|
||||
# HACK: Replace regex module with stdlib re (same as CUDA script)
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py \
|
||||
--version "$ROCM_SUBPATH" \
|
||||
--current-objects "$obj_json" \
|
||||
--output-dir "$INDICES_OUTPUT_DIR" \
|
||||
--comment "ROCm commit $BUILDKITE_COMMIT"
|
||||
|
||||
# Upload indices to commit directory
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# Update rocm/nightly/ if on main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Updating rocm/nightly/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
|
||||
fi
|
||||
|
||||
# Extract version from vLLM wheel and update version-specific index
|
||||
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
|
||||
if [ -n "$VLLM_WHEEL" ]; then
|
||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $VERSION"
|
||||
PURE_VERSION="${VERSION%%+*}"
|
||||
PURE_VERSION="${PURE_VERSION%%.rocm}"
|
||||
echo "Pure version: $PURE_VERSION"
|
||||
|
||||
if [[ "$VERSION" != *"dev"* ]]; then
|
||||
echo "Updating rocm/$PURE_VERSION/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ======== Part 4: Summary ========
|
||||
|
||||
echo ""
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Complete!"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo "Wheels available at:"
|
||||
echo " s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
echo "Install command (by commit):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Install command (nightly):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
|
||||
fi
|
||||
echo ""
|
||||
echo "Wheel count: $WHEEL_COUNT"
|
||||
echo "========================================"
|
||||
107
.buildkite/scripts/upload-wheels.sh
Normal file
107
.buildkite/scripts/upload-wheels.sh
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== part 0: setup ========
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
INDICES_OUTPUT_DIR="indices"
|
||||
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
|
||||
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
# detect if python3.10+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# use new python from docker
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ========= part 1: collect, rename & upload the wheel ==========
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
# Check that exactly one wheel is found
|
||||
if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||
exit 1
|
||||
fi
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# we also accept params as manylinux tag
|
||||
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
|
||||
manylinux_version="${1:-manylinux_2_31}"
|
||||
|
||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
|
||||
if [[ "$wheel" != *"linux"* ]]; then
|
||||
echo "Error: Wheel filename does not contain 'linux': $wheel"
|
||||
exit 1
|
||||
fi
|
||||
new_wheel="${wheel/linux/$manylinux_version}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
echo "Renamed wheel to: $wheel"
|
||||
|
||||
# Extract the version from the wheel
|
||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $version"
|
||||
pure_version="${version%%+*}"
|
||||
echo "Pure version (without variant): $pure_version"
|
||||
|
||||
# copy wheel to its own bucket
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# ========= part 2: generate and upload indices ==========
|
||||
# generate indices for all existing wheels in the commit directory
|
||||
# this script might be run multiple times if there are multiple variants being built
|
||||
# so we need to guarantee there is little chance for "TOCTOU" issues
|
||||
# i.e., one process is generating indices while another is uploading a new wheel
|
||||
# so we need to ensure no time-consuming operations happen below
|
||||
|
||||
# list all wheels in the commit directory
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
obj_json="objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# call script to generate indicies for all existing wheels
|
||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
|
||||
else
|
||||
alias_arg=""
|
||||
fi
|
||||
|
||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
||||
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# copy to /nightly/ only if it is on the main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||
echo "Uploading indices to overwrite /nightly/"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
|
||||
fi
|
||||
|
||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
if [[ "$version" != *"dev"* ]]; then
|
||||
echo "Re-generating indices for /$pure_version/"
|
||||
rm -rf "$INDICES_OUTPUT_DIR/*"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||
fi
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -4,10 +4,8 @@ depends_on:
|
||||
steps:
|
||||
- label: V1 attention (H100)
|
||||
timeout_in_minutes: 30
|
||||
device: h100
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -15,11 +13,9 @@ steps:
|
||||
|
||||
- label: V1 attention (B200)
|
||||
timeout_in_minutes: 30
|
||||
device: b200
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
|
||||
@@ -4,7 +4,6 @@ depends_on:
|
||||
steps:
|
||||
- label: Basic Correctness
|
||||
timeout_in_minutes: 30
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
|
||||
@@ -2,23 +2,18 @@ group: Benchmarks
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Benchmarks
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Benchmarks CLI Test
|
||||
timeout_in_minutes: 20
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
|
||||
- label: Attention Benchmarks Smoke Test (B200)
|
||||
device: b200
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- benchmarks/attention_benchmarks/
|
||||
- vllm/v1/attention/
|
||||
commands:
|
||||
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
|
||||
|
||||
@@ -2,212 +2,56 @@ group: Compile
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Sequence Parallel Correctness Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
- label: Fusion and Compile Tests (B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/
|
||||
- vllm/compilation/
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
|
||||
- label: Sequence Parallel Correctness Tests (2xH100)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
|
||||
- label: AsyncTP Correctness Tests (2xH100)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||
|
||||
- label: AsyncTP Correctness Tests (B200)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||
|
||||
- label: Distributed Compile Unit Tests (2xH100)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/model_executor/layers
|
||||
- tests/compile/passes/distributed/
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -s -v tests/compile/passes/distributed
|
||||
|
||||
- label: Fusion and Compile Unit Tests (2xB200)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
|
||||
- tests/compile/passes/test_fusion_attn.py
|
||||
- tests/compile/passes/test_mla_attn_quant_fusion.py
|
||||
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
|
||||
- pytest -v -s tests/compile/passes/test_mla_attn_quant_fusion.py
|
||||
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_devices=2 is not set
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
# TODO(luka) move to H100 once pass tests run on H100
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Fusion E2E Quick (H100)
|
||||
timeout_in_minutes: 15
|
||||
- label: Fusion E2E (2 GPUs)(B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
|
||||
|
||||
- label: Fusion E2E Config Sweep (H100)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp8) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E Config Sweep (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
num_devices: 1
|
||||
gpu: b200
|
||||
optional: true
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
|
||||
|
||||
- label: Fusion E2E TP2 Quick (H100)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 (B200)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# for ar-rms-quant-fp4, also sweep llama3
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||
|
||||
@@ -4,13 +4,11 @@ depends_on:
|
||||
steps:
|
||||
- label: Platform Tests (CUDA)
|
||||
timeout_in_minutes: 15
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
- pytest -v -s cuda/test_platform_no_cuda_init.py
|
||||
|
||||
- label: Cudagraph
|
||||
timeout_in_minutes: 20
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Distributed Comm Ops
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
@@ -15,31 +15,10 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed DP Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 90
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/v1/distributed
|
||||
- tests/entrypoints/openai/test_multi_api_servers.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
|
||||
|
||||
- label: Distributed Compile + RPC Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
@@ -50,80 +29,61 @@ steps:
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
- label: Distributed Torchrun + Shutdown Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Torchrun + Examples (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace"
|
||||
num_devices: 4
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_torchrun_example.py
|
||||
- tests/distributed/test_torchrun_example_moe.py
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/rl/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=4 and dp=1
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2, pp=2 and dp=1
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=1 and dp=4 with ep
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2 and dp=2 with ep
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 examples/offline_inference/data_parallel.py --enforce-eager
|
||||
# rlhf examples
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py
|
||||
|
||||
- label: Distributed DP Tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_utils
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
@@ -131,32 +91,20 @@ steps:
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
|
||||
- label: Distributed Compile + Comm (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
- tests/distributed/test_multiproc_executor.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# test multi-node TP with multiproc executor (simulated on single node)
|
||||
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
device: h100
|
||||
num_devices: 8
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
@@ -172,9 +120,9 @@ steps:
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: Distributed Tests (4 GPUs)(A100)
|
||||
device: a100
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
@@ -185,23 +133,26 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H100)
|
||||
timeout_in_minutes: 15
|
||||
device: h100
|
||||
- label: Distributed Tests (2 GPUs)(H200)
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
device: b200
|
||||
gpu: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
@@ -210,10 +161,8 @@ steps:
|
||||
- label: 2 Node Test (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
no_plugin: true
|
||||
optional: true # TODO: revert once infra issue solved
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -222,69 +171,23 @@ steps:
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Hyrbid SSM NixlConnector PD accuracy tests (4 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- HYBRID_SSM=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
device: a100
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- vllm/v1/worker/kv_connector_model_runner_mixin.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||
- label: Pipeline + Context Parallelism (4 GPUs))
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -293,24 +196,4 @@ steps:
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- label: RayExecutorV2 (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/v1/executor/ray_executor_v2.py
|
||||
- vllm/v1/executor/abstract.py
|
||||
- vllm/v1/executor/multiproc_executor.py
|
||||
- tests/distributed/test_ray_v2_executor.py
|
||||
- tests/distributed/test_ray_v2_executor_e2e.py
|
||||
- tests/distributed/test_pipeline_parallel.py
|
||||
- tests/basic_correctness/test_basic_correctness.py
|
||||
commands:
|
||||
- export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- pytest -v -s distributed/test_ray_v2_executor.py
|
||||
- pytest -v -s distributed/test_ray_v2_executor_e2e.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py -k "ray"
|
||||
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -k "ray"
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
@@ -4,36 +4,56 @@ depends_on:
|
||||
steps:
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
timeout_in_minutes: 60
|
||||
device: h100
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
timeout_in_minutes: 60
|
||||
device: h100
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
|
||||
timeout_in_minutes: 60
|
||||
device: h100
|
||||
- label: Prime-RL Integration (2 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
num_devices: 1
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
group: Engine
|
||||
depends_on:
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Engine
|
||||
timeout_in_minutes: 15
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
@@ -15,72 +14,13 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: Engine (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/engine/
|
||||
- tests/v1/engine/
|
||||
commands:
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
|
||||
- label: e2e Scheduling (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- vllm/v1/
|
||||
- tests/v1/e2e/general/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/general/test_async_scheduling.py
|
||||
|
||||
- label: e2e Core (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/
|
||||
- tests/v1/e2e/general/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
|
||||
|
||||
- label: V1 e2e (2 GPUs)
|
||||
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||
optional: true
|
||||
num_devices: 2
|
||||
- label: V1 e2e + engine
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/e2e
|
||||
- tests/v1
|
||||
commands:
|
||||
# Only run tests that need exactly 2 GPUs
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_2
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 e2e (4 GPUs)
|
||||
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||
optional: true
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/e2e
|
||||
commands:
|
||||
# Only run tests that need 4 GPUs
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_4
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 e2e (4xH100)
|
||||
timeout_in_minutes: 60
|
||||
device: h100
|
||||
num_devices: 4
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention/backends/utils.py
|
||||
- vllm/v1/worker/gpu_model_runner.py
|
||||
- tests/v1/e2e/test_hybrid_chunked_prefill.py
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
@@ -10,7 +10,7 @@ steps:
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration (LLM)
|
||||
timeout_in_minutes: 40
|
||||
@@ -25,65 +25,19 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration (API Server openai - Part 1)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
|
||||
- label: Entrypoints Integration (API Server openai - Part 2)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
- pytest -v -s entrypoints/openai/speech_to_text/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (API Server openai - Part 3)
|
||||
timeout_in_minutes: 50
|
||||
device: h200_18gb
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
|
||||
|
||||
- label: Entrypoints Integration (API Server 2)
|
||||
- label: Entrypoints Integration (API Server)
|
||||
timeout_in_minutes: 130
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/rpc
|
||||
- tests/entrypoints/serve/instrumentator
|
||||
- tests/tool_use
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/serve/instrumentator
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s tool_use
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
|
||||
- label: Entrypoints Integration (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
@@ -95,18 +49,17 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration (Responses API)
|
||||
|
||||
- label: Entrypoints V1
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/openai/responses
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/responses
|
||||
- pytest -v -s v1/entrypoints
|
||||
|
||||
- label: OpenAI API Correctness
|
||||
timeout_in_minutes: 30
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
|
||||
@@ -4,37 +4,20 @@ depends_on:
|
||||
steps:
|
||||
- label: EPLB Algorithm
|
||||
timeout_in_minutes: 15
|
||||
device: h200_18gb
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_algo.py
|
||||
- tests/distributed/test_eplb_utils.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
- pytest -v -s distributed/test_eplb_utils.py
|
||||
|
||||
- label: EPLB Execution # 17min
|
||||
timeout_in_minutes: 27
|
||||
- label: EPLB Execution
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
|
||||
- label: Elastic EP Scaling Test
|
||||
timeout_in_minutes: 20
|
||||
device: h100
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/compilation/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_elastic_ep.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
@@ -2,34 +2,21 @@ group: Kernels
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: vLLM IR Tests
|
||||
timeout_in_minutes: 10
|
||||
device: h200_18gb
|
||||
working_dir: "/vllm-workspace/"
|
||||
source_file_dependencies:
|
||||
- vllm/ir
|
||||
- vllm/kernels
|
||||
commands:
|
||||
- pytest -v -s tests/ir
|
||||
- pytest -v -s tests/kernels/ir
|
||||
|
||||
- label: Kernels Core Operation Test
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
- tests/kernels/test_concat_mla_q.py
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
commands:
|
||||
- pytest -v -s kernels/core kernels/test_concat_mla_q.py
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/utils/flashinfer.py
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
@@ -46,7 +33,7 @@ steps:
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test %N
|
||||
timeout_in_minutes: 25
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- csrc/moe/
|
||||
@@ -56,9 +43,8 @@ steps:
|
||||
- vllm/envs.py
|
||||
- vllm/config
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 5
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
timeout_in_minutes: 45
|
||||
@@ -71,8 +57,8 @@ steps:
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
device: h100
|
||||
num_devices: 1
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
@@ -83,7 +69,7 @@ steps:
|
||||
- tests/kernels/moe/test_batched_deepgemm.py
|
||||
- tests/kernels/attention/test_deepgemm_attention.py
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
@@ -91,7 +77,7 @@ steps:
|
||||
- label: Kernels (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
gpu: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
@@ -99,17 +85,16 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||
- vllm/v1/attention/selector.py
|
||||
- vllm/platforms/cuda.py
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
- vllm/attention/selector.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
@@ -117,7 +102,6 @@ steps:
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||
- pytest -v -s tests/kernels/test_top_k_per_row.py
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
@@ -130,72 +114,4 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion==0.3.3
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
|
||||
- label: Kernels FusedMoE Layer Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_moe_layer.py
|
||||
|
||||
|
||||
- label: Kernels FusedMoE Layer Test (2 B200s)
|
||||
timeout_in_minutes: 90
|
||||
device: b200
|
||||
num_devices: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_moe_layer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
@@ -9,24 +9,24 @@ steps:
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
# - label: LM Eval Large Models (4 GPUs)(A100)
|
||||
# device: a100
|
||||
# optional: true
|
||||
# num_devices: 4
|
||||
# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
# source_file_dependencies:
|
||||
# - csrc/
|
||||
# - vllm/model_executor/layers/quantization
|
||||
# commands:
|
||||
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
device: h100
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -37,82 +37,10 @@ steps:
|
||||
|
||||
- label: LM Eval Small Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
device: b200
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
- label: LM Eval Qwen3.5 Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/qwen3_5.py
|
||||
- vllm/model_executor/models/qwen3_5_mtp.py
|
||||
- vllm/transformers_utils/configs/qwen3_5.py
|
||||
- vllm/transformers_utils/configs/qwen3_5_moe.py
|
||||
- vllm/model_executor/models/qwen3_next.py
|
||||
- vllm/model_executor/models/qwen3_next_mtp.py
|
||||
- vllm/model_executor/layers/fla/ops/
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
|
||||
|
||||
- label: LM Eval Large Models (H200)
|
||||
timeout_in_minutes: 60
|
||||
device: h200
|
||||
optional: true
|
||||
num_devices: 8
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
||||
|
||||
|
||||
- label: GPQA Eval (GPT-OSS) (H100)
|
||||
timeout_in_minutes: 120
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/evals/gpt_oss/
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
|
||||
|
||||
- label: GPQA Eval (GPT-OSS) (B200)
|
||||
timeout_in_minutes: 120
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/evals/gpt_oss/
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
@@ -8,13 +8,13 @@ steps:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
|
||||
parallelism: 4
|
||||
|
||||
|
||||
- label: LoRA TP (Distributed)
|
||||
timeout_in_minutes: 30
|
||||
num_devices: 4
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
@@ -22,13 +22,10 @@ steps:
|
||||
# FIXIT: find out which code initialize cuda before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# Alot of these tests are on the edge of OOMing
|
||||
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
- pytest -v -s -x lora/test_qwen35_densemodel_lora.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
@@ -2,81 +2,36 @@ group: Miscellaneous
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: V1 Spec Decode
|
||||
timeout_in_minutes: 30
|
||||
- label: V1 Others
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/spec_decode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# TODO: create another `optional` test group for slow tests
|
||||
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 Sample + Logits
|
||||
timeout_in_minutes: 30
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/sample
|
||||
- tests/v1/logits_processors
|
||||
- tests/v1/test_oracle.py
|
||||
- tests/v1/test_request.py
|
||||
- tests/v1/test_outputs.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
- pytest -v -s v1/test_outputs.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 Core + KV + Metrics
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/core
|
||||
- tests/v1/executor
|
||||
- tests/v1/kv_offload
|
||||
- tests/v1/worker
|
||||
- tests/v1/kv_connector/unit
|
||||
- tests/v1/metrics
|
||||
- tests/entrypoints/openai/correctness/test_lmeval.py
|
||||
- tests/v1
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'not cpu_test' v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
- pytest -v -s v1/test_outputs.py
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 Others (CPU)
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
depends_on: ~
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
device: cpu-small
|
||||
no_gpu: true
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
@@ -87,7 +42,6 @@ steps:
|
||||
|
||||
- label: Regression
|
||||
timeout_in_minutes: 20
|
||||
device: h200_18gb
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
@@ -105,20 +59,19 @@ steps:
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 basic/offline_inference/chat.py
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
- python3 basic/offline_inference/embed.py
|
||||
- python3 basic/offline_inference/score.py
|
||||
- python3 offline_inference/basic/chat.py # for basic
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
@@ -129,7 +82,7 @@ steps:
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
num_devices: 2
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/tracing
|
||||
@@ -154,87 +107,59 @@ steps:
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/detokenizer
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
commands:
|
||||
- pytest -v -s detokenizer
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
depends_on: ~
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/test_ray_env.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/reasoning
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
device: cpu-small
|
||||
no_gpu: true
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s test_ray_env.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 30
|
||||
device: h100
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
- VLLM_TEST_MODEL=deepseek-ai/DeepSeek-V2-Lite-Chat pytest -v -s v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[TRITON_MLA]
|
||||
- VLLM_TEST_MODEL=Qwen/Qwen3-30B-A3B-Thinking-2507-FP8 pytest -v -s v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[FLASH_ATTN]
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Batch Invariance (B200)
|
||||
timeout_in_minutes: 30
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
- VLLM_TEST_MODEL=deepseek-ai/DeepSeek-V2-Lite-Chat pytest -v -s v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[TRITON_MLA]
|
||||
- VLLM_TEST_MODEL=Qwen/Qwen3-30B-A3B-Thinking-2507-FP8 pytest -v -s v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[FLASH_ATTN]
|
||||
|
||||
- label: Acceptance Length Test (Large Models) # optional
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 1
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/model_executor/models/mlp_speculator.py
|
||||
- tests/v1/spec_decode/test_acceptance_length.py
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
@@ -9,9 +9,9 @@ steps:
|
||||
- vllm/config/model.py
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor -m '(not slow_test)'
|
||||
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user