diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml index 199c33159..b199e554a 100644 --- a/.buildkite/ci_config.yaml +++ b/.buildkite/ci_config.yaml @@ -1,7 +1,8 @@ name: vllm_ci job_dirs: - - ".buildkite/test_areas" - ".buildkite/image_build" + - ".buildkite/test_areas" + - ".buildkite/hardware_tests" run_all_patterns: - "docker/Dockerfile" - "CMakeLists.txt" diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml new file mode 100644 index 000000000..ea2f37a3b --- /dev/null +++ b/.buildkite/hardware_tests/amd.yaml @@ -0,0 +1,28 @@ +group: Hardware +steps: + - label: "AMD: :docker: build image" + device: amd_cpu + no_plugin: true + commands: + - > + docker build + --build-arg max_jobs=16 + --build-arg REMOTE_VLLM=1 + --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' + --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT + --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}" + -f docker/Dockerfile.rocm + --target test + --no-cache + --progress plain . + - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 1 + - exit_status: -10 # Agent was lost + limit: 1 + - exit_status: 1 # Machine occasionally fail + limit: 1 diff --git a/.buildkite/hardware_tests/arm.yaml b/.buildkite/hardware_tests/arm.yaml new file mode 100644 index 000000000..d39ab4a7e --- /dev/null +++ b/.buildkite/hardware_tests/arm.yaml @@ -0,0 +1,8 @@ +group: Hardware +steps: + - label: "Arm CPU Test" + soft_fail: true + device: arm_cpu + no_plugin: true + commands: + - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh diff --git a/.buildkite/hardware_tests/ascend_npu.yaml b/.buildkite/hardware_tests/ascend_npu.yaml new file mode 100644 index 000000000..acebe88ba --- /dev/null +++ b/.buildkite/hardware_tests/ascend_npu.yaml @@ -0,0 +1,10 @@ +group: Hardware +depends_on: ~ +steps: + - label: "Ascend NPU Test" + soft_fail: true + timeout_in_minutes: 20 + no_plugin: true + device: 
ascend_npu + commands: + - bash .buildkite/scripts/hardware_ci/run-npu-test.sh diff --git a/.buildkite/hardware_tests/gh200.yaml b/.buildkite/hardware_tests/gh200.yaml new file mode 100644 index 000000000..1b2adcff0 --- /dev/null +++ b/.buildkite/hardware_tests/gh200.yaml @@ -0,0 +1,10 @@ +group: Hardware +steps: + - label: "GH200 Test" + soft_fail: true + device: gh200 + no_plugin: true + optional: true + commands: + - nvidia-smi + - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml new file mode 100644 index 000000000..67018bf2c --- /dev/null +++ b/.buildkite/hardware_tests/intel.yaml @@ -0,0 +1,23 @@ +group: Hardware +depends_on: ~ +steps: + - label: "Intel CPU Test" + soft_fail: true + device: intel_cpu + no_plugin: true + commands: + - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh + + - label: "Intel HPU Test" + soft_fail: true + device: intel_hpu + no_plugin: true + commands: + - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh + + - label: "Intel GPU Test" + soft_fail: true + device: intel_gpu + no_plugin: true + commands: + - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index 9a2384e52..9483ff2f2 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -1,56 +1,254 @@ #!/bin/bash -set -e +set -euo pipefail -if [[ $# -lt 8 ]]; then - echo "Usage: $0 " - exit 1 +# replace invalid characters in Docker image tags and truncate to 128 chars +clean_docker_tag() { + local input="$1" + echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128 +} + +print_usage_and_exit() { + echo "Usage: $0 REGISTRY REPO BUILDKITE_COMMIT BRANCH VLLM_USE_PRECOMPILED VLLM_MERGE_BASE_COMMIT IMAGE_TAG [IMAGE_TAG_LATEST]" + exit 1 +} + +print_instance_info() { + echo "" + echo "=== Debug: Instance Information ===" + # Get IMDSv2 token + if TOKEN=$(curl -s --connect-timeout 2 --max-time 5 -X PUT "http://169.254.169.254/latest/api/token" \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"
2>/dev/null); then + AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown") + INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown") + INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown") + AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown") + echo "AMI ID: ${AMI_ID}" + echo "Instance Type: ${INSTANCE_TYPE}" + echo "Instance ID: ${INSTANCE_ID}" + echo "AZ: ${AZ}" + else + echo "Not running on EC2 or IMDS not available" + fi + # Check for warm cache AMI (marker file baked into custom AMI) + if [[ -f /etc/vllm-ami-info ]]; then + echo "Cache: warm (custom vLLM AMI)" + cat /etc/vllm-ami-info + else + echo "Cache: cold (standard AMI)" + fi + echo "===================================" + echo "" +} + +setup_buildx_builder() { + echo "--- :buildkite: Setting up buildx builder" + if [[ -S "${BUILDKIT_SOCKET}" ]]; then + # Custom AMI with standalone buildkitd - use remote driver for warm cache + echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}" + echo "Using remote driver to connect to buildkitd (warm cache available)" + if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then + echo "Using existing baked-vllm-builder" + docker buildx use baked-vllm-builder + else + echo "Creating baked-vllm-builder with remote driver" + docker buildx create \ + --name baked-vllm-builder \ + --driver remote \ + --use \ + "unix://${BUILDKIT_SOCKET}" + fi + docker buildx inspect --bootstrap + elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then + # Existing builder available + echo "Using existing builder: ${BUILDER_NAME}" + docker buildx use "${BUILDER_NAME}" + docker buildx inspect 
--bootstrap + else + # No local buildkitd, no existing builder - create new docker-container builder + echo "No local buildkitd found, using docker-container driver" + docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use + docker buildx inspect --bootstrap + fi + + # builder info + echo "Active builder:" + docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls +} + +check_and_skip_if_image_exists() { + if [[ -n "${IMAGE_TAG:-}" ]]; then + echo "--- :mag: Checking if image exists" + if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then + echo "Image already exists: ${IMAGE_TAG}" + echo "Skipping build" + exit 0 + fi + echo "Image not found, proceeding with build" + fi +} + +ecr_login() { + aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" + aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com +} + +prepare_cache_tags() { + # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN + TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache" + MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache" + + if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then + if [[ "$BUILDKITE_BRANCH" == "main" ]]; then + cache="${MAIN_CACHE_ECR}:latest" + else + clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH") + cache="${TEST_CACHE_ECR}:${clean_branch}" + fi + CACHE_TO="$cache" + CACHE_FROM="$cache" + CACHE_FROM_BASE_BRANCH="$cache" + else + CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}" + CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}" + if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then + CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest" + else + clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH") + CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}" + fi + fi + + 
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest" + export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN +} + +resolve_parent_commit() { + if [[ -z "${PARENT_COMMIT:-}" ]]; then + PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "") + if [[ -n "${PARENT_COMMIT}" ]]; then + echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}" + export PARENT_COMMIT + else + echo "Could not determine parent commit (may be first commit in repo)" + fi + else + echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}" + fi +} + +print_bake_config() { + echo "--- :page_facing_up: Resolved bake configuration" + BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" + docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true + echo "Saved bake config to ${BAKE_CONFIG_FILE}" + echo "--- :arrow_down: Uploading bake config to Buildkite" + buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" +} + +################################# +# Main Script # +################################# +print_instance_info + +if [[ $# -lt 7 ]]; then + print_usage_and_exit fi +# input args REGISTRY=$1 REPO=$2 BUILDKITE_COMMIT=$3 BRANCH=$4 VLLM_USE_PRECOMPILED=$5 VLLM_MERGE_BASE_COMMIT=$6 -CACHE_FROM=$7 -CACHE_TO=$8 +IMAGE_TAG=$7 +IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional -# authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY -aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com +# build config +TARGET="test-ci" +CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}" +VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}" +BUILDER_NAME="${BUILDER_NAME:-vllm-builder}" +CI_HCL_PATH="/tmp/ci.hcl" +BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock" -# docker buildx -docker buildx 
create --name vllm-builder --driver docker-container --use -docker buildx inspect --bootstrap -docker buildx ls +prepare_cache_tags +ecr_login -# skip build if image already exists -if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then - echo "Image not found, proceeding with build..." -else - echo "Image found" - exit 0 +# Environment info (for docs and human readers) +# CI_HCL_URL - URL to ci.hcl (default: from ci-infra main branch) +# VLLM_CI_BRANCH - ci-infra branch to use (default: main) +# VLLM_BAKE_FILE - Path to vLLM's bake file (default: docker/docker-bake.hcl) +# BUILDER_NAME - Name for buildx builder (default: vllm-builder) +# +# Build configuration (exported as environment variables for bake): +export BUILDKITE_COMMIT +export PARENT_COMMIT +export IMAGE_TAG +export IMAGE_TAG_LATEST +export CACHE_FROM +export CACHE_FROM_BASE_BRANCH +export CACHE_FROM_MAIN +export CACHE_TO +export VLLM_USE_PRECOMPILED +export VLLM_MERGE_BASE_COMMIT + +# print args +echo "--- :mag: Arguments" +echo "REGISTRY: ${REGISTRY}" +echo "REPO: ${REPO}" +echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}" +echo "BRANCH: ${BRANCH}" +echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}" +echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}" +echo "IMAGE_TAG: ${IMAGE_TAG}" +echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}" + +# print build configuration +echo "--- :mag: Build configuration" +echo "TARGET: ${TARGET}" +echo "CI HCL URL: ${CI_HCL_URL}" +echo "vLLM bake file: ${VLLM_BAKE_FILE}" +echo "BUILDER_NAME: ${BUILDER_NAME}" +echo "CI_HCL_PATH: ${CI_HCL_PATH}" +echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}" + +echo "--- :mag: Cache tags" +echo "CACHE_TO: ${CACHE_TO}" +echo "CACHE_FROM: ${CACHE_FROM}" +echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}" +echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}" + +check_and_skip_if_image_exists + +echo "--- :docker: Setting up Docker buildx bake" +echo "Target: ${TARGET}" +echo "CI HCL URL: ${CI_HCL_URL}" +echo "vLLM bake file: 
${VLLM_BAKE_FILE}" + +if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then + echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}" + echo "Make sure you're running from the vLLM repository root" + exit 1 fi -if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then - merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}" -else - merge_base_commit_build_args="" -fi +echo "--- :arrow_down: Downloading ci.hcl" +curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}" +echo "Downloaded to ${CI_HCL_PATH}" -# build -docker buildx build --file docker/Dockerfile \ - --build-arg max_jobs=16 \ - --build-arg buildkite_commit=$BUILDKITE_COMMIT \ - --build-arg USE_SCCACHE=1 \ - --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \ - --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \ - --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \ - ${merge_base_commit_build_args} \ - --cache-from type=registry,ref=${CACHE_FROM},mode=max \ - --cache-to type=registry,ref=${CACHE_TO},mode=max \ - --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \ - $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \ - --push \ - --target test \ - --progress plain . 
+setup_buildx_builder + +# Compute parent commit for cache fallback (if not already set) +resolve_parent_commit +export PARENT_COMMIT + +print_bake_config + +echo "--- :docker: Building ${TARGET}" +docker --debug buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}" + +echo "--- :white_check_mark: Build complete" diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index d01c71dd9..6f601d384 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -4,7 +4,8 @@ steps: key: image-build depends_on: [] commands: - - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO + - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi + - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi retry: automatic: - exit_status: -1 # Agent was lost diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml index 7dd067c24..4bcf116f2 100644 --- a/.buildkite/test_areas/attention.yaml +++ b/.buildkite/test_areas/attention.yaml @@ -4,7 +4,7 @@ depends_on: steps: - label: V1 attention (H100) timeout_in_minutes: 30 - gpu: h100 + device: h100 source_file_dependencies: - vllm/config/attention.py - vllm/model_executor/layers/attention @@ -15,7 +15,7 @@ steps: - label: V1 attention (B200) timeout_in_minutes: 30 - gpu: b200 + device: b200 source_file_dependencies: - vllm/config/attention.py - vllm/model_executor/layers/attention diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 0ba00925a..3c6f82fdd 100644 --- a/.buildkite/test_areas/compile.yaml +++
b/.buildkite/test_areas/compile.yaml @@ -5,7 +5,7 @@ steps: - label: Fusion and Compile Tests (B200) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" - gpu: b200 + device: b200 source_file_dependencies: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -26,7 +26,7 @@ steps: - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - # this runner has 2 GPUs available even though num_gpus=2 is not set + # this runner has 2 GPUs available even though num_devices=2 is not set - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml @@ -37,9 +37,9 @@ steps: - label: Fusion E2E (2 GPUs)(B200) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" - gpu: b200 + device: b200 optional: true - num_gpus: 2 + num_devices: 2 source_file_dependencies: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index c88076bb5..57eabb6e4 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -5,7 +5,7 @@ steps: - label: Distributed Comm Ops timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_devices: 2 source_file_dependencies: - vllm/distributed - tests/distributed @@ -18,7 +18,7 @@ steps: - label: Distributed (2 GPUs) timeout_in_minutes: 90 working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_devices: 2 source_file_dependencies: - vllm/compilation/ - vllm/distributed/ @@ -54,7 +54,7 @@ steps: - label: Distributed Tests (4 GPUs) timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" - num_gpus: 4 + num_devices: 4 source_file_dependencies: - vllm/distributed/ - tests/distributed/test_utils @@ -103,8 +103,8 @@ steps: - 
label: Distributed Tests (8 GPUs)(H100) timeout_in_minutes: 10 - gpu: h100 - num_gpus: 8 + device: h100 + num_devices: 8 working_dir: "/vllm-workspace/tests" source_file_dependencies: - examples/offline_inference/torchrun_dp_example.py @@ -120,9 +120,9 @@ steps: - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: Distributed Tests (4 GPUs)(A100) - gpu: a100 + device: a100 optional: true - num_gpus: 4 + num_devices: 4 source_file_dependencies: - vllm/ commands: @@ -133,26 +133,34 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py -- label: Distributed Tests (2 GPUs)(H200) - gpu: h200 +- label: Sequence Parallel Tests (H100) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + device: h100 + optional: true + num_devices: 2 + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + # Run sequence parallel tests + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + +- label: Distributed Tests (2 GPUs)(H100) + device: h100 optional: true working_dir: "/vllm-workspace/" - num_gpus: 2 + num_devices: 2 commands: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - 
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) - gpu: b200 + device: b200 optional: true working_dir: "/vllm-workspace/" - num_gpus: 2 + num_devices: 2 commands: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py @@ -161,8 +169,9 @@ steps: - label: 2 Node Test (4 GPUs) timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_devices: 2 num_nodes: 2 + no_plugin: true source_file_dependencies: - vllm/distributed/ - vllm/engine/ @@ -176,7 +185,7 @@ steps: - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" - num_gpus: 4 + num_devices: 4 source_file_dependencies: - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - tests/v1/kv_connector/nixl_integration/ @@ -184,10 +193,21 @@ steps: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + - label: Pipeline + Context Parallelism (4 GPUs)) timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" - num_gpus: 4 + num_devices: 4 source_file_dependencies: - vllm/distributed/ - vllm/engine/ @@ -196,4 +216,46 @@ steps: - tests/distributed/ commands: - pytest -v -s 
distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py \ No newline at end of file + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: Hopper Fusion E2E Tests (H100) + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/" + device: h100 + optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + # skip Llama-4 since it does not fit on this device + - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4' + +- label: Hopper Fusion Distributed E2E Tests (2xH100) + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/" + device: h100 + optional: true + num_devices: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/distributed/test_fusions_e2e.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + # Run all e2e fusion tests + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index 2e0857986..958bff5c9 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -4,27 +4,27 @@ depends_on: steps: - label: 
DeepSeek V2-Lite Accuracy timeout_in_minutes: 60 - gpu: h100 + device: h100 optional: true - num_gpus: 4 + num_devices: 4 working_dir: "/vllm-workspace" commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - label: Qwen3-30B-A3B-FP8-block Accuracy timeout_in_minutes: 60 - gpu: h100 + device: h100 optional: true - num_gpus: 4 + num_devices: 4 working_dir: "/vllm-workspace" commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200) timeout_in_minutes: 60 - gpu: b200 + device: b200 optional: true - num_gpus: 2 + num_devices: 2 working_dir: "/vllm-workspace" commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 @@ -33,10 +33,11 @@ steps: timeout_in_minutes: 30 optional: true soft_fail: true - num_gpus: 2 + num_devices: 2 working_dir: "/vllm-workspace" source_file_dependencies: - vllm/ - .buildkite/scripts/run-prime-rl-test.sh commands: + - nvidia-smi - bash .buildkite/scripts/run-prime-rl-test.sh diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index a028e0e4a..82ce2f420 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -23,4 +23,8 @@ steps: # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - pytest -v -s v1/e2e - - pytest -v -s v1/engine + # Run this test standalone for now; + # need to untangle the (implicit) use of spawn/fork across the tests.
+ - pytest -v -s v1/engine/test_preprocess_error_handling.py + # Run the rest of v1/engine tests + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml index feb825214..9a10476ed 100644 --- a/.buildkite/test_areas/expert_parallelism.yaml +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -14,7 +14,7 @@ steps: - label: EPLB Execution timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" - num_gpus: 4 + num_devices: 4 source_file_dependencies: - vllm/distributed/eplb - tests/distributed/test_eplb_execute.py diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index cf4b646f3..2772d69b4 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -57,8 +57,8 @@ steps: - label: Kernels DeepGEMM Test (H100) timeout_in_minutes: 45 - gpu: h100 - num_gpus: 1 + device: h100 + num_devices: 1 source_file_dependencies: - tools/install_deepgemm.sh - vllm/utils/deep_gemm.py @@ -77,7 +77,7 @@ steps: - label: Kernels (B200) timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - gpu: b200 + device: b200 # optional: true source_file_dependencies: - csrc/quantization/fp4/ @@ -114,4 +114,55 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py \ No newline at end of file + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + # e2e + - pytest -v -s tests/models/quantization/test_nvfp4.py + +- label: Kernels Helion Test + timeout_in_minutes: 30 + device: h100 + source_file_dependencies: + - vllm/utils/import_utils.py + - tests/kernels/helion/ + commands: + - pip install helion + - pytest -v -s kernels/helion/ + + +- label: Kernels FP8 MoE Test (1 H100) + timeout_in_minutes: 90 + device: h100 + num_devices: 1 + 
optional: true + commands: + - pytest -v -s kernels/moe/test_cutlass_moe.py + - pytest -v -s kernels/moe/test_flashinfer.py + - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py + - pytest -v -s kernels/moe/test_moe.py + # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main + - pytest -v -s kernels/moe/test_block_int8.py + - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py + - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py + +- label: Kernels FP8 MoE Test (2 H100s) + timeout_in_minutes: 90 + device: h100 + num_devices: 2 + optional: true + commands: + - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py + - pytest -v -s kernels/moe/test_deepep_moe.py + - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py + # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main + +- label: Kernels Fp4 MoE Test (B200) + timeout_in_minutes: 60 + device: b200 + num_devices: 1 + optional: true + commands: + - pytest -v -s kernels/moe/test_cutedsl_moe.py + - pytest -v -s kernels/moe/test_flashinfer_moe.py + - pytest -v -s kernels/moe/test_nvfp4_moe.py + - pytest -v -s kernels/moe/test_ocp_mx_moe.py diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index e2498512b..cd6bc48b4 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -12,9 +12,9 @@ steps: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: LM Eval Large Models (4 GPUs)(A100) - gpu: a100 + device: a100 optional: true - num_gpus: 4 + num_devices: 4 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - csrc/ @@ -24,9 +24,9 @@ steps: - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - label: LM Eval Large Models (4 GPUs)(H100) - gpu: h100 + device: h100 optional: true - num_gpus: 4 + num_devices: 4 working_dir: 
"/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - csrc/ @@ -37,10 +37,39 @@ steps: - label: LM Eval Small Models (B200) timeout_in_minutes: 120 - gpu: b200 + device: b200 optional: true source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt + +- label: LM Eval Large Models (H200) + timeout_in_minutes: 60 + device: h200 + optional: true + num_devices: 8 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt + +- label: MoE Refactor Integration Test (H100 - TEMPORARY) + device: h100 + optional: true + num_devices: 2 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt + +- label: MoE Refactor Integration Test (B200 - TEMPORARY) + device: b200 + optional: true + num_devices: 2 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt + +- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) + device: b200 + optional: true + num_devices: 2 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 59ade40cc..f034175cc 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -14,7 +14,7 @@ steps: - label: LoRA TP (Distributed) timeout_in_minutes: 30 - num_gpus: 4 + num_devices: 4 source_file_dependencies: - vllm/lora - tests/lora diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index b3b4566ab..0bb5da923 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -31,7 +31,7 @@ steps: source_file_dependencies: - vllm/ - tests/v1 - no_gpu: true + device: cpu
commands: # split the test to avoid interference - pytest -v -s -m 'cpu_test' v1/core @@ -82,7 +82,7 @@ steps: - label: Metrics, Tracing (2 GPUs) timeout_in_minutes: 20 - num_gpus: 2 + num_devices: 2 source_file_dependencies: - vllm/ - tests/v1/tracing @@ -127,7 +127,7 @@ steps: - tests/tool_parsers - tests/transformers_utils - tests/config - no_gpu: true + device: cpu commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s test_inputs.py @@ -142,7 +142,7 @@ steps: - label: GPT-OSS Eval (B200) timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - gpu: b200 + device: b200 optional: true source_file_dependencies: - tests/evals/gpt_oss @@ -155,7 +155,7 @@ steps: - label: Batch Invariance (H100) timeout_in_minutes: 25 - gpu: h100 + device: h100 source_file_dependencies: - vllm/v1/attention - vllm/model_executor/layers diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 2a86596a6..e6b153854 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -44,7 +44,7 @@ steps: - vllm/ - tests/models/test_utils.py - tests/models/test_vision.py - no_gpu: true + device: cpu commands: - pytest -v -s models/test_utils.py models/test_vision.py diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml index b6bfbf2dd..91306dd09 100644 --- a/.buildkite/test_areas/models_distributed.yaml +++ b/.buildkite/test_areas/models_distributed.yaml @@ -5,7 +5,7 @@ steps: - label: Distributed Model Tests (2 GPUs) timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_devices: 2 source_file_dependencies: - vllm/model_executor/model_loader/sharded_state_loader.py - vllm/model_executor/models/ diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index fc24068c2..0284273a5 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ 
b/.buildkite/test_areas/models_multimodal.yaml @@ -18,7 +18,7 @@ steps: source_file_dependencies: - vllm/ - tests/models/multimodal - no_gpu: true + device: cpu commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml index 60c179aa0..ccc54b47a 100644 --- a/.buildkite/test_areas/plugins.yaml +++ b/.buildkite/test_areas/plugins.yaml @@ -5,7 +5,7 @@ steps: - label: Plugin Tests (2 GPUs) timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_devices: 2 source_file_dependencies: - vllm/plugins/ - tests/plugins/ diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index 6e89d6af3..5ee2e5186 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -16,14 +16,14 @@ steps: # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: Quantized MoE Test (B200) timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - gpu: b200 + device: b200 source_file_dependencies: - tests/quantization/test_blackwell_moe.py - vllm/model_executor/models/deepseek_v2.py diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml index cfc5bb20f..3561d5707 100644 --- a/.buildkite/test_areas/weight_loading.yaml +++ b/.buildkite/test_areas/weight_loading.yaml @@ -5,7 +5,7 @@ steps: - 
label: Weight Loading Multiple GPU # 33min timeout_in_minutes: 45 working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_devices: 2 optional: true source_file_dependencies: - vllm/ @@ -15,8 +15,8 @@ steps: - label: Weight Loading Multiple GPU - Large Models # optional working_dir: "/vllm-workspace/tests" - num_gpus: 2 - gpu: a100 + num_devices: 2 + device: a100 optional: true source_file_dependencies: - vllm/