diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm-fp8.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm-fp8.txt
new file mode 100644
index 000000000..5552391d9
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm-fp8.txt
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
diff --git a/.buildkite/scripts/check-ray-compatibility.sh b/.buildkite/scripts/check-ray-compatibility.sh
index d44d074c2..1572fe941 100644
--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
@@ -16,6 +16,25 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
 WORK_DIR=$(mktemp -d)
 trap 'rm -rf "$WORK_DIR"' EXIT
 
+# ── Detect PyTorch index URL ─────────────────────────────────────────────
+
+if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
+    # torch.version.hip is e.g. "6.2.41133-..."; rsplit keeps only the
+    # "major.minor" prefix that names the rocm<X.Y> wheel index.
+    ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
+    CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
+    if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
+        TORCH_INDEX_URL="${CANDIDATE_URL}"
+    else
+        echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
+        echo ">>> Falling back to default PyPI (resolution may be incomplete)"
+        TORCH_INDEX_URL=""
+    fi
+else
+    TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
+fi
+echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
+
 # Fetch all Ray requirement files used in the LLM depset pipeline
 echo ">>> Fetching Ray requirement files"
 RAY_FILES=(
@@ -116,6 +133,11 @@ echo "============================================================"
 echo ">>> Resolving: Can Ray generate compatible lock files?"
 echo "============================================================"
 
+EXTRA_INDEX_ARGS=()
+if [[ -n "${TORCH_INDEX_URL}" ]]; then
+    EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
+fi
+
 set +e
 uv pip compile \
     "${WORK_DIR}/requirements.txt" \
@@ -126,7 +148,7 @@ uv pip compile \
     -c "${WORK_DIR}/vllm-constraints.txt" \
     --python-version 3.12 \
     --python-platform x86_64-manylinux_2_31 \
-    --extra-index-url https://download.pytorch.org/whl/cu129 \
+    "${EXTRA_INDEX_ARGS[@]}" \
     --index-strategy unsafe-best-match \
     --unsafe-package setuptools \
     --unsafe-package ray \
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
index dddf23f1f..de48eb282 100755
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
@@ -1,11 +1,18 @@
 #!/usr/bin/env bash
 set -euxo pipefail
-
 # Nightly e2e test for prefetch offloading with a MoE model.
 # Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
 # and validates GSM8K accuracy matches baseline (no offloading).
 #
 # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+#
+# Environment variables:
+#   ATTENTION_BACKEND - attention backend to use (e.g., FLASH_ATTN,
+#                       ROCM_ATTN, FLASHINFER). If unset, uses the vLLM default.
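+#
+# Example invocation (illustrative; the positional values shown are this
+# script's own defaults, and ROCM_ATTN is one of the backends listed above):
+#   ATTENTION_BACKEND=ROCM_ATTN ./deepseek_v2_lite_prefetch_offload.sh 0.25 1319 8030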
THRESHOLD=${1:-0.25} NUM_Q=${2:-1319} PORT=${3:-8030} @@ -22,6 +25,14 @@ wait_for_server() { MODEL="deepseek-ai/DeepSeek-V2-Lite" +# ── Build optional vllm serve flags ───────────────────────────────────── + +EXTRA_ARGS=() +if [[ -n "${ATTENTION_BACKEND:-}" ]]; then + echo "Using attention backend: ${ATTENTION_BACKEND}" + EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}") +fi + cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then kill "${SERVER_PID}" 2>/dev/null || true @@ -40,7 +51,8 @@ vllm serve "$MODEL" \ --offload-num-in-group 2 \ --offload-prefetch-step 1 \ --offload-params w13_weight w2_weight \ - --port "$PORT" & + --port "$PORT" \ + ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} & SERVER_PID=$! wait_for_server "$PORT" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 5e2c25936..82e97bfbb 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -15,7 +15,6 @@ # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for the test. incompatible with command. # mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] -# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 # num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. # num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, # in this case, commands must be specified. the first command runs on the first host, the second @@ -32,6 +31,80 @@ # - If the test takes more than 10min, then it is okay to create a new step. # Note that all steps execute in parallel. + +##################################################################################################################################### +# # +# README # +# # +##################################################################################################################################### +# # +# IMPORTANT: # +# * Currently AMD CI has MI300 agents, MI325 agents, and MI355 agents. Of those, AMD is using mostly MI325 and MI355. AMD team # +# is actively working on enabling more MI300 machines. All upcoming feature improvements are tracked in: # +# https://github.com/vllm-project/vllm/issues/34994 # +# # +#-----------------------------------------------------------------------------------------------------------------------------------# +# # +# NOTES: # +# * [Pytorch Nightly Dependency Override Check]: if this test fails, it means the nightly torch version is not compatible with # +# some of the dependencies. Please check the error message and add the package to # +# whitelist in `/vllm/tools/pre_commit/generate_nightly_torch_test.py`. # +# * [Entrypoints Integration Test (LLM)]: # +# - {`pytest -v -s entrypoints/llm/test_generate.py`}: It needs a clean process # +# - {`pytest -v -s entrypoints/offline_mode`}: Needs to avoid interference with other tests # +# * [V1 Test e2e + engine]: The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. See discussion here: # +# https://github.com/vllm-project/vllm/pull/31040 # +# * [V1 others]: # +# - Split the tests to avoid interference # +# - Integration test for streaming correctness (requires special branch for __harness__ lib). 
# +# * [V1 others (CPU)]: Split the tests to avoid interference # +# * [PyTorch Compilation Unit Tests]: Run unit tests defined directly under `compile/`, not including subdirectories, which # +# are usually heavier tests covered elsewhere. Use `find` to launch multiple instances # +# of pytest so that they do not suffer from: # +# https://github.com/vllm-project/vllm/issues/28965 # +# * [PyTorch Fullgraph Smoke Test]: Run smoke tests under fullgraph directory, except `test_full_graph.py` as it is a heavy # +# test that is covered in other steps. Use `find` to launch multiple instances of pytest # +# so that they do not suffer from: https://github.com/vllm-project/vllm/issues/28965 # +# * [PyTorch Fullgraph]: # +# - Limit to no custom ops to reduce running time. Wrap with quotes to escape yaml and avoid starting `-k` string # +# with a `-` # +# - Old E2E tests such as: # +# ```bash # +# pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4' # +# ``` # +# were removed in https://github.com/vllm-project/vllm/pull/33293 in favor of new tests in `fusions_e2e`. We # +# avoid replicating the new jobs in this file as it's deprecated. # +# * [Basic Models Tests (Extra Initialization) %N]: Only when vLLM model source is modified - test initialization of a # +# large subset of supported models (the complement of the small subset in # +# the above test.) Also run if model initialization test file is modified. # +# * [Language Models Tests (Extra Standard) %N]: Shard slow subset of standard language models tests. Only run when model # +# source is modified, or when specified test files are modified. # +# * [Language Models Tests (Hybrid) %N]: Install fast path packages for testing against transformers (mamba, conv1d) and to # +# run plamo2 model in vLLM. # +# * [Language Models Test (Extended Generation)]: Install fast path packages for testing against transformers (mamba, conv1d) # +# and to run plamo2 model in vLLM. # +# * [Multi-Modal Models (Standard)]: # +# - Do NOT remove `VLLM_WORKER_MULTIPROC_METHOD=spawn` setting as ROCm requires this for certain models to function. # +# * [Transformers Nightly Models Test]: Whisper needs `VLLM_WORKER_MULTIPROC_METHOD=spawn` to avoid deadlock. # +# * [Plugin Tests (2 GPUs)]: # +# - {`pytest -v -s entrypoints/openai/test_oot_registration.py`}: It needs a clean process # +# - {`pytest -v -s models/test_oot_registration.py`}: It needs a clean process # +# - {`pytest -v -s plugins/lora_resolvers`}: Unit tests for in-tree lora resolver plugins # +# * [LoRA TP (Distributed)]: # +# - There is some Tensor Parallelism related processing logic in LoRA that requires multi-GPU testing for validation. # +# - {`pytest -v -s -x lora/test_gptoss_tp.py`}: Disabled for now because MXFP4 backend on non-cuda platform doesn't support # +# LoRA yet. # +# * [Distributed Tests (GPU_TAG)]: Don't test llama model here, it seems hf implementation is buggy. See: # +# https://github.com/vllm-project/vllm/pull/5689 # +# * [Distributed Tests (GPU_TAG)]: Some old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 in # +# favor of new tests in fusions_e2e. We avoid replicating the new jobs in # +# this file as it's deprecated. 
# +# # +##################################################################################################################################### + + + + steps: @@ -41,18 +114,25 @@ steps: # # ##################################################################################################################################### -- label: Pytorch Nightly Dependency Override Check # 2min - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Pytorch Nightly Dependency Override Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + soft_fail: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - requirements/nightly_torch_test.txt + - vllm/platforms/rocm.py commands: - bash standalone_tests/pytorch_nightly_dependency.sh -- label: Async Engine, Inputs, Utils, Worker Test # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Async Engine, Inputs, Utils, Worker # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/detokenizer @@ -63,15 +143,20 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + no_gpu: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/test_inputs.py - tests/test_outputs.py - tests/test_pooling_params.py + - tests/test_ray_env.py - tests/multimodal - tests/renderers - tests/standalone_tests/lazy_imports.py @@ -79,12 +164,12 @@ steps: - tests/tool_parsers - tests/transformers_utils - tests/config - no_gpu: true commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s test_pooling_params.py + - pytest -v -s test_ray_env.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ @@ -92,22 +177,28 @@ steps: - pytest -v -s transformers_utils - pytest -v -s config -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Python-only Installation # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - tests/standalone_tests/python_only_compile.sh - setup.py + - vllm/platforms/rocm.py commands: - bash standalone_tests/python_only_compile.sh -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Correctness # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/basic_correctness/test_basic_correctness @@ -119,22 +210,25 @@ steps: - pytest -v -s 
basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py -- label: Entrypoints Unit Tests # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Entrypoints Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" fast_check: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/entrypoints - tests/entrypoints/ + - vllm/platforms/rocm.py commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Entrypoints Integration (LLM) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 fast_check: true torch_nightly: true @@ -149,30 +243,14 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py - pytest -v -s entrypoints/offline_mode -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - - pytest -v -s entrypoints/test_chat_utils.py -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Entrypoints Integration (API Server 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/rpc @@ -184,29 +262,14 @@ steps: - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/pooling - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling -- label: Entrypoints Integration Test 
(Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Entrypoints Integration (Responses API) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/openai/responses @@ -214,103 +277,59 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/responses -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - - examples/offline_inference/new_weight_syncing/ - - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - - pushd ../examples/offline_inference/new_weight_syncing - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd -- label: Distributed Tests (8 GPUs) # 4min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_8 - optional: true - num_gpus: 8 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py - - vllm/config/parallel.py - - vllm/distributed/ - - vllm/v1/engine/llm_engine.py - - 
vllm/v1/executor/uniproc_executor.py - - vllm/v1/worker/gpu_worker.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - -- label: EPLB Algorithm Test # 5min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative, amdgfx90a] +- label: EPLB Algorithm # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/eplb - tests/distributed/test_eplb_algo.py + - vllm/platforms/rocm.py commands: - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: EPLB Execution # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_4 num_gpus: 4 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/eplb - tests/distributed/test_eplb_execute.py + - tests/distributed/test_eplb_spec_decode.py + - vllm/platforms/rocm.py commands: - pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_spec_decode.py -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 + +- label: Elastic EP Scaling Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/compilation/ + - tests/distributed/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_elastic_ep.py + + +- label: Metrics, Tracing (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 num_gpus: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/v1/tracing @@ -322,9 +341,10 @@ steps: 'opentelemetry-semantic-conventions-ai>=0.4.1'" - pytest -v -s v1/tracing -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Regression # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -334,10 +354,13 @@ steps: - pip install modelscope - pytest -v -s test_regression.py -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Engine # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/engine @@ -348,730 +371,824 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 Test e2e + engine # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Engine (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] 
agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/v1/ + - tests/v1/engine/ + - vllm/platforms/rocm.py commands: - - pytest -v -s v1/e2e - - pytest -v -s v1/engine + - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py -- label: V1 Test e2e (2 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 + +- label: e2e Scheduling (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general/test_async_scheduling.py + + +- label: e2e Core (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + + +- label: Spec Decode Speculators + MTP # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - vllm/transformers_utils/configs/speculators/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" + + +- label: Spec Decode Ngram + Suffix # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" + + +- label: Spec Decode Draft Model # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" + + +- label: V1 e2e (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/e2e commands: - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" -- label: V1 Test e2e (4 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - 
agent_pool: mi250_4 - optional: true - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Entrypoints V1 # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1 commands: - - pytest -v -s v1/entrypoints + - pytest -v -s v1/entrypoints -- label: V1 Test others # 42min + +- label: V1 Sample + Logits # TBD timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py -- label: V1 Test attention (H100) # 10min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py commands: - - pytest -v -s v1/attention + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -- label: Batch Invariance Tests (H100) # 10min - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: V1 Speculative Decoding (slow) # TBD + timeout_in_minutes: 180 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ + - vllm/v1/spec_decode/ + - vllm/model_executor/models/ + - vllm/v1/attention/ + - vllm/model_executor/layers/ + - tests/v1/spec_decode/ + - vllm/platforms/rocm.py commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py -- label: V1 Test others (CPU) # 5 mins - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: V1 attention (H100-MI250) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/attention + + +- label: V1 others (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1 commands: - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics -- label: Examples Test # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Examples # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints - vllm/multimodal - examples/ + - vllm/platforms/rocm.py commands: - pip install tensorizer - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # Basic + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py 
+ - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # Multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # Features demo - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Platform Tests (CUDA) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/cuda commands: - - pytest -v -s cuda/test_cuda_context.py - - pytest -v -s cuda/test_platform_no_cuda_init.py + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Samplers Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py + - vllm/v1/sample/ + - vllm/beam_search.py - tests/samplers - tests/conftest.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s samplers + - pytest -v -s samplers -- label: LoRA Test %N # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: LoRA %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true parallelism: 4 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/lora - tests/lora + - vllm/platforms/rocm.py commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py 
--ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: PyTorch Compilation Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/compile - commands: - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Compilation Passes Unit Tests - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - vllm/ - - tests/compile/passes - commands: - - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - torch_nightly: true - source_file_dependencies: - - vllm/ + - vllm/compilation/ + - vllm/model_executor/layers/ + - vllm/v1/worker/ + - vllm/v1/attention/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - csrc/ - tests/compile + - vllm/platforms/rocm.py + commands: + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" + + +- label: PyTorch Fullgraph Smoke Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/compilation/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/config/compilation.py + - csrc/ + - tests/compile + - vllm/platforms/rocm.py commands: - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: PyTorch Fullgraph # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ + - vllm/compilation/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/config/compilation.py + - csrc/ - tests/compile + - vllm/platforms/rocm.py commands: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' -- label: Cudagraph test # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Cudagraph # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - tests/v1/cudagraph - vllm/v1/cudagraph_dispatcher.py - vllm/config/compilation.py - vllm/compilation + - vllm/platforms/rocm.py commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx90a] + +- label: Kernels Core Operation Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - tests/kernels/core - tests/kernels/test_top_k_per_row.py + - tests/kernels/test_concat_mla_q.py + - vllm/model_executor/layers/rotary_embedding/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py + - pytest -v -s kernels/core kernels/test_top_k_per_row.py -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - parallelism: 2 - source_file_dependencies: - - csrc/attention/ - - vllm/v1/attention - - vllm/model_executor/layers/attention - - tests/kernels/attention - commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Kernels Mamba Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true - parallelism: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/layers/quantization - - tests/kernels/quantization - commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - parallelism: 2 - source_file_dependencies: - - csrc/quantization/cutlass_w8a8/moe/ - - csrc/moe/ - - tests/kernels/moe - - vllm/model_executor/layers/fused_moe/ - - vllm/distributed/device_communicators/ - - vllm/envs.py - - vllm/config - commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - -- label: Kernels Mamba Test # 31min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba - vllm/model_executor/layers/mamba/ops + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/mamba + - pytest -v -s kernels/mamba -- label: Kernels Helion Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Kernels Helion Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/utils/import_utils.py - tests/kernels/helion/ + - vllm/platforms/rocm.py commands: - - pip install helion - - pytest -v -s kernels/helion/ + - pip install helion + - pytest -v -s kernels/helion/ -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Model Executor # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - 
vllm/engine/arg_utils.py - vllm/config/model.py - vllm/model_executor - tests/model_executor - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py -- label: Benchmarks # 11min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Benchmarks # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ + - vllm/platforms/rocm.py commands: - bash scripts/run-benchmarks.sh -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Benchmarks CLI Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/benchmarks/ commands: - pytest -v -s benchmarks/ -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - tests/quantization - commands: - - uv pip install --system torchao==0.14.1 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: OpenAI API correctness # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - - tools/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - bash ../tools/install_torchcodec_rocm.sh || exit 1 - pytest -s entrypoints/openai/correctness/ -- label: Basic Models Tests (Initialization) # 15min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Tests (Initialization) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/test_initialization.py + - 
tests/models/registry.py commands: - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset -- label: Basic Models Tests (Extra Initialization) %N # 15min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Tests (Extra Initialization) %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/models/ - - vllm/transformers_utils/ + - vllm/model_executor/layers/ - tests/models/test_initialization.py + - tests/models/registry.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Basic Models Tests (Other) # 15min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Tests (Other) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/test_terratorch.py - tests/models/test_transformers.py - tests/models/test_registry.py commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py -- label: Basic Models Test (Other CPU) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Test (Other CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - torch_nightly: true no_gpu: true + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/test_utils.py - tests/models/test_vision.py commands: - - pytest -v -s models/test_utils.py models/test_vision.py + - pytest -v -s models/test_utils.py models/test_vision.py -- label: Language Models Tests (Standard) # 18min - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/language - commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Language Models Tests (Extra Standard) %N # 27min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Language Models Tests (Extra Standard) %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true torch_nightly: true parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py - 
tests/models/language/pooling/test_embedding.py - tests/models/language/generation/test_common.py - tests/models/language/pooling/test_classification.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pip freeze | grep -E 'torch' + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Language Models Tests (Hybrid) %N # 50min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true - torch_nightly: true - parallelism: 2 - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Language Models Test (PPL) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - -- label: Language Models Test (PPL) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation_ppl_test commands: - - pytest -v -s models/language/generation_ppl_test + - pytest -v -s models/language/generation_ppl_test -- label: Language Models Test (Extended Pooling) # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Language Models Test (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/pooling commands: - - pytest -v -s models/language/pooling -m 'not core_model' + - pytest -v -s models/language/pooling -m 'not core_model' -- label: Language Models Test (MTEB) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Language Models Test (MTEB) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/pooling_mteb_test commands: - - pytest -v -s 
models/language/pooling_mteb_test + - pytest -v -s models/language/pooling_mteb_test -- label: Multi-Modal Processor Test (CPU) # 15min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Processor (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal - tests/models/registry.py commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py -- label: Multi-Modal Processor Test # 44min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - vllm/ - - tests/models/multimodal - - tests/models/registry.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 100 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py - - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - -- label: Multi-Modal Accuracy Eval (Small Models) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Multi-Modal Accuracy Eval (Small Models) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - vllm/multimodal/ - vllm/inputs/ - vllm/v1/core/ + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 -- label: Multi-Modal Models Test (Extended Generation 1) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model + + +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model + + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model + + +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation - tests/models/multimodal/test_mapping.py commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s 
models/multimodal/test_mapping.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model -- label: Multi-Modal Models Test (Extended Generation 2) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Models (Extended Generation 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py + + +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models Test (Extended Generation 3) # 75min - timeout_in_minutes: 150 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' -- label: Multi-Modal Models Test (Extended Pooling) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/pooling commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -v -s models/multimodal/pooling -m 'not core_model' + - 
pytest -v -s models/multimodal/pooling -m 'not core_model' -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Distributed Comm Ops # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - source_file_dependencies: - - vllm/model_executor/layers/quantization - - tests/models/quantization - commands: - - pytest -v -s models/quantization - -- label: Transformers Nightly Models Test # 60 min - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/" - optional: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/offline_inference/basic/chat.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 num_gpus: 2 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed - tests/distributed + - vllm/platforms/rocm.py commands: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_buffer.py - pytest -v -s distributed/test_shm_storage.py -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdmultinode, amdgfx90a] - agent_pool: mi250_4 - optional: true + +- label: Distributed DP Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 num_gpus: 2 - num_nodes: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/ - vllm/engine/ - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - vllm/platforms/rocm.py commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed' - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v 
-s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Distributed Compile + RPC Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_2 - optional: true num_gpus: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/compilation/ @@ -1083,40 +1200,58 @@ steps: - vllm/v1/worker/ - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - - examples/offline_inference/new_weight_syncing/ + - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py + + +- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/distributed/ + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Distributed Model Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
-  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
+  - vllm/model_executor/layers/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
@@ -1125,46 +1260,52 @@ steps:
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'

-- label: Plugin Tests (2 GPUs) # 40min
-  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Plugin Tests (2 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
+  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
+  - vllm/platforms/rocm.py
  commands:
-  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  # BEGIN: platform plugin and general plugin tests; everything in between runs on the dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
-  # end platform plugin tests
-  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  # END: platform plugin tests
+  # BEGIN: `io_processor` plugins test; everything in between uses the `prithvi_io_processor` plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
-  # test bge_m3_sparse io_processor plugin
+  # END: `io_processor` plugins test
+  # BEGIN: `bge_m3_sparse` `io_processor` plugin test
  - pip install -e ./plugins/bge_m3_sparse_plugin
  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
  - pip uninstall bge_m3_sparse_plugin -y
-  # end io_processor plugins test
-  # begin stat_logger plugins test
+  # END: `bge_m3_sparse` `io_processor` plugin test
+  # BEGIN: `stat_logger` plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
-  # end stat_logger plugins test
-  # other tests continue here:
+  # END: `stat_logger` plugins test
+  # BEGIN: other tests
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py
  - pytest -v -s models/test_oot_registration.py
  - pytest -v -s plugins/lora_resolvers
+  # END: other tests

-- label: Pipeline + Context Parallelism Test # 45min
-  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Pipeline + Context Parallelism (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
@@ -1173,325 +1314,130 @@ steps:
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
+  - vllm/model_executor/layers/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
  - tests/distributed/
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

-- label: LoRA TP Test (Distributed) # 17 min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Ray Dependency Compatibility Check # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
+  agent_pool: mi250_1
+  working_dir: "/"
+  source_file_dependencies:
+  - requirements/
+  - setup.py
+  - vllm/platforms/rocm.py
+  commands:
+  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
+
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
+  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
-  - vllm/lora
-  - tests/lora
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_chatglm3_tp.py
-  - pytest -v -s -x lora/test_llama_tp.py
-  - pytest -v -s -x lora/test_llm_with_multi_loras.py
-  - pytest -v -s -x lora/test_olmoe_tp.py
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

-- label: Weight Loading Multiple GPU Test # 33min
-  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Distributed NixlConnector PD accuracy, DP+EP (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
+  agent_pool: mi250_4
+  num_gpus: 4
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+
+- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
-  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/v1/worker/kv_connector_model_runner_mixin.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
  commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh

-- label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Distributed NixlConnector PD accuracy, CrossLayer KV layout (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
+  agent_pool: mi250_4
+  num_gpus: 4
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+
+- label: Distributed Tests (2 GPUs) (H100-MI250) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
-  optional: true
-  working_dir: "/vllm-workspace/tests"
+  working_dir: "/vllm-workspace/"
  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
-  commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
-
-- label: NixlConnector PD accuracy tests (Distributed) # 30min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  num_gpus: 4
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-  - tests/v1/kv_connector/nixl_integration/
-  commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
-- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
-  timeout_in_minutes: 15
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  num_gpus: 4
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-  - tests/v1/kv_connector/nixl_integration/
-  commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-  - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
-- label: Distributed Tests (A100) # 68min
-  timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  optional: true
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/
+  - vllm/distributed/
+  - vllm/v1/distributed/
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
+  - tests/distributed/test_context_parallel.py
+  - tests/v1/distributed/test_dbo.py
+  - examples/offline_inference/data_parallel.py
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
-  - pytest -v -s distributed/test_custom_all_reduce.py
-  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-  - pytest -v -s -x lora/test_mixtral.py
-
-- label: LM Eval Large Models # 80min
-  timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
-- label: LM Eval Large Models (H100) # 80min
-  timeout_in_minutes: 110
- 
mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4 - -- label: Distributed Tests (H200) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace/" - commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - -- label: LM Eval Small Models (1 Card) # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: LM Eval Large Models (4 Card) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -- label: ROCm LM Eval Large Models (8 Card) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_8 - num_gpus: 8 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 - -- label: ROCm GPT-OSS Eval # 80min - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - agent_pool: mi250_1 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - optional: true - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -- label: DeepSeek V2-Lite Accuracy # 70min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # 70min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - 
num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + - pytest -v -s tests/distributed/test_context_parallel.py + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + - pytest -v -s tests/v1/distributed/test_dbo.py -################################################### -# # -# MI325 test definitions # -# # -################################################### +##################################################################################################################################### +# # +# gfx942 # +# # +##################################################################################################################################### -##### fast check tests ##### - -- label: Pytorch Nightly Dependency Override Check # 2min - # if this test fails, it means the nightly torch version is not compatible with some - # of the dependencies. Please check the error message and add the package to whitelist - # in /vllm/tools/pre_commit/generate_nightly_torch_test.py - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - optional: true - soft_fail: true - source_file_dependencies: - - requirements/nightly_torch_test.txt - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Async Engine, Inputs, Utils, Worker Test # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - source_file_dependencies: - - vllm/ - - tests/detokenizer - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s detokenizer - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] +- label: Entrypoints Integration (LLM) # 13.1m + timeout_in_minutes: 22 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/multimodal - - tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - no_gpu: true - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config - -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking fast_check: true torch_nightly: true - source_file_dependencies: - - vllm/ - - 
tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - -- label: Entrypoints Unit Tests # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" - fast_check: true - source_file_dependencies: - - vllm/entrypoints - - tests/entrypoints/ - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true source_file_dependencies: - vllm/ - tests/entrypoints/llm @@ -1499,36 +1445,35 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - pytest -v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Entrypoints Integration (API Server 1) # 1h 7m + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/openai - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/test_chat_utils.py -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Entrypoints Integration (API Server 2) #26.9m + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking - working_dir: 
"/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/rpc @@ -1540,15 +1485,14 @@ steps: - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Entrypoints Integration (Pooling) # 22.8m + timeout_in_minutes: 48 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/pooling @@ -1556,61 +1500,50 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling -- label: Entrypoints Integration Test (Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai/responses - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/responses -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Distributed Torchrun + Examples (4 GPUs) # TBD + timeout_in_minutes: 80 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 4 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_torchrun_example.py + - tests/distributed/test_torchrun_example_moe.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - examples/rl/ - tests/examples/offline_inference/data_parallel.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + # rlhf examples + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py + + +- label: Distributed DP Tests (4 GPUs) # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ - tests/v1/distributed - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_utils + - vllm/platforms/rocm.py commands: - # 
Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - # test with torchrun tp=2 and external_dp=2 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=2 and pp=2 - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=4 and dp=1 - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2, pp=2 and dp=1 - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=1 and dp=4 with ep - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2 and dp=2 with ep - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with internal dp - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -1618,32 +1551,37 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py + + +- label: Distributed Compile + Comm (4 GPUs) # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_multiproc_executor.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - # OLD rlhf examples - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - # NEW rlhf examples - - pushd ../examples/rl - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd + - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node -- label: Distributed Tests (8 GPUs) # 4min + +- label: Distributed Tests (8 GPUs)(H100-MI325) # 6.4m timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_8 - optional: true - # grade: Blocking - gpu: h100 num_gpus: 8 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - examples/offline_inference/torchrun_dp_example.py @@ -1652,77 +1590,35 @@ steps: - vllm/v1/engine/llm_engine.py - vllm/v1/executor/uniproc_executor.py - vllm/v1/worker/gpu_worker.py + - 
vllm/platforms/rocm.py commands: - # test with torchrun tp=2 and dp=4 with ep - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep -- label: EPLB Algorithm Test # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_algo.py - commands: - - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Elastic EP Scaling Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/tests" num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_execute.py + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/compilation/ + - tests/distributed/ + - vllm/platforms/rocm.py commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py + - pytest -v -s distributed/test_elastic_ep.py -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - # grade: Blocking - num_gpus: 2 - source_file_dependencies: - - vllm/ - - tests/v1/tracing - commands: - - "pip install \ - 'opentelemetry-sdk>=1.26.0' \ - 'opentelemetry-api>=1.26.0' \ - 'opentelemetry-exporter-otlp>=1.26.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s v1/tracing -##### fast check tests ##### -##### 1 GPU test ##### - -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] +- label: Engine # 11.3m + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - grade: Blocking - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/engine @@ -1733,347 +1629,435 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 Test e2e + engine # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Engine (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/v1/engine/ + - tests/v1/engine/ + - vllm/platforms/rocm.py commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. 
- - pytest -v -s v1/e2e - - pytest -v -s v1/engine + - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py -- label: V1 Test e2e (2 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: e2e Scheduling (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general/test_async_scheduling.py + + +- label: e2e Core (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + + +- label: Spec Decode Eagle # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness" + + +- label: Spec Decode Speculators + MTP # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - vllm/transformers_utils/configs/speculators/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" + + +- label: Spec Decode Ngram + Suffix # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" + + +- label: Spec Decode Draft Model # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" + + +- label: V1 e2e (2 GPUs) # 7.1m + timeout_in_minutes: 12 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" 
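+  # Only run the spec-decode cases that need exactly 2 GPUs
+  # (selected via the -k "tensor_parallelism" filter below).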
source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/e2e commands: - # Only run tests that need exactly 2 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" -- label: V1 Test e2e (4 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] - # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. - # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + +- label: V1 e2e (4 GPUs) # 52.6m + timeout_in_minutes: 106 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/e2e commands: - # Only run tests that need 4 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints -- label: V1 Test others # 42min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # split the test to avoid interference - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - # Integration test for streaming correctness (requires special branch). 
- - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - -# TODO: Add the "V1 Test attention (MI300)" test group - -- label: V1 Test attention (H100) # 10min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - -- label: Batch Invariance Tests (H100) # 10min - mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - timeout_in_minutes: 25 - gpu: h100 - source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py - -- label: V1 Test others (CPU) # 5 mins - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/v1 - no_gpu: true - commands: - # split the test to avoid interference - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - - -- label: Examples Test # 30min +- label: Entrypoints V1 # 25.7m timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + + +- label: V1 Spec Decode # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/spec_decode + commands: + - pytest -v -s -m 'not slow_test' v1/spec_decode + + +- label: V1 Sample + Logits # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py + commands: + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest 
-v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + # - export HSA_NO_SCRATCH_RECLAIM=1 + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + + +- label: V1 Speculative Decoding (slow) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/model_executor/models/ + - vllm/v1/attention/ + - vllm/model_executor/layers/ + - tests/v1/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py + + +- label: Acceptance Length Test (Large Models) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/model_executor/models/mlp_speculator.py + - tests/v1/spec_decode/test_acceptance_length.py + - vllm/platforms/rocm.py + commands: + - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 + - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test + + +- label: V1 attention (H100-MI325) # 14.5m + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/attention + + +- label: Batch Invariance (H100-MI325) # 5.2m + timeout_in_minutes: 12 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + + +- label: V1 others (CPU) # 10.4m + timeout_in_minutes: 28 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples # 24.5m + timeout_in_minutes: 55 + mirror_hardwares: [amdexperimental, amdproduction, 
amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints - vllm/multimodal - examples/ + - vllm/platforms/rocm.py commands: - - pip install tensorizer # for tensorizer test - # for basic + - pip install tensorizer + # Basic - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/classify.py - python3 basic/offline_inference/embed.py - python3 basic/offline_inference/score.py - # for multi-modal models + # Multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - # for pooling models + # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # for features demo + # Features demo - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Platform Tests (CUDA) # 5.0m + timeout_in_minutes: 9 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/cuda commands: - - pytest -v -s cuda/test_cuda_context.py - - pytest -v -s cuda/test_platform_no_cuda_init.py + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - tests/conftest.py - commands: - - pytest -v -s samplers -- label: LoRA Test %N # 20min each - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] +- label: PyTorch Compilation Passes Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py - parallelism: 4 - -##### .buildkite/test_areas/pytorch.yaml ##### -# corresponds to .buildkite/test_areas/pytorch.yaml -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run unit tests defined directly under compile/, - # not including subdirectories, which are usually heavier - # tests covered elsewhere. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -# corresponds to .buildkite/test_areas/pytorch.yaml -- label: PyTorch Compilation Passes Unit Tests - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - source_file_dependencies: - - vllm/ - - tests/compile/passes - commands: - # TODO: clean up this comment if not needed. It is used to - # keep track of the tests changes during vLLM IR Ops refactoring. - # Use `find` to launch multiple instances of pytest. - - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/compile + - tests/compile/passes commands: - # Run smoke tests under fullgraph directory, except test_full_graph.py - # as it is a heavy test that is covered in other steps. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + - pytest -s -v compile/passes --ignore compile/passes/distributed -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # # Limit to no custom ops to reduce running time - # # Wrap with quotes to escape yaml and avoid starting -k string with a - - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
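+# Note on sharded steps: several kernel steps below combine `parallelism: N`
+# with `--shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT`.
+# Buildkite launches N jobs for such a step and exposes BUILDKITE_PARALLEL_JOB
+# (0..N-1) and BUILDKITE_PARALLEL_JOB_COUNT to each job; the doubled `$$`
+# defers variable interpolation from pipeline-upload time to job runtime.
+# As a rough sketch, with `parallelism: 2` the first job effectively runs:
+#
+#   pytest -v -s kernels/attention --shard-id=0 --num-shards=2
+#
+# and the second job runs the complementary shard with --shard-id=1.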
-- label: Cudagraph test - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Kernels Core Operation Test # 26.8m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - source_file_dependencies: - - tests/v1/cudagraph - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - vllm/compilation - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - tests/kernels/core - tests/kernels/test_top_k_per_row.py + - tests/kernels/test_concat_mla_q.py + - vllm/model_executor/layers/rotary_embedding/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py + - pytest -v -s kernels/core kernels/test_top_k_per_row.py -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels Attention Test %N # 17.7m + timeout_in_minutes: 28 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/attention/ - vllm/v1/attention - # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - vllm/model_executor/layers/attention - tests/kernels/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels Quantization Test %N # 15.2m + timeout_in_minutes: 24 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking + parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/ - vllm/model_executor/layers/quantization - tests/kernels/quantization + - tests/kernels/quantization/test_rocm_skinny_gemms.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/kernels/ commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels MoE Test %N # TBD + timeout_in_minutes: 19 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + parallelism: 4 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ @@ -2082,533 +2066,301 @@ steps: - vllm/distributed/device_communicators/ - vllm/envs.py - vllm/config + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s 
kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels FP8 MoE Test - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels FP8 MoE Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/moe/ + - csrc/quantization/w8a8/cutlass/moe/ + - vllm/model_executor/layers/fused_moe/ + - tests/kernels/moe/test_deepep_moe.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/envs.py commands: - pytest -v -s kernels/moe/test_deepep_moe.py -- label: Kernels Mamba Test # 31min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: ROCm AITER Ops Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - csrc/mamba/ - - tests/kernels/mamba - - vllm/model_executor/layers/mamba/ops + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + - tests/rocm/aiter/ + - vllm/v1/attention/backends/mla/rocm_aiter_mla.py + - vllm/v1/attention/selector.py commands: - - pytest -v -s kernels/mamba + - pytest -v -s rocm/aiter/ -- label: Kernels DeepGEMM Test (H100) # Nvidia-centric -# Not replicating for CUTLAS & CuTe - timeout_in_minutes: 45 - gpu: h100 - num_gpus: 1 - source_file_dependencies: - - tools/install_deepgemm.sh - - vllm/utils/deep_gemm.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization - - tests/kernels/quantization/test_block_fp8.py - - tests/kernels/moe/test_deepgemm.py - - tests/kernels/moe/test_batched_deepgemm.py - - tests/kernels/attention/test_deepgemm_attention.py - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py -- label: Kernels Helion Test - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - source_file_dependencies: - - vllm/utils/import_utils.py - - tests/kernels/helion/ - commands: - - pip install helion - - pytest -v -s kernels/helion/ - -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - vllm/engine/arg_utils.py - - vllm/config/model.py - - vllm/model_executor - - tests/model_executor - - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py - -- label: Benchmarks # 11min +- label: Benchmarks # 8.2m timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, 
amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + optional: true working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ + - vllm/platforms/rocm.py commands: - bash scripts/run-benchmarks.sh -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/benchmarks/ - commands: - - pytest -v -s benchmarks/ -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Quantization # 36.1m + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py - tests/quantization commands: - # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release, and pin a working version of torchao nightly here - - # since torchao nightly is only compatible with torch nightly currently - # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now - # we can only upgrade after this is resolved - # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.14.1 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Language Models Tests (Standard) # 22.8m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - - tools/ - commands: # LMEval+Transcription WER check - - bash ../tools/install_torchcodec_rocm.sh || exit 1 - - pytest -s entrypoints/openai/correctness/ - - -##### models test ##### - -- label: Basic Models Tests (Initialization) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_initialization.py - commands: - # Run a subset of model initialization tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - -- label: Basic Models Tests (Extra Initialization) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/transformers_utils/ - - tests/models/test_initialization.py - commands: - # Only when vLLM model source is modified - test initialization of a large - # subset of supported models (the complement of the small subset in the above - # test.) 
Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Basic Models Tests (Other) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_terratorch.py - - tests/models/test_transformers.py - - tests/models/test_registry.py - commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - -- label: Basic Models Test (Other CPU) # 5min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - timeout_in_minutes: 10 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_utils.py - - tests/models/test_vision.py - no_gpu: true - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - -- label: Language Models Tests (Standard) - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language commands: - # Test standard language models, excluding a subset of slow tests - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Language Models Tests (Extra Standard) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Language Models Tests (Hybrid) %N # 34.9m + timeout_in_minutes: 55 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - tests/models/language/pooling/test_embedding.py - - tests/models/language/generation/test_common.py - - tests/models/language/pooling/test_classification.py - commands: - # Shard slow subset of standard language models tests. 
Only run when model - # source is modified, or when specified test files are modified - - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - -- label: Language Models Tests (Hybrid) %N - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Language Models Test (Extended Generation) # 32.2m + timeout_in_minutes: 55 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' -- label: Language Models Test (PPL) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test -- label: Language Models Test (Extended Pooling) # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling - commands: - - pytest -v -s models/language/pooling -m 'not core_model' - -- label: Language Models Test (MTEB) - timeout_in_minutes: 110 - 
mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling_mteb_test - commands: - - pytest -v -s models/language/pooling_mteb_test - -- label: Multi-Modal Processor Test (CPU) - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Multi-Modal Processor # 1h 42m + timeout_in_minutes: 138 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - - tests/models/registry.py - no_gpu: true - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Processor Test # 44min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal - tests/models/registry.py commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing/test_tensor_schema.py -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 100 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py - - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model -- label: Multi-Modal Accuracy Eval (Small Models) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ + - vllm/ + - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 1) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation - tests/models/multimodal/test_mapping.py commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s models/multimodal/test_mapping.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 2) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py 
--ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model -- label: Multi-Modal Models Test (Extended 3) # 75min - timeout_in_minutes: 150 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Multi-Modal Models (Extended Generation 1) # 1h 2m + timeout_in_minutes: 106 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py + + +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models Test (Extended Pooling) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + + +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/pooling commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -v -s models/multimodal/pooling -m 'not core_model' + - pytest -v -s models/multimodal/pooling -m 'not core_model' -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Quantized Models Test # 21.4m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/layers/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py - tests/models/quantization + - vllm/model_executor/model_loader/ commands: - - pytest -v -s models/quantization + - pytest -v -s models/quantization -- label: Transformers 
Nightly Models Test - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Transformers Nightly Models # 50.9m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - working_dir: "/vllm-workspace/" optional: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/basic/offline_inference/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - # Whisper needs spawn method to avoid deadlock - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 working_dir: "/vllm-workspace/" - gpu: b200 source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/multimodal/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/models/ + - examples/ commands: - - nvidia-smi - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py - # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/basic/offline_inference/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # # Wrap with quotes to escape yaml - # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new 
tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - -- label: Blackwell GPT-OSS Eval - timeout_in_minutes: 60 +- label: Quantized MoE Test (B200-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 working_dir: "/vllm-workspace/" - gpu: b200 - optional: true # run on nightlies source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -- label: Blackwell Quantized MoE Test - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - tests/quantization/test_blackwell_moe.py + - tests/quantization/test_gfx3xx_moe.py - vllm/model_executor/models/deepseek_v2.py - vllm/model_executor/models/gpt_oss.py - vllm/model_executor/models/llama4.py @@ -2616,65 +2368,49 @@ steps: - vllm/model_executor/layers/quantization/compressed_tensors - vllm/model_executor/layers/quantization/modelopt.py - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/triton_attn.py + - vllm/v1/attention/backends/rocm_attn.py + - vllm/v1/attention/backends/rocm_aiter_fa.py + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - - pytest -s -v tests/quantization/test_blackwell_moe.py + - pytest -s -v tests/quantization/test_gfx3xx_moe.py -##### 1 GPU test ##### -##### multi gpus test ##### -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Distributed DP Tests (2 GPUs) # 56.1m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 - source_file_dependencies: - - vllm/distributed - - tests/distributed - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdmultinode] - agent_pool: mi325_4 - optional: true - # grade: Blocking working_dir: "/vllm-workspace/tests" - num_gpus: 2 - num_nodes: 2 source_file_dependencies: - vllm/distributed/ - vllm/engine/ - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - vllm/platforms/rocm.py commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed' - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 
--nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Distributed Compile + RPC Tests (2 GPUs) # 56.1m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/compilation/ - vllm/distributed/ @@ -2685,381 +2421,446 @@ steps: - vllm/v1/worker/ - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - - examples/rl/ + - vllm/platforms/rocm.py commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py + + +- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # 56.1m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/distributed/ + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + - 
vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Distributed Model Tests (2 GPUs) # 19.3m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/model_loader/sharded_state_loader.py - vllm/model_executor/models/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py - tests/basic_correctness/ - tests/model_executor/model_loader/test_sharded_state_loader.py - tests/models/ commands: - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)' - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' -- label: Plugin Tests (2 GPUs) # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/plugins/ - - tests/plugins/ - commands: - # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - # end platform plugin tests - # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - # test bge_m3_sparse io_processor plugin - - pip install -e ./plugins/bge_m3_sparse_plugin - - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py - - pip uninstall bge_m3_sparse_plugin -y - # end io_processor plugins test - # begin stat_logger plugins test - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - # end stat_logger plugins test - # other tests continue here: - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process - - pytest -v -s 
models/test_oot_registration.py # it needs a clean process - - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Pipeline + Context Parallelism Test # 45min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] +- label: LoRA TP (Distributed) # 9.8m + timeout_in_minutes: 18 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - optional: true - # grade: Blocking + num_gpus: 4 working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - -- label: LoRA TP Test (Distributed) # 17 min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - optional: true - # grade: Blocking - num_gpus: 4 source_file_dependencies: - vllm/lora - tests/lora + - vllm/platforms/rocm.py commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # There is some Tensor Parallelism related processing logic in LoRA that - # requires multi-GPU testing for validation. - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - # Disabled for now because MXFP4 backend on non-cuda platform - # doesn't support LoRA yet - #- pytest -v -s -x lora/test_gptoss_tp.py + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py -- label: Weight Loading Multiple GPU Test # 33min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Weight Loading Multiple GPU # 7.5m + timeout_in_minutes: 14 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt -- label: Weight Loading Multiple GPU Test - Large Models # optional - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Weight Loading Multiple GPU - Large Models # 12.6m + timeout_in_minutes: 26 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt -- label: NixlConnector PD accuracy tests (Distributed) # 30min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 + +- label: 
Ray Dependency Compatibility Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 optional: true - # grade: Blocking - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 + working_dir: "/" source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ + - requirements/ + - setup.py + - vllm/platforms/rocm.py commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh -- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Distributed NixlConnector PD accuracy (4 GPUs) # 27.4m + timeout_in_minutes: 44 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 + num_gpus: 4 optional: true - # grade: Blocking - timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_devices: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -##### multi gpus test ##### -##### A100 test ##### - -- label: Distributed Tests (A100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: a100 optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: Distributed Tests (4 GPUs)(A100-MI325) # 20.9m + timeout_in_minutes: 37 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 + working_dir: "/vllm-workspace/tests" 
source_file_dependencies: - vllm/ commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py -- label: LM Eval Large Models # optional - gpu: a100 - optional: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### FP8 test ##### -- label: LM Eval Large Models (H100) # optional, still use H100 for consistency - gpu: h100 - optional: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4 - - -##### H200 test ##### -- label: Distributed Tests (H200) # optional - mirror_hardwares: [amdexperimental, amdproduction] +- label: Distributed Tests (2 GPUs)(H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - gpu: h200 + num_gpus: 2 optional: true working_dir: "/vllm-workspace/" - num_gpus: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/v1/distributed/ + - vllm/model_executor/layers/fused_moe/ + - tests/distributed/test_context_parallel.py + - tests/v1/distributed/test_dbo.py + - examples/offline_inference/data_parallel.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - # TODO: this test is not supported on ROCm, there are aiter kernels for this. - # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
-  - pytest -v -s tests/distributed/test_context_parallel.py
-  - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
-  # this test is not supported on ROCm
-  # - pytest -v -s tests/v1/distributed/test_dbo.py
+  - export TORCH_NCCL_BLOCKING_WAIT=1
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+  - pytest -v -s tests/v1/distributed/test_dbo.py
 
-##### B200 test #####
-- label: Distributed Tests (B200) # optional
-  gpu: b200
-  optional: true
+
+- label: Distributed Compile Unit Tests (2xH100-2xMI325) # 14.3m
+  timeout_in_minutes: 32
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
+  agent_pool: mi325_2
+  num_gpus: 2
   working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/model_executor/layers
+  - tests/compile/passes/distributed/
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
   commands:
-  - pytest -v -s tests/distributed/test_context_parallel.py
-  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-  - pytest -v -s tests/v1/distributed/test_dbo.py
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/passes/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
+  # TODO: this test is not supported on ROCm, there are aiter kernels for this.
+  # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+  # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
 
-##### E2E Eval Tests #####
-- label: LM Eval Small Models (1 Card) # 15min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+
+- label: LM Eval Small Models # 13.3m
+  timeout_in_minutes: 23
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_1
-  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
+  - vllm/model_executor/models/
+  - vllm/model_executor/model_loader/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
-- label: LM Eval Large Models (4 Card)
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: a100
+
+- label: LM Eval Small Models (B200-MI325) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
+  agent_pool: mi325_2
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - vllm/model_executor/models/
+  - vllm/model_executor/model_loader/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt
+
+
+- label: LM Eval
Large Models (H200-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_8 optional: true + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - csrc/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/evals/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx.txt + + +- label: LM Eval Large Models (4 GPUs)(FP8) # 24.8m + timeout_in_minutes: 42 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 + + +- label: LM Eval Large Models (4 GPUs)(A100-MI325) # 17.3m + timeout_in_minutes: 27 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -- label: ROCm LM Eval Large Models (8 Card) - mirror_hardwares: [amdexperimental, amdproduction] + +- label: ROCm LM Eval Large Models (8 Card) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_8 optional: true num_gpus: 8 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - csrc/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 -- label: ROCm GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/tests" - agent_pool: mi325_1 - mirror_hardwares: [amdexperimental, amdproduction] - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt -##### EPLB Accuracy Tests ##### -- label: DeepSeek V2-Lite Accuracy - 
mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 60 - gpu: h100 +- label: GPQA Eval (GPT-OSS) (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/fused_moe/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/evals/gpt_oss/ + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt + + +- label: DeepSeek V2-Lite Accuracy # 6.7m + timeout_in_minutes: 12 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 -- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 + +- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + num_gpus: 1 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030 + + +- label: Qwen3-30B-A3B-FP8-block Accuracy # 6.4m + timeout_in_minutes: 11 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # 10.9m + timeout_in_minutes: 22 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - # grade: Blocking - optional: true num_gpus: 4 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/spec_decode/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 @@ -3072,11 +2873,12 @@ steps: ## TODO: Enable the test in this group # # corresponds to .buildkite/test_areas/compile.yaml -# - label: Fusion and Compile Unit Tests (2xMI325 GPUs) -# timeout_in_minutes: 20 -# working_dir: "/vllm-workspace/" -# mirror_hardwares: [amdexperimental, amdproduction, tj] +# - label: Fusion and Compile Unit Tests (2xB200-2xMI325) # TBD +# timeout_in_minutes: 180 +# mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325, tj] # agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs +# num_gpus: 1 +# working_dir: "/vllm-workspace/" # source_file_dependencies: # - csrc/quantization/fp4/ # - vllm/model_executor/layers/quantization/ @@ -3100,205 +2902,91 @@ steps: # # TODO: find out more details # # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile -# corresponds to .buildkite/test_areas/compile.yaml -- label: Fusion E2E Quick (MI325) - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - num_devices: 1 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" - # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" -# corresponds to .buildkite/test_areas/compile.yaml -- label: Fusion E2E Config Sweep (MI325) - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] +- label: Fusion E2E Quick (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - num_devices: 1 + num_gpus: 1 + working_dir: "/vllm-workspace/" source_file_dependencies: - - csrc/quantization/ - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/fusions_e2e/ + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - rocm-smi - # Run just llama3 (fp8) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" + - rocm-smi + # 
Run all models and attn backends but only Inductor partition and native custom ops + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" + + +- label: Fusion E2E Config Sweep (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + num_gpus: 1 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - rocm-smi + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" ## There are no ops on ROCm for these tests. ## The test still passes but the logs are not useful. ## fused ops just call torch.ops.symm_mem which ## exists in ROCm even though they don't work -# - label: AsyncTP Correctness Tests (2xMI325 GPUs) -# - label: Fusion E2E TP2 Quick (MI325) -# - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) -# - label: Fusion E2E TP2 (MI325) -# - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) +# - label: AsyncTP Correctness Tests (2xH100-2xMI325) +# - label: Fusion E2E TP2 Quick (H100-MI325) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100-MI325) +# - label: Fusion E2E TP2 (B200-MI325) +# - label: Sequence Parallel Correctness Tests (2xH100-2xMI325) ##################################################################################################################################### # # -# MI355 test definitions ( currently the test set is completely mirrored // TBD which tests are to be routed there ultimately) # +# gfx950 # # # ##################################################################################################################################### -- label: Pytorch Nightly Dependency Override Check # 2min - # if this test fails, it means the nightly torch version is not compatible with some - # of the dependencies. 
Please check the error message and add the package to whitelist - # in /vllm/tools/pre_commit/generate_nightly_torch_test.py - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] +- label: Entrypoints Integration (API Server 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - soft_fail: true - source_file_dependencies: - - requirements/nightly_torch_test.txt - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Async Engine, Inputs, Utils, Worker Test # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/multimodal - - tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - no_gpu: true - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config - -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true fast_check: true torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - -- label: Entrypoints Unit Tests # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" - fast_check: true - source_file_dependencies: - - vllm/entrypoints - - tests/entrypoints/ - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - 
tests/entrypoints/llm - - tests/entrypoints/offline_mode - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true source_file_dependencies: - vllm/ - tests/entrypoints/openai - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/test_chat_utils.py -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + +- label: Entrypoints Integration (API Server 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/rpc @@ -3310,14 +2998,14 @@ steps: - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + +- label: Entrypoints Integration (Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/pooling @@ -3325,509 +3013,195 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling -- label: Entrypoints Integration Test (Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + +- label: Regression # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 optional: true working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai/responses - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/responses - -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - # grade: Blocking - working_dir: 
"/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - - examples/rl/ - - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py - commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - # test with torchrun tp=2 and external_dp=2 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=2 and pp=2 - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=4 and dp=1 - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2, pp=2 and dp=1 - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=1 and dp=4 with ep - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2 and dp=2 with ep - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with internal dp - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - # OLD rlhf examples - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - # NEW rlhf examples - - pushd ../examples/rl - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd - -- label: Distributed Tests (8 GPUs) # 4min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_8 - optional: true - gpu: h100 - num_gpus: 8 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py - - vllm/config/parallel.py - - vllm/distributed/ - - vllm/v1/engine/llm_engine.py - - vllm/v1/executor/uniproc_executor.py - - vllm/v1/worker/gpu_worker.py - commands: - # test with torchrun tp=2 and dp=4 with ep - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - 
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - -- label: EPLB Algorithm Test # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_algo.py - commands: - - pytest -v -s distributed/test_eplb_algo.py - -- label: EPLB Execution Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_execute.py - commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - num_gpus: 2 - source_file_dependencies: - - vllm/ - - tests/v1/tracing - commands: - - "pip install \ - 'opentelemetry-sdk>=1.26.0' \ - 'opentelemetry-api>=1.26.0' \ - 'opentelemetry-exporter-otlp>=1.26.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s v1/tracing - -##### fast check tests ##### -##### 1 GPU test ##### - -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 source_file_dependencies: - vllm/ - tests/test_regression commands: - pip install modelscope - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: V1 Spec Decode # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/engine - - tests/test_sequence - - tests/test_config - - tests/test_logger - - tests/test_vllm_port + - tests/v1/spec_decode commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + - pytest -v -s -m 'not slow_test' v1/spec_decode -- label: V1 Test e2e + engine # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e - - pytest -v -s v1/engine - -- label: V1 Test e2e (2 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # Only run tests that need exactly 2 GPUs - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" - -- label: V1 Test e2e (4 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. 
- # See discussion here: https://github.com/vllm-project/vllm/pull/31040 - agent_pool: mi355_4 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # Only run tests that need 4 GPUs - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" - -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - -- label: V1 Test others # 42min +- label: V1 Sample + Logits # TBD timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py commands: - # split the test to avoid interference - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - # Integration test for streaming correctness (requires special branch). - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py -- label: V1 Test attention (H100) # 10min - mirror_hardwares: [amdexperimental] + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - timeout_in_minutes: 30 - gpu: h100 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py commands: - - pytest -v -s v1/attention + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -- label: Batch Invariance Tests (H100) # 10min - mirror_hardwares: [amdexperimental] + +- label: V1 Speculative Decoding (slow) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] 
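+  # Counterpart of the "V1 Spec Decode" step above: the same v1/spec_decode
+  # suite is split by pytest marker, with -m 'not slow_test' running there and
+  # only the slow remainder collected here. Illustrative selection (assuming
+  # the slow cases are tagged with @pytest.mark.slow_test):
+  #   pytest -v -s -m 'not slow_test' v1/spec_decode           # fast subset
+  #   pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py   # slow subset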
agent_pool: mi355_1 - timeout_in_minutes: 25 - gpu: h100 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ + - vllm/v1/spec_decode/ + - vllm/model_executor/models/ + - vllm/v1/attention/ + - vllm/model_executor/layers/ + - tests/v1/spec_decode/ + - vllm/platforms/rocm.py commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py -- label: V1 Test attention (B200) # 10min - mirror_hardwares: [amdexperimental, amdmi355] + +- label: V1 attention (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - timeout_in_minutes: 30 - gpu: b200 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py commands: - - pytest -v -s v1/attention + - pytest -v -s v1/attention -- label: V1 Test others (CPU) # 5 mins - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + +- label: Examples # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - source_file_dependencies: - - vllm/ - - tests/v1 - no_gpu: true - commands: - # split the test to avoid interference - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - - -- label: Examples Test # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints - vllm/multimodal - examples/ + - vllm/platforms/rocm.py commands: - - pip install tensorizer # for tensorizer test - # for basic - - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - - python3 basic/offline_inference/generate.py --model facebook/opt-125m - - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 basic/offline_inference/classify.py - - python3 basic/offline_inference/embed.py - - python3 basic/offline_inference/score.py - # for multi-modal models - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - # for pooling models - - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # for features demo - - 
python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - pip install tensorizer + # Basic + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # Multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # Pooling models + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # Features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/cuda - commands: - - pytest -v -s cuda/test_cuda_context.py - - pytest -v -s cuda/test_platform_no_cuda_init.py - -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - tests/conftest.py - commands: - - pytest -v -s samplers - -- label: LoRA Test %N # 20min each 
- timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py - parallelism: 4 - -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run unit tests defined directly under compile/, - # not including subdirectories, which are usually heavier - # tests covered elsewhere. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run smoke tests under fullgraph directory, except test_full_graph.py - # as it is a heavy test that is covered in other steps. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # # Limit to no custom ops to reduce running time - # # Wrap with quotes to escape yaml and avoid starting -k string with a - - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
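+# NOTE: the sharded `%N` steps below (e.g. Kernels Attention/Quantization/MoE)
+# split one pytest suite across `parallelism: N` Buildkite jobs. Each job is
+# given a 0-indexed BUILDKITE_PARALLEL_JOB and the total
+# BUILDKITE_PARALLEL_JOB_COUNT; the doubled `$$` defers interpolation from
+# pipeline upload time to the agent at runtime. Illustrative expansion for
+# shard 0 of 2 (hypothetical values, not a literal command from this file):
+#   pytest -v -s kernels/attention --shard-id=0 --num-shards=2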
- -- label: Cudagraph test - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - tests/v1/cudagraph - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - vllm/compilation - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/ - - tests/kernels/core - - tests/kernels/test_top_k_per_row.py - commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels Attention Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + parallelism: 2 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/attention/ - vllm/v1/attention - # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - vllm/model_executor/layers/attention - tests/kernels/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] + +- label: Kernels Quantization Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/ - vllm/model_executor/layers/quantization - tests/kernels/quantization + - tests/kernels/quantization/test_rocm_skinny_gemms.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/kernels/ commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels MoE Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + parallelism: 4 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ @@ -3836,1018 +3210,473 @@ steps: - vllm/distributed/device_communicators/ - vllm/envs.py - vllm/config + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- 
label: Kernels FP8 MoE Test - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels FP8 MoE Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/moe/ + - csrc/quantization/w8a8/cutlass/moe/ + - vllm/model_executor/layers/fused_moe/ + - tests/kernels/moe/test_deepep_moe.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/envs.py commands: - pytest -v -s kernels/moe/test_deepep_moe.py -- label: Kernels Mamba Test # 31min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/mamba/ - - tests/kernels/mamba - - vllm/model_executor/layers/mamba/ops - commands: - - pytest -v -s kernels/mamba -- label: Kernels DeepGEMM Test (H100) # Nvidia-centric -# Not replicating for CUTLAS & CuTe - timeout_in_minutes: 45 - gpu: h100 - num_gpus: 1 - source_file_dependencies: - - tools/install_deepgemm.sh - - vllm/utils/deep_gemm.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization - - tests/kernels/quantization/test_block_fp8.py - - tests/kernels/moe/test_deepgemm.py - - tests/kernels/moe/test_batched_deepgemm.py - - tests/kernels/attention/test_deepgemm_attention.py - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - -- label: Kernels Helion Test - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Quantization # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/utils/import_utils.py - - tests/kernels/helion/ - commands: - - pip install helion - - pytest -v -s kernels/helion/ - -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/engine/arg_utils.py - - vllm/config/model.py - - vllm/model_executor - - tests/model_executor - - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py - -- label: Benchmarks # 11min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/.buildkite" - source_file_dependencies: - - benchmarks/ - commands: - - bash scripts/run-benchmarks.sh - -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/benchmarks/ - commands: - - pytest -v -s benchmarks/ - -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization - tests/quantization + - 
vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release, and pin a working version of torchao nightly here - - # since torchao nightly is only compatible with torch nightly currently - # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now - # we can only upgrade after this is resolved - # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.14.1 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] + +- label: Language Models Tests (Standard) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - - tools/ - commands: # LMEval+Transcription WER check - - bash ../tools/install_torchcodec_rocm.sh || exit 1 - - pytest -s entrypoints/openai/correctness/ - - -##### models test ##### - -- label: Basic Models Tests (Initialization) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_initialization.py - commands: - # Run a subset of model initialization tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - -- label: Basic Models Tests (Extra Initialization) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/transformers_utils/ - - tests/models/test_initialization.py - commands: - # Only when vLLM model source is modified - test initialization of a large - # subset of supported models (the complement of the small subset in the above - # test.) 
Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Basic Models Tests (Other) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_terratorch.py - - tests/models/test_transformers.py - - tests/models/test_registry.py - commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - -- label: Basic Models Test (Other CPU) # 5min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - timeout_in_minutes: 10 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_utils.py - - tests/models/test_vision.py - no_gpu: true - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - -- label: Language Models Tests (Standard) - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language commands: - # Test standard language models, excluding a subset of slow tests - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Language Models Tests (Extra Standard) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - tests/models/language/pooling/test_embedding.py - - tests/models/language/generation/test_common.py - - tests/models/language/pooling/test_classification.py - commands: - # Shard slow subset of standard language models tests. 
Only run when model - # source is modified, or when specified test files are modified - - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 -- label: Language Models Tests (Hybrid) %N - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] +- label: Language Models Test (Extended Generation) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' -- label: Language Models Test (PPL) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] +- label: Language Models Test (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test - -- label: Language Models Test (Extended Pooling) # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/pooling commands: - - pytest -v -s models/language/pooling -m 'not core_model' + - pytest -v -s models/language/pooling -m 'not core_model' -- label: Language Models Test (MTEB) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling_mteb_test - commands: - - pytest -v -s models/language/pooling_mteb_test -- label: Multi-Modal Processor Test (CPU) - 
timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - no_gpu: true - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Processor Test # 44min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing - -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 100 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py - - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model -- label: Multi-Modal Accuracy Eval (Small Models) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ + - vllm/ + - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 1) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental] + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation - tests/models/multimodal/test_mapping.py commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export 
MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s models/multimodal/test_mapping.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 2) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental] + +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model -- label: Multi-Modal Models Test (Extended 3) # 75min - timeout_in_minutes: 150 - mirror_hardwares: [amdexperimental] + +- label: Multi-Modal Models (Extended Generation 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py + + +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models Test (Extended Pooling) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental] + +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + 
working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + + +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/pooling commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -v -s models/multimodal/pooling -m 'not core_model' + - pytest -v -s models/multimodal/pooling -m 'not core_model' -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Quantized Models Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/layers/quantization - tests/models/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - - pytest -v -s models/quantization + - pytest -v -s models/quantization -- label: Transformers Nightly Models Test - mirror_hardwares: [amdexperimental] + +- label: Kernels (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 working_dir: "/vllm-workspace/" - optional: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/basic/offline_inference/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - # Whisper needs spawn method to avoid deadlock - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Blackwell Test (MI355) # 21 min - mirror_hardwares: [amdexperimental, amdmi355] - agent_pool: mi355_1 - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/" - gpu: b200 - # optional: true source_file_dependencies: - csrc/quantization/fp4/ - csrc/attention/mla/ - csrc/quantization/cutlass_w8a8/moe/ - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/attention/backends/mla/cutlass_mla.py - - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/v1/attention/backends/triton_attn.py + - vllm/v1/attention/backends/rocm_attn.py + - vllm/v1/attention/backends/rocm_aiter_fa.py + - 
vllm/v1/attention/backends/rocm_aiter_unified_attn.py + - vllm/v1/attention/backends/mla/aiter_triton_mla.py + - vllm/v1/attention/backends/mla/rocm_aiter_mla.py - vllm/v1/attention/selector.py - - vllm/platforms/cuda.py + - vllm/platforms/rocm.py + - vllm/_aiter_ops.py commands: - - rocm-smi - - python3 examples/basic/offline_inference/chat.py - # Attention - # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - #- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - #- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - #- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - #- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - ## Quantization - #- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - #- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - #- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - #- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - #- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - #- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - #- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - #- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - #- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - #- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - #- pytest -v -s tests/kernels/moe/test_flashinfer.py - #- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + - rocm-smi + - python3 examples/basic/offline_inference/chat.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 + +- label: Weight Loading Multiple GPU # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + + +- label: Weight Loading Multiple GPU - Large Models # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + + +- label: Ray Dependency Compatibility Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/" + source_file_dependencies: + - requirements/ + - setup.py + - vllm/platforms/rocm.py + commands: + - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh + + +- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip 
install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/v1/worker/kv_connector_model_runner_mixin.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh + + +- label: Distributed Tests (2 GPUs)(H100-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" + source_file_dependencies: + - vllm/distributed/ + - vllm/v1/distributed/ + - vllm/model_executor/layers/fused_moe/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - tests/distributed/test_context_parallel.py + - tests/v1/distributed/test_dbo.py + - examples/offline_inference/data_parallel.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s tests/distributed/test_context_parallel.py + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - pytest -v -s tests/v1/distributed/test_dbo.py + + +- label: Distributed Compile Unit Tests (2xH100-2xMI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py + - vllm/model_executor/layers + - tests/compile/passes/distributed/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - nvidia-smi - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s 
tests/compile/passes/test_silu_mul_quant_fusion.py - # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + # TODO: this test is not supported on ROCm yet; there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # # Wrap with quotes to escape yaml # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - -- label: Blackwell GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - optional: true # run on nightlies +- label: LM Eval Small Models (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt + + +- label: LM Eval Large Models (4 GPUs)(FP8) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 + + +- label: GPQA Eval (GPT-OSS) (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - 
vllm/model_executor/layers/fused_moe/ + - tests/evals/gpt_oss/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt -- label: Blackwell Quantized MoE Test - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - tests/quantization/test_blackwell_moe.py - - vllm/model_executor/models/deepseek_v2.py - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/models/llama4.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization/compressed_tensors - - vllm/model_executor/layers/quantization/modelopt.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - pytest -s -v tests/quantization/test_blackwell_moe.py -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdmi355] +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt - -##### 1 GPU test ##### -##### multi gpus test ##### - -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/distributed - - tests/distributed - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdmultinode] - agent_pool: mi355_4 - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - num_nodes: 2 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py - commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d 
--rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/compilation/ - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/compile/fullgraph/test_basic_correctness.py - - tests/compile/test_wrapper.py - - tests/distributed/ - - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/model_executor/model_loader/sharded_state_loader.py - - vllm/model_executor/models/ - - tests/basic_correctness/ - - tests/model_executor/model_loader/test_sharded_state_loader.py - - tests/models/ - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' - -- label: Plugin Tests (2 GPUs) # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - 
vllm/plugins/ - - tests/plugins/ - commands: - # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - # end platform plugin tests - # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - # test bge_m3_sparse io_processor plugin - - pip install -e ./plugins/bge_m3_sparse_plugin - - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py - - pip uninstall bge_m3_sparse_plugin -y - # end io_processor plugins test - # begin stat_logger plugins test - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - # end stat_logger plugins test - # other tests continue here: - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process - - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - -- label: Pipeline + Context Parallelism Test # 45min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - -- label: LoRA TP Test (Distributed) # 17 min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - num_gpus: 4 - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # There is some Tensor Parallelism related processing logic in LoRA that - # requires multi-GPU testing for validation. 
- - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - # Disabled for now because MXFP4 backend on non-cuda platform - # doesn't support LoRA yet - #- pytest -v -s -x lora/test_gptoss_tp.py - - -- label: Weight Loading Multiple GPU Test # 33min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - -- label: Weight Loading Multiple GPU Test - Large Models # optional - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - -- label: NixlConnector PD accuracy tests (Distributed) # 30min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - # grade: Blocking - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_devices: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -##### multi gpus test ##### -##### A100 test ##### - -- label: Distributed Tests (A100) # optional - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - gpu: a100 - optional: true - num_gpus: 4 - source_file_dependencies: - - vllm/ - commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest 
basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s -x lora/test_mixtral.py - - -- label: LM Eval Large Models # optional - gpu: a100 - optional: true - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### H100 test ##### -- label: LM Eval Large Models (H100) # optional - gpu: h100 - optional: true - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - - -##### H200 test ##### -- label: Distributed Tests (H200) # optional - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - gpu: h200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
- - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py - - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### B200 test ##### -- label: Distributed Tests (B200) # optional - gpu: b200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### E2E Eval Tests ##### -- label: LM Eval Small Models (1 Card) # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: LM Eval Large Models (4 Card) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - gpu: a100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -- label: ROCm LM Eval Large Models (8 Card) - mirror_hardwares: [amdproduction] - agent_pool: mi355_8 - optional: true - num_gpus: 8 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 - -- label: ROCm GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/tests" - agent_pool: mi355_1 - mirror_hardwares: [amdexperimental, amdproduction] - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt - -##### EPLB Accuracy Tests ##### -- label: DeepSeek V2-Lite Accuracy - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) - mirror_hardwares: [amdexperimental, amdproduction, amdmi355] - agent_pool: mi355_2 - timeout_in_minutes: 60 - gpu: b200 - optional: true num_gpus: 2 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/eplb + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash 
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 - -- label: Attention Benchmarks Smoke Test (B200-MI355) - device: b200 - mirror_hardwares: [amdexperimental, amdmi355] +- label: Attention Benchmarks Smoke Test (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 num_gpus: 2 - optional: true working_dir: "/vllm-workspace/" - timeout_in_minutes: 10 source_file_dependencies: - benchmarks/attention_benchmarks/ - vllm/v1/attention/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 - diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 5da7b64ac..c21b66552 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -59,7 +59,7 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - pytest -s -v tests/compile/passes/distributed -- label: Fusion and Compile Unit Tests (B200) +- label: Fusion and Compile Unit Tests (2xB200) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" device: b200 diff --git a/tests/evals/gpt_oss/configs/models-gfx942.txt b/tests/evals/gpt_oss/configs/models-gfx942.txt index 48cef0122..60eff507d 100644 --- a/tests/evals/gpt_oss/configs/models-gfx942.txt +++ b/tests/evals/gpt_oss/configs/models-gfx942.txt @@ -1,3 +1,3 @@ # GFX942 model configurations for GPQA evaluation # Tests different environment variable combinations -gpt-oss-20b-rocm-baseline.yaml \ No newline at end of file +gpt-oss-20b-rocm-baseline.yaml diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-DP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-DP_MI325.yaml new file mode 100644 index 000000000..0171cb4b1 --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-R1-DP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: "deepseek-ai/DeepSeek-R1" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --data-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-TP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-TP_MI325.yaml new file mode 100644 index 000000000..ef92f574c --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-R1-TP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: "deepseek-ai/DeepSeek-R1" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --tensor-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/DeepSeek-V3.2-DP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-V3.2-DP_MI325.yaml new file mode 100644 index 000000000..8d207878d --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-V3.2-DP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: 
"deepseek-ai/DeepSeek-V3.2" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --data-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/DeepSeek-V3.2-TP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-V3.2-TP_MI325.yaml new file mode 100644 index 000000000..46853d3f5 --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-V3.2-TP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: "deepseek-ai/DeepSeek-V3.2" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --tensor-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/models-mi355.txt b/tests/evals/gsm8k/configs/models-mi3xx-quantized.txt similarity index 100% rename from tests/evals/gsm8k/configs/models-mi355.txt rename to tests/evals/gsm8k/configs/models-mi3xx-quantized.txt diff --git a/tests/evals/gsm8k/configs/models-mi3xx.txt b/tests/evals/gsm8k/configs/models-mi3xx.txt new file mode 100644 index 000000000..6cf833b64 --- /dev/null +++ b/tests/evals/gsm8k/configs/models-mi3xx.txt @@ -0,0 +1,4 @@ +DeepSeek-R1-TP_MI325.yaml +DeepSeek-R1-DP_MI325.yaml +DeepSeek-V3.2-TP_MI325.yaml +DeepSeek-V3.2-DP_MI325.yaml diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index c8028c0b8..7e36ea1bd 100644 --- a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -64,6 +64,16 @@ def test_gsm8k_correctness(config_filename): "Marlin kernels are not supported." ) + # TODO(akaratza): Enable DeepSeek-V3.2 and DeepSeek-R1 on ROCm platforms + if current_platform.is_rocm() and ( + "deepseek-ai/DeepSeek-V3.2" in eval_config["model_name"] + or "deepseek-ai/DeepSeek-R1" in eval_config["model_name"] + ): + pytest.skip( + "Skipping DeepSeek-V3.2 and DeepSeek-R1 on ROCm platforms " + "due to agent pool disk space issues and pod evictions." 
+ ) + # Parse server arguments from config (use shlex to handle quoted strings) server_args_str = eval_config.get("server_args", "") server_args = shlex.split(server_args_str) if server_args_str else [] diff --git a/tests/quantization/test_mi3xx_moe.py b/tests/quantization/test_mi3xx_moe.py new file mode 100644 index 000000000..2f8dfde68 --- /dev/null +++ b/tests/quantization/test_mi3xx_moe.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +def test_mi3xx_moe(): + print("TODO: add tests for Mi3xx MoE quantization") diff --git a/tests/rocm/aiter/test_mla_fp8_support_check.py b/tests/rocm/aiter/test_mla_fp8_support_check.py index e3dc0f8ea..28da59a1a 100644 --- a/tests/rocm/aiter/test_mla_fp8_support_check.py +++ b/tests/rocm/aiter/test_mla_fp8_support_check.py @@ -31,7 +31,7 @@ class TestAiterMlaFp8SupportCheck: # Should return False without raising with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=ImportError("No module"), ): result = _check_aiter_mla_fp8_support() @@ -46,7 +46,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=ModuleNotFoundError("Module not found"), ): # Should return False without raising @@ -63,7 +63,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=AttributeError("No attribute"), ): assert _check_aiter_mla_fp8_support() is False @@ -78,7 +78,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=ValueError("No signature"), ): assert _check_aiter_mla_fp8_support() is False @@ -93,7 +93,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=TypeError("Not a callable"), ): assert _check_aiter_mla_fp8_support() is False
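Two short notes with sketches, since the diff above leans on behavior that is easy to misread. First, the new GSM8K configs feed a single-quoted JSON value through `server_args` (`--speculative-config '{"method":"mtp","num_speculative_tokens":3}'`), and `test_gsm8k_correctness.py` explicitly parses that string with `shlex` "to handle quoted strings". A minimal sketch of why, using the `server_args` from the new `DeepSeek-R1-TP_MI325.yaml` (YAML's `>-` folded scalar joins the lines with spaces; the parsing shown is plain stdlib `shlex` behavior, not vLLM-specific code):

```python
import shlex

# server_args from DeepSeek-R1-TP_MI325.yaml after YAML ">-" folding
# joins the config lines into one space-separated string.
server_args_str = (
    "--enforce-eager --max-model-len 4096 --tensor-parallel-size 8 "
    "--enable-expert-parallel --attention-backend=TRITON_ATTN "
    "--speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
)

# A naive str.split() leaks the shell quotes into the argument value,
# handing the server a malformed JSON string.
naive = server_args_str.split()
assert naive[-1] == "'{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"

# shlex.split() applies shell quoting rules: the quotes are consumed and
# the JSON arrives as a single clean argv entry.
args = shlex.split(server_args_str)
assert args[-2:] == [
    "--speculative-config",
    '{"method":"mtp","num_speculative_tokens":3}',
]
```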
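Second, on the `tests/rocm/aiter/test_mla_fp8_support_check.py` hunks: `unittest.mock.patch` resolves a dotted-string target by importing everything up to the last dot and replacing the final attribute. The old target `"vllm._aiter_ops.inspect.signature"` therefore only resolves when `inspect` is reachable as an attribute of `vllm._aiter_ops`, whereas `"inspect.signature"` patches the `inspect` module itself and is picked up by any `inspect.signature(...)` call made through the module attribute. A self-contained sketch of the pattern; `check_supports_fp8` here is a hypothetical stand-in for `_check_aiter_mla_fp8_support`, not vLLM's actual implementation:

```python
import inspect
from unittest import mock


def check_supports_fp8() -> bool:
    """Hypothetical stand-in for _check_aiter_mla_fp8_support(): probe a
    callable's signature and treat any inspection failure as "no fp8"."""

    def kernel(q, kv, use_fp8=False):  # placeholder for the aiter MLA kernel
        pass

    try:
        return "use_fp8" in inspect.signature(kernel).parameters
    except (ImportError, AttributeError, ValueError, TypeError):
        # Mirrors the error classes exercised by the test file above;
        # ModuleNotFoundError is a subclass of ImportError.
        return False


# Patching "inspect.signature" swaps the attribute on the inspect module,
# so the module-level lookup inside check_supports_fp8() hits the injected
# error and the check degrades to False instead of raising.
with mock.patch("inspect.signature", side_effect=ImportError("No module")):
    assert check_supports_fp8() is False

# Once the patch exits, the real inspect.signature is restored.
assert check_supports_fp8() is True
```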