diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml
index 2831bbc9d..23a23723a 100644
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -10,7 +10,7 @@ steps:
       docker build
       --build-arg max_jobs=16
       --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
       --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
       --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
       -f docker/Dockerfile.rocm
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ecc062046..39f7d4d66 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -33,6 +33,1335 @@ # Note that all steps execute in parallel.
 steps:
+
+
+########################################################################################################################
+#                                                                                                                      #
+#   MI250 test definitions (currently a full mirror of the test set; TBD which tests will ultimately be routed here)   #
+#                                                                                                                      #
+########################################################################################################################
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
+
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  source_file_dependencies:
+  - vllm/
+  - tests/detokenizer
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s detokenizer
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  source_file_dependencies:
+  - vllm/
+  - tests/test_inputs.py
+  - tests/test_outputs.py
+  - tests/test_pooling_params.py
+  - tests/multimodal
+  - tests/renderers
+  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/tool_parsers
+  - tests/transformers_utils
+  - tests/config
+  no_gpu: true
+  commands:
+  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
+  - pytest -v -s test_pooling_params.py
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s renderers
+  - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
+  - pytest -v -s transformers_utils
+  - pytest -v -s config
+
+- label: Python-only Installation Test # 10min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
+- label: Basic Correctness Test # 20min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
+
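For reference, the docker/Dockerfile.rocm hunk at the top of this diff (which adds gfx90a, the MI250 target, to the ROCm arch list) can be reproduced outside Buildkite roughly as follows. This is a hedged local-build sketch, not part of the pipeline: the use of the current checkout's HEAD in place of $BUILDKITE_COMMIT, the local image tag, and the repo-root build context are assumptions for illustration only.

# Hedged sketch: build the ROCm CI image locally with the same build args the
# pipeline uses, including the newly added gfx90a (MI250) target.
set -euo pipefail
COMMIT="$(git rev-parse HEAD)"   # local stand-in for $BUILDKITE_COMMIT (assumption)
docker build \
  --build-arg max_jobs=16 \
  --build-arg REMOTE_VLLM=1 \
  --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950' \
  --build-arg VLLM_BRANCH="${COMMIT}" \
  --tag "rocm/vllm-ci:${COMMIT}" \
  -f docker/Dockerfile.rocm \
  .
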
+- label: Entrypoints Unit Tests # 5min
+  timeout_in_minutes: 10
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  fast_check: true
+  torch_nightly: true
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py
+  - pytest -v -s entrypoints/offline_mode
+
+- label: Entrypoints Integration Test (API Server 1) # 100min
+  timeout_in_minutes: 130
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+  - pytest -v -s entrypoints/test_chat_utils.py
+
+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  optional: true
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/rpc
+  - tests/entrypoints/instrumentator
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/instrumentator
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+  - pytest -v -s tool_use
+
+- label: Entrypoints Integration Test (Pooling)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  optional: true
+  fast_check: true
+  torch_nightly: true
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+- label: Entrypoints Integration Test (Responses API)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_1
+  optional: true
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai/responses
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/openai/responses
+
+- label: Distributed Tests (4 GPUs) # 35min
+  timeout_in_minutes: 
50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - examples/offline_inference/new_weight_syncing/ + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + - pushd ../examples/offline_inference/new_weight_syncing + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py + - popd + +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_8 + optional: true + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: EPLB Algorithm Test # 5min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 10min + timeout_in_minutes: 20 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: Regression Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + +- label: Engine Test # 9min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + +- label: V1 Test e2e + engine # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test e2e (2 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism" + +- label: V1 Test e2e (4 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy" + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s 
entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Test attention (H100) # 10min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: Batch Invariance Tests (H100) # 10min + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + +- label: V1 Test others (CPU) # 5 mins + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + no_gpu: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + commands: + - pip install tensorizer + - python3 offline_inference/basic/chat.py + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py + +- 
label: Samplers Test # 56min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + +- label: LoRA Test %N # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + parallelism: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss_tp.py \ + --ignore=lora/test_qwen3moe_tp.py + +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Compilation Passes Unit Tests + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/compile/passes + commands: + - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + +- label: Cudagraph test # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + parallelism: 2 + source_file_dependencies: + - csrc/attention/ + - vllm/v1/attention + - vllm/model_executor/layers/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + 
mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + parallelism: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + parallelism: 2 + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels Helion Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/utils/import_utils.py + - tests/kernels/helion/ + commands: + - pip install helion + - pytest -v -s kernels/helion/ + +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + torch_nightly: true + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test # 70min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + - uv pip install --system torchao==0.14.1 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: OpenAI API correctness # 10min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, 
amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + - tools/ + commands: + - bash ../tools/install_torchcodec_rocm.sh || exit 1 + - pytest -s entrypoints/openai/correctness/ + +- label: Basic Models Tests (Initialization) # 15min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N # 15min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + parallelism: 2 + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/transformers_utils/ + - tests/models/test_initialization.py + commands: + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + +- label: Basic Models Tests (Other) # 15min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_terratorch.py + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + no_gpu: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) # 18min + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N # 27min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + torch_nightly: true + parallelism: 2 + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + - pip freeze | grep -E 'torch' + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + +- label: Language Models Tests (Hybrid) %N # 50min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + torch_nightly: true + parallelism: 2 + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 
'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test (CPU) # 15min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + no_gpu: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 100 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py + - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Accuracy Eval (Small Models) # 5min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + +- label: Multi-Modal Models Test (Extended) 1 # 60min + timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models Test (Extended) 2 #60min + timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 # 75min + timeout_in_minutes: 150 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +- label: Transformers Nightly Models Test # 60 min + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - python3 examples/offline_inference/basic/chat.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 + 
mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_2
+  num_gpus: 2
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+
+- label: 2 Node Tests (4 GPUs in total) # 16min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction, amdmultinode, amdgfx90a]
+  agent_pool: mi250_4
+  optional: true
+  num_gpus: 2
+  num_nodes: 2
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
+  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
+  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
+  - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
+
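Read as a pair, the two command blocks in the 2 Node Tests step above implement a simple two-host rendezvous: each host launches the same torchrun invocation and both point --rdzv_endpoint at the first node's address. The bash restatement below only annotates the launches; the addresses, ports, and flags are taken verbatim from the step, and the node-A/node-B split is the interpretation stated in the step's own comments.

# On node A (192.168.10.10), which also hosts the c10d rendezvous endpoint:
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 \
  --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 \
  distributed/test_same_node.py | grep 'Same node test passed'

# On node B (192.168.10.11), the launch is identical; it joins the same
# rendezvous, giving 2 nodes x 2 ranks = the 4 GPUs named in the step label:
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 \
  --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 \
  distributed/test_same_node.py

# The data_parallel.py example is split the same way: only --dp-node-rank
# differs between the two hosts (0 on node A, 1 on node B).
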
+- label: Distributed Tests (2 GPUs) # 68min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_2
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  - examples/offline_inference/new_weight_syncing/
+  commands:
+  - export TORCH_NCCL_BLOCKING_WAIT=1
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_2
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+
+- label: Plugin Tests (2 GPUs) # 40min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_2
+  num_gpus: 2
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # test bge_m3_sparse io_processor plugin
+  - pip install -e ./plugins/bge_m3_sparse_plugin
+  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
+  - pip uninstall bge_m3_sparse_plugin -y
+  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py
+  - pytest -v -s models/test_oot_registration.py
+  - pytest -v -s plugins/lora_resolvers
+
+- label: Pipeline + Context Parallelism Test # 45min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+  agent_pool: mi250_4
+  num_gpus: 4
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: LoRA TP 
Test (Distributed) # 17 min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + +- label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + +- label: NixlConnector PD accuracy tests (Distributed) # 30min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: Distributed Tests (A100) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (H100) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + 
num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4 + +- label: Distributed Tests (H200) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace/" + commands: + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + - pytest -v -s tests/distributed/test_context_parallel.py + - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + +- label: LM Eval Small Models (1 Card) # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: LM Eval Large Models (4 Card) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: ROCm LM Eval Large Models (8 Card) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_8 + num_gpus: 8 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 + +- label: ROCm GPT-OSS Eval # 80min + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + agent_pool: mi250_1 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + optional: true + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: DeepSeek V2-Lite Accuracy # 70min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # 70min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash 
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + + +################################################### +# # +# MI325 test definitions # +# # +################################################### + + ##### fast check tests ##### - label: Pytorch Nightly Dependency Override Check # 2min