diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ecbf1a878..b0a7ba8aa 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,1524 +1,8 @@ -# In this file, you can add more tests to run either by adding a new step or -# adding a new command to an existing step. See different options here for examples. +# This file has been deprecated as of Feb 18, 2026. The content has already been migrated to: -# This script will be feed into Jinja template in `test-template-aws.j2` at -# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 -# to generate the final pipeline yaml file. +# .buildkite/test_areas for test jobs +# .buildkite/image_build for image building jobs +# .buildkite/hardware_tests for jobs running on other hardwares (Intel, Ascend NPU, Arm, etc..) +# .buildkite/ci_config.yaml for configuration of CI pipeline -# Documentation -# label(str): the name of the test. emojis allowed. -# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. -# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. -# fast_check_only(bool): run this test on the fastcheck pipeline only -# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. -# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests). -# command(str): the single command to run for tests. incompatible with commands. -# commands(list): the list of commands to run for the test. incompatible with command. -# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] -# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 -# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. -# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, -# in this case, commands must be specified. the first command runs on the first host, the second -# command runs on the second host. -# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. -# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB -# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. -# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests -# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. -# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch. - -# When adding a test -# - If the test belongs to an existing group, add it there -# - If the test is short, add to any existing step -# - If the test takes more than 10min, then it is okay to create a new step. -# Note that all steps execute in parallel. - -steps: -##### fast check tests ##### - -- label: Pytorch Nightly Dependency Override Check # 2min - # if this test fails, it means the nightly torch version is not compatible with some - # of the dependencies. Please check the error message and add the package to whitelist - # in /vllm/tools/pre_commit/generate_nightly_torch_test.py - soft_fail: true - source_file_dependencies: - - requirements/nightly_torch_test.txt - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Async Engine, Inputs, Utils, Worker Test # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/detokenizer - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s detokenizer - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/multimodal - - tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - no_gpu: true - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config - -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - -- label: Entrypoints Unit Tests # 5min - timeout_in_minutes: 10 - working_dir: "/vllm-workspace/tests" - fast_check: true - source_file_dependencies: - - vllm/entrypoints - - tests/entrypoints/ - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/llm - - tests/entrypoints/offline_mode - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/instrumentator --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - - pytest -v -s entrypoints/test_chat_utils.py - -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/rpc - - tests/entrypoints/instrumentator - - tests/tool_use - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/instrumentator - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - - pytest -v -s tool_use - -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/pooling - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling - -- label: Entrypoints Integration Test (Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai/responses - commands: - - pytest -v -s entrypoints/openai/responses - -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - - examples/offline_inference/new_weight_syncing/ - - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py - commands: - # https://github.com/NVIDIA/nccl/issues/1838 - - export NCCL_CUMEM_HOST_ENABLE=0 - # test with torchrun tp=2 and external_dp=2 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=2 and pp=2 - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=4 and dp=1 - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2, pp=2 and dp=1 - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=1 and dp=4 with ep - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2 and dp=2 with ep - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with internal dp - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - # OLD rlhf examples - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - # NEW rlhf examples - - pushd ../examples/offline_inference/new_weight_syncing - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd - -- label: Distributed Tests (8 GPUs) # 4min - timeout_in_minutes: 10 - gpu: h100 - num_gpus: 8 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py - - vllm/config/parallel.py - - vllm/distributed/ - - vllm/v1/engine/llm_engine.py - - vllm/v1/executor/uniproc_executor.py - - vllm/v1/worker/gpu_worker.py - commands: - # https://github.com/NVIDIA/nccl/issues/1838 - - export NCCL_CUMEM_HOST_ENABLE=0 - # test with torchrun tp=2 and dp=4 with ep - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - -- label: EPLB Algorithm Test # 5min - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_algo.py - commands: - - pytest -v -s distributed/test_eplb_algo.py - -- label: EPLB Execution Test # 10min - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_execute.py - commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - num_gpus: 2 - source_file_dependencies: - - vllm/ - - tests/v1/tracing - commands: - - "pip install \ - 'opentelemetry-sdk>=1.26.0' \ - 'opentelemetry-api>=1.26.0' \ - 'opentelemetry-exporter-otlp>=1.26.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s v1/tracing - -##### fast check tests ##### -##### 1 GPU test ##### - -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/engine - - tests/test_sequence - - tests/test_config - - tests/test_logger - - tests/test_vllm_port - commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - -- label: V1 Test e2e + engine # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e - # Run this test standalone for now; - # need to untangle use (implicit) use of spawn/fork across the tests. - - pytest -v -s v1/engine/test_preprocess_error_handling.py - - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - -- label: V1 Test others # 42min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - # split the test to avoid interference - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s -m 'not slow_test' v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - # Integration test for streaming correctness (requires special branch). - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - -- label: V1 Test attention (H100) # 10min - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - -- label: Batch Invariance Tests (H100) # 10min - timeout_in_minutes: 25 - gpu: h100 - source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py - -- label: V1 Test attention (B200) # 10min - timeout_in_minutes: 30 - gpu: b200 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - -- label: V1 Test others (CPU) # 5 mins - source_file_dependencies: - - vllm/ - - tests/v1 - no_gpu: true - commands: - # split the test to avoid interference - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - - -- label: Examples Test # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/examples" - source_file_dependencies: - - vllm/entrypoints - - vllm/multimodal - - examples/ - commands: - - pip install tensorizer # for tensorizer test - # for basic - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py - # for multi-modal models - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - # for pooling models - - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # for features demo - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/cuda - commands: - - pytest -v -s cuda/test_cuda_context.py - -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - tests/conftest.py - commands: - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - -- label: LoRA Test %N # 20min each - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py - - parallelism: 4 - -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run unit tests defined directly under compile/, - # not including subdirectories, which are usually heavier - # tests covered elsewhere. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - # However, find does not normally propagate error codes, so we combine it with xargs - # (using -0 for proper path handling) - - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - - pytest -s -v compile/passes --ignore compile/passes/distributed - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run smoke tests under fullgraph directory, except test_full_graph.py - # as it is a heavy test that is covered in other steps. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - # However, find does not normally propagate error codes, so we combine it with xargs - # (using -0 for proper path handling) - - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # fp8 kv scales not supported on sm89, tested on Blackwell instead - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # # Limit to no custom ops to reduce running time - # # Wrap with quotes to escape yaml and avoid starting -k string with a - - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - -- label: Cudagraph test - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - tests/v1/cudagraph - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - vllm/compilation - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - tests/kernels/core - - tests/kernels/test_top_k_per_row.py - commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/attention/ - - vllm/v1/attention - # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - - vllm/model_executor/layers/attention - - tests/kernels/attention - commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 - -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/layers/quantization - - tests/kernels/quantization - commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 - -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/quantization/cutlass_w8a8/moe/ - - csrc/moe/ - - tests/kernels/moe - - vllm/model_executor/layers/fused_moe/ - - vllm/distributed/device_communicators/ - - vllm/envs.py - - vllm/config - commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 - -- label: Kernels Mamba Test # 31min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/mamba/ - - tests/kernels/mamba - - vllm/model_executor/layers/mamba/ops - commands: - - pytest -v -s kernels/mamba - -- label: Kernels DeepGEMM Test (H100) - timeout_in_minutes: 45 - gpu: h100 - num_gpus: 1 - source_file_dependencies: - - tools/install_deepgemm.sh - - vllm/utils/deep_gemm.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization - - tests/kernels/quantization/test_block_fp8.py - - tests/kernels/moe/test_deepgemm.py - - tests/kernels/moe/test_batched_deepgemm.py - - tests/kernels/attention/test_deepgemm_attention.py - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - -- label: Kernels Helion Test - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/utils/import_utils.py - - tests/kernels/helion/ - commands: - - pip install helion - - pytest -v -s kernels/helion/ - - -- label: Kernels FP8 MoE Test (1 H100) - timeout_in_minutes: 90 - gpu: h100 - num_gpus: 1 - optional: true - commands: - - pytest -v -s kernels/moe/test_cutlass_moe.py - - pytest -v -s kernels/moe/test_flashinfer.py - - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py - - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py - - pytest -v -s kernels/moe/test_moe.py - # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main - - pytest -v -s kernels/moe/test_block_int8.py - - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py - - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py - -- label: Kernels FP8 MoE Test (2 H100s) - timeout_in_minutes: 90 - gpu: h100 - num_gpus: 2 - optional: true - commands: - - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py - - pytest -v -s kernels/moe/test_deepep_moe.py - - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py - # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main - -- label: Kernels Fp4 MoE Test (B200) - timeout_in_minutes: 60 - gpu: b200 - num_gpus: 1 - optional: true - commands: - - pytest -v -s kernels/moe/test_cutedsl_moe.py - - pytest -v -s kernels/moe/test_flashinfer_moe.py - - pytest -v -s kernels/moe/test_nvfp4_moe.py - - pytest -v -s kernels/moe/test_ocp_mx_moe.py - - -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/engine/arg_utils.py - - vllm/config/model.py - - vllm/model_executor - - tests/model_executor - - tests/entrypoints/openai/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - -- label: Benchmarks # 11min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/.buildkite" - source_file_dependencies: - - benchmarks/ - commands: - - bash scripts/run-benchmarks.sh - -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/benchmarks/ - commands: - - pytest -v -s benchmarks/ - -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - tests/quantization - commands: - # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release, and pin a working version of torchao nightly here - - # since torchao nightly is only compatible with torch nightly currently - # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now - # we can only upgrade after this is resolved - # TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 22min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - commands: # LMEval+Transcription WER check - - pytest -s entrypoints/openai/correctness/ - -##### models test ##### - -- label: Basic Models Tests (Initialization) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_initialization.py - - tests/models/registry.py - commands: - # Run a subset of model initialization tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - -- label: Basic Models Tests (Extra Initialization) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/transformers_utils/ - - tests/models/test_initialization.py - - tests/models/registry.py - commands: - # Only when vLLM model source is modified - test initialization of a large - # subset of supported models (the complement of the small subset in the above - # test.) Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Basic Models Tests (Other) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_terratorch.py - - tests/models/test_transformers.py - - tests/models/test_registry.py - commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - -- label: Basic Models Test (Other CPU) # 5min - timeout_in_minutes: 10 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_utils.py - - tests/models/test_vision.py - no_gpu: true - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - -- label: Language Models Tests (Standard) - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/language - commands: - # Test standard language models, excluding a subset of slow tests - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' - -- label: Language Models Tests (Extra Standard) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - tests/models/language/pooling/test_embedding.py - - tests/models/language/generation/test_common.py - - tests/models/language/pooling/test_classification.py - commands: - # Shard slow subset of standard language models tests. Only run when model - # source is modified, or when specified test files are modified - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Language Models Tests (Hybrid) %N - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - -- label: Language Models Test (PPL) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test - -- label: Language Models Test (Extended Pooling) # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling - commands: - - pytest -v -s models/language/pooling -m 'not core_model' - -- label: Language Models Test (MTEB) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling_mteb_test - commands: - - pytest -v -s models/language/pooling_mteb_test - -- label: Multi-Modal Processor Test (CPU) - timeout_in_minutes: 60 - source_file_dependencies: - - vllm/ - - tests/models/multimodal - no_gpu: true - commands: - - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'" - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Processor Test - timeout_in_minutes: 60 - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 80 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - -- label: Multi-Modal Accuracy Eval (Small Models) # 50min - timeout_in_minutes: 70 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - -- label: Multi-Modal Models Test (Extended) 1 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - -- label: Multi-Modal Models Test (Extended) 2 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - -- label: Multi-Modal Models Test (Extended) 3 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/model_executor/layers/quantization - - tests/models/quantization - commands: - - pytest -v -s models/quantization - -# This test is used only in PR development phase to test individual models and should never run on main -- label: Custom Models Test - mirror_hardwares: [amdexperimental] - optional: true - commands: - - echo 'Testing custom models...' - # PR authors can temporarily add commands below to test individual models - # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py - # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* - -- label: Transformers Nightly Models Test - working_dir: "/vllm-workspace/" - optional: true - soft_fail: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - # Whisper needs spawn method to avoid deadlock - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Blackwell Test # 23 min - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - csrc/quantization/fp4/ - - csrc/attention/mla/ - - csrc/quantization/cutlass_w8a8/moe/ - - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/attention/backends/mla/cutlass_mla.py - - vllm/v1/attention/backends/mla/flashinfer_mla.py - - vllm/v1/attention/selector.py - - vllm/platforms/cuda.py - commands: - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - # Attention - # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - # Quantization - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - # e2e - - pytest -v -s tests/models/quantization/test_nvfp4.py - -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusion_attn.py - - tests/compile/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py - commands: - - nvidia-smi - - pytest -v -s tests/compile/test_fusion_attn.py - - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # # Wrap with quotes to escape yaml - # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - -- label: Blackwell GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -- label: Blackwell Quantized MoE Test - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - tests/quantization/test_blackwell_moe.py - - vllm/model_executor/models/deepseek_v2.py - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/models/llama4.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization/compressed_tensors - - vllm/model_executor/layers/quantization/modelopt.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - pytest -s -v tests/quantization/test_blackwell_moe.py - -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - -##### 1 GPU test ##### -##### multi gpus test ##### - -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/distributed - - tests/distributed - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - - pytest -v -s distributed/test_packed_tensor.py - - pytest -v -s distributed/test_weight_transfer.py - -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - num_nodes: 2 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py - - .buildkite/scripts/run-multi-node-test.sh - commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/compilation/ - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/compile/fullgraph/test_basic_correctness.py - - tests/compile/test_wrapper.py - - tests/distributed/ - - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - commands: - # https://github.com/NVIDIA/nccl/issues/1838 - - export NCCL_CUMEM_HOST_ENABLE=0 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/model_executor/model_loader/sharded_state_loader.py - - vllm/model_executor/models/ - - tests/basic_correctness/ - - tests/model_executor/model_loader/test_sharded_state_loader.py - - tests/models/ - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' - -- label: Plugin Tests (2 GPUs) # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/plugins/ - - tests/plugins/ - commands: - # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - # end platform plugin tests - # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - # end io_processor plugins test - # begin stat_logger plugins test - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - # end stat_logger plugins test - # other tests continue here: - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins - -- label: Pipeline + Context Parallelism Test # 45min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - -- label: LoRA TP Test (Distributed) # 17 min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - num_gpus: 4 - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # Alot of these tests are on the edge of OOMing - - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True - # There is some Tensor Parallelism related processing logic in LoRA that - # requires multi-GPU testing for validation. - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py - - -- label: Weight Loading Multiple GPU Test # 33min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - -- label: Weight Loading Multiple GPU Test - Large Models # optional - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - gpu: a100 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - -- label: NixlConnector PD accuracy tests (Distributed) # 40min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - - -##### multi gpus test ##### -##### A100 test ##### - -- label: Distributed Tests (A100) # optional - gpu: a100 - optional: true - num_gpus: 4 - source_file_dependencies: - - vllm/ - commands: - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s -x lora/test_mixtral.py - -- label: Acceptance Length Test (Large Models) # optional - timeout_in_minutes: 120 - gpu: h100 - optional: true - num_gpus: 1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/model_executor/models/mlp_speculator.py - - tests/v1/spec_decode/test_acceptance_length.py - commands: - - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 - - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test - -- label: LM Eval Large Models # optional - gpu: a100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### H100 test ##### -- label: LM Eval Large Models (H100) # optional - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - -- label: Sequence Parallel Tests (H100) # 60 min - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: h100 - optional: true - num_gpus: 2 - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - # Run sequence parallel tests - - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - -- label: Distributed Tests (H100) # optional - gpu: h100 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/distributed/test_context_parallel.py - - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### H200 test ##### - -- label: LM Eval Large Models (H200) # optional - timeout_in_minutes: 60 - gpu: h200 - optional: true - num_gpus: 8 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt - -##### B200 test ##### -- label: Distributed Tests (B200) # optional - gpu: b200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min - timeout_in_minutes: 30 - optional: true - soft_fail: true - num_gpus: 2 - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh - commands: - - nvidia-smi - - bash .buildkite/scripts/run-prime-rl-test.sh - -- label: DeepSeek V2-Lite Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - timeout_in_minutes: 60 - gpu: b200 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - -##### MoE Refactor (Temporary) Tests ##### - -- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional - gpu: h100 - optional: true - num_gpus: 2 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt - -- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional - gpu: b200 - optional: true - num_gpus: 2 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt - -- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional - gpu: b200 - optional: true - num_gpus: 2 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt +# If you need to make changes to CI, please find the relevant file in these directories and make changes there.