# Distributed test group for the CI pipeline.
# Each step declares: the devices it needs (num_devices / device / num_nodes),
# the source paths that trigger it (source_file_dependencies), and the shell
# commands to run inside working_dir.
group: Distributed
depends_on:
  - image-build
steps:
  - label: Distributed Comm Ops
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/distributed
      - tests/distributed
    commands:
      - pytest -v -s distributed/test_comm_ops.py
      - pytest -v -s distributed/test_shm_broadcast.py
      - pytest -v -s distributed/test_shm_buffer.py
      - pytest -v -s distributed/test_shm_storage.py

  - label: Distributed (2 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/compilation/
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/worker/worker_base.py
      - vllm/v1/engine/
      - vllm/v1/worker/
      - tests/compile/fullgraph/test_basic_correctness.py
      - tests/compile/test_wrapper.py
      - tests/distributed/
      - tests/entrypoints/llm/test_collective_rpc.py
      - tests/v1/distributed
      - tests/v1/entrypoints/openai/test_multi_api_servers.py
      - tests/v1/shutdown
      - tests/v1/worker/test_worker_memory_snapshot.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
      - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
      - pytest -v -s entrypoints/llm/test_collective_rpc.py
      - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
      - pytest -v -s ./compile/test_wrapper.py
      - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
      - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
      - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
      - pytest -v -s v1/worker/test_worker_memory_snapshot.py

  - label: Distributed Tests (4 GPUs)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/
      - tests/distributed/test_utils
      - tests/distributed/test_pynccl
      - tests/distributed/test_events
      - tests/compile/fullgraph/test_basic_correctness.py
      - examples/offline_inference/rlhf.py
      - examples/offline_inference/rlhf_colocate.py
      - examples/offline_inference/new_weight_syncing/
      - tests/examples/offline_inference/data_parallel.py
      - tests/v1/distributed
      - tests/v1/engine/test_engine_core_client.py
      - tests/distributed/test_symm_mem_allreduce.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      # test with torchrun tp=2 and external_dp=2
      - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
      # test with torchrun tp=2 and pp=2
      - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
      # test with torchrun tp=4 and dp=1
      - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
      # test with torchrun tp=2, pp=2 and dp=1
      - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
      # test with torchrun tp=1 and dp=4 with ep
      - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
      # test with torchrun tp=2 and dp=2 with ep
      - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
      # test with internal dp
      - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
      - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
      - pytest -v -s distributed/test_utils.py
      - pytest -v -s compile/fullgraph/test_basic_correctness.py
      - pytest -v -s distributed/test_pynccl.py
      - pytest -v -s distributed/test_events.py
      - pytest -v -s distributed/test_symm_mem_allreduce.py
      # TODO: create a dedicated test section for multi-GPU example tests
      # when we have multiple distributed example tests
      # OLD rlhf examples
      - cd ../examples/offline_inference
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
      # NEW rlhf examples
      - cd new_weight_syncing
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py

  - label: Distributed Tests (8 GPUs)(H100)
    timeout_in_minutes: 10
    device: h100
    num_devices: 8
    working_dir: "/vllm-workspace/tests"
    source_file_dependencies:
      - examples/offline_inference/torchrun_dp_example.py
      - vllm/config/parallel.py
      - vllm/distributed/
      - vllm/v1/engine/llm_engine.py
      - vllm/v1/executor/uniproc_executor.py
      - vllm/v1/worker/gpu_worker.py
    commands:
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      # test with torchrun tp=2 and dp=4 with ep
      - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

  - label: Distributed Tests (4 GPUs)(A100)
    device: a100
    optional: true
    num_devices: 4
    source_file_dependencies:
      - vllm/
    commands:
      # NOTE: don't test llama model here, it seems hf implementation is buggy
      # see https://github.com/vllm-project/vllm/pull/5689 for details
      - pytest -v -s distributed/test_custom_all_reduce.py
      - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
      - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
      - pytest -v -s -x lora/test_mixtral.py

  - label: Distributed Tests (2 GPUs)(H100)
    timeout_in_minutes: 15
    device: h100
    optional: true
    working_dir: "/vllm-workspace/"
    num_devices: 2
    commands:
      - pytest -v -s tests/distributed/test_context_parallel.py
      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
      - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
      - pytest -v -s tests/v1/distributed/test_dbo.py

  - label: Distributed Tests (2 GPUs)(B200)
    device: b200
    optional: true
    working_dir: "/vllm-workspace/"
    num_devices: 2
    commands:
      - pytest -v -s tests/distributed/test_context_parallel.py
      - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
      - pytest -v -s tests/v1/distributed/test_dbo.py

  - label: 2 Node Test (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    num_nodes: 2
    no_plugin: true
    optional: true  # TODO: revert once infra issue solved
    source_file_dependencies:
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/model_executor/models/
      - tests/distributed/
      - tests/examples/offline_inference/data_parallel.py
    commands:
      # The helper script takes one quoted command string per node (node 0, then node 1).
      - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"

  - label: Distributed NixlConnector PD accuracy (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

  - label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

  - label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
      - tests/v1/kv_connector/nixl_integration/
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

  - label: Pipeline + Context Parallelism (4 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/distributed/
      - vllm/engine/
      - vllm/executor/
      - vllm/model_executor/models/
      - tests/distributed/
    commands:
      - pytest -v -s distributed/test_pp_cudagraph.py
      - pytest -v -s distributed/test_pipeline_parallel.py