diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 47658e505..f94f831a4 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -50,24 +50,18 @@ steps: - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- label: Distributed Tests (4 GPUs) - timeout_in_minutes: 50 +- label: Distributed Torchrun + Examples (4 GPUs) + timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_torchrun_example.py + - tests/distributed/test_torchrun_example_moe.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - examples/offline_inference/new_weight_syncing/ - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py - - tests/distributed/test_multiproc_executor.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 @@ -85,21 +79,6 @@ steps: - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - # test multi-node TP with multiproc executor (simulated on single node) - - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests # OLD rlhf examples - cd ../examples/offline_inference - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py @@ -109,6 +88,47 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py +- label: Distributed DP Tests (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_utils + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + +- label: Distributed Compile + Comm (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_multiproc_executor.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # test multi-node TP with multiproc executor (simulated on single node) + - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node + - label: Distributed Tests (8 GPUs)(H100) timeout_in_minutes: 10 device: h100