[CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-25 21:29:00 -04:00
parent 2a97ffc33d
commit 906e461ed6
9 changed files with 40 additions and 12 deletions
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
                              tensor_model_parallel_all_reduce,
                              tensor_model_parallel_reduce_scatter)

-from ..utils import init_test_distributed_environment, multi_process_parallel
+from ..utils import (init_test_distributed_environment, multi_gpu_test,
+                     multi_process_parallel)


@ray.remote(num_gpus=1, max_calls=1)
@@ -226,8 +227,7 @@ def send_recv_test_worker(
        torch.testing.assert_close(test_tensor, recv_tensor)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target", [
    all_reduce_test_worker, all_gather_test_worker,
@@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel(
    multi_process_parallel(monkeypatch, tp_size, 1, test_target)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
@@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel(
    multi_process_parallel(monkeypatch, 1, pp_size, test_target)


-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize("test_target", [