[core][distributed] add pynccl broadcast (#10843)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-02 20:53:23 -08:00
parent a4cf256159
commit 21fe7b481a
3 changed files with 78 additions and 2 deletions
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -61,6 +61,7 @@ def worker_fn():
                        dtype=torch.float32).cuda(pynccl_comm.rank)
    with pynccl_comm.change_state(enable=True):
        tensor = pynccl_comm.all_reduce(tensor)
+    torch.cuda.synchronize()
    result = tensor.mean().cpu().item()
    assert result == pynccl_comm.world_size

@@ -86,10 +87,12 @@ def multiple_allreduce_worker_fn():
        if torch.distributed.get_rank() in [0, 1]:
            tensor = pynccl_comm.all_reduce(tensor)
            tensor = pynccl_comm.all_reduce(tensor)
+            torch.cuda.synchronize()
            result = tensor.mean().cpu().item()
            assert result == 4
        else:
            tensor = pynccl_comm.all_reduce(tensor)
+            torch.cuda.synchronize()
            result = tensor.mean().cpu().item()
            assert result == 2

@@ -112,10 +115,12 @@ def multiple_allreduce_with_vllm_worker_fn():
        if torch.distributed.get_rank() in [0, 1]:
            tensor = tensor_model_parallel_all_reduce(tensor)
            tensor = tensor_model_parallel_all_reduce(tensor)
+            torch.cuda.synchronize()
            result = tensor.mean().cpu().item()
            assert result == 4
        else:
            tensor = tensor_model_parallel_all_reduce(tensor)
+            torch.cuda.synchronize()
            result = tensor.mean().cpu().item()
            assert result == 2

@@ -141,9 +146,9 @@ def worker_fn_with_cudagraph():
                graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
                    enable=True):
            a_out = pynccl_comm.all_reduce(a)
-        pynccl_comm.stream.synchronize()
+        torch.cuda.synchronize()
        graph.replay()
-        pynccl_comm.stream.synchronize()
+        torch.cuda.synchronize()
        assert a_out.mean().cpu().item() == pynccl_comm.world_size**1


@@ -170,6 +175,7 @@ def all_gather_worker_fn():

    with pynccl_comm.change_state(enable=True):
        pynccl_comm.all_gather(result, tensor)
+    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


@@ -207,6 +213,7 @@ def reduce_scatter_worker_fn():

    with pynccl_comm.change_state(enable=True):
        pynccl_comm.reduce_scatter(result, tensor)
+    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


@@ -241,6 +248,7 @@ def send_recv_worker_fn():
            pynccl_comm.recv(tensor,
                             src=(pynccl_comm.rank - 1) %
                             pynccl_comm.world_size)
+    torch.cuda.synchronize()
    result = tensor.mean().cpu().item()
    assert result == 1

@@ -280,6 +288,7 @@ def multiple_send_recv_worker_fn():
            pynccl_comm.recv(tensor,
                             src=(pynccl_comm.rank - 1) %
                             pynccl_comm.world_size)
+    torch.cuda.synchronize()
    result = tensor.mean().cpu().item()
    if torch.distributed.get_rank() in [0, 2]:
        assert result == 1
@@ -293,6 +302,38 @@ def test_pynccl_multiple_send_recv():
    distributed_run(multiple_send_recv_worker_fn, 4)


+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+def test_pynccl_broadcast():
+    distributed_run(broadcast_worker_fn, 4)  # 4: presumably the world size
+
+
+@worker_fn_wrapper
+def broadcast_worker_fn():
+    # Broadcast once with every rank acting as root, so each rank ends up
+    # holding every rank's tensor (the net effect matches an all-gather).
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
+    recv_tensors = [
+        torch.empty(16,
+                    1024,
+                    1024,
+                    dtype=torch.float32,
+                    device=pynccl_comm.device)
+        for i in range(pynccl_comm.world_size)  # one buffer per rank
+    ]
+    recv_tensors[pynccl_comm.rank] = torch.ones(
+        16, 1024, 1024, dtype=torch.float32,
+        device=pynccl_comm.device) * pynccl_comm.rank
+
+    for i in range(pynccl_comm.world_size):
+        pynccl_comm.broadcast(recv_tensors[i], src=i)
+        # the broadcast op might have been launched on a different stream,
+        # so synchronize to make sure the tensor is ready before checking it
+        torch.cuda.synchronize()
+        assert torch.all(recv_tensors[i] == i).cpu().item()
+
+
 def test_ncclGetUniqueId():
    lib = NCCLLibrary()
    unique_id = lib.ncclGetUniqueId()