[Core][Bugfix] Use correct device to initialize GPU data during CUDA-graph-capture (#11233)

Signed-off-by: Yan Burman <yanburman@users.noreply.github.com> Signed-off-by: Ido Asraff <idoa@atero.ai>
2025-01-04 08:50:16 +02:00
parent d91457d529
commit 300acb8347
5 changed files with 23 additions and 15 deletions
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -107,7 +107,7 @@ def multiple_allreduce_with_vllm_worker_fn():
    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
    ensure_model_parallel_initialized(2, 2)
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
-    with graph_capture():
+    with graph_capture(device=device):
        # two tp groups can communicate independently
        if torch.distributed.get_rank() in [0, 1]:
            tensor = tensor_model_parallel_all_reduce(tensor)