[core][distributed] zmq fallback for broadcasting large objects (#6183)

[core][distributed] add zmq fallback for broadcasting large objects (#6183)
2024-07-09 18:49:11 -07:00
parent 2416b26e11
commit da78caecfa
6 changed files with 274 additions and 80 deletions
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -2,10 +2,11 @@ import os

 import torch

-from vllm.distributed.parallel_state import is_in_the_same_node
+from vllm.distributed.parallel_state import in_the_same_node_as

 torch.distributed.init_process_group(backend="gloo")
-test_result = is_in_the_same_node(torch.distributed.group.WORLD)
+test_result = all(
+    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))

 expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
 assert test_result == expected, f"Expected {expected}, got {test_result}"