[Core][Distributed] add coordinator to reduce code duplication in tp and pp (#5293)
Author: youkaichao
Date: 2024-06-12 17:27:08 -07:00
Committed by: GitHub
Parent: 2135cacb45
Commit: ea3890a5f0
12 changed files with 625 additions and 585 deletions

@@ -11,7 +11,6 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 
 import vllm.envs as envs
-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -162,7 +161,8 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
         f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
     )
     os.makedirs(os.path.dirname(path), exist_ok=True)
-    if ((not is_distributed or get_local_rank() == 0)
+    from vllm.distributed.parallel_state import get_world_group
+    if ((not is_distributed or get_world_group().local_rank == 0)
             and (not os.path.exists(path))):
         # only the local master process (with local_rank == 0) can
         # enter this block to calculate the cache
@@ -174,8 +174,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
         with open(path, "w") as f:
             json.dump(cache, f, indent=4)
     if is_distributed:
-        cpu_world_group = get_cpu_world_group()
-        dist.barrier(cpu_world_group)
+        get_world_group().barrier()
     logger.info("reading GPU P2P access cache from %s", path)
     with open(path, "r") as f:
         cache = json.load(f)
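
For orientation, the refactor behind this diff centers on a coordinator object that bundles a process group's device group, CPU group, and ranks behind one interface, so call sites like gpu_p2p_access_check no longer reach for separate helpers such as get_cpu_world_group() and get_local_rank(). The sketch below is a minimal illustration of that pattern, not the actual vLLM class: the local_rank, barrier(), and get_world_group() names come from the diff, while the constructor, the group wiring, and the init_world_group helper are simplified assumptions.

# Minimal sketch of the coordinator pattern this commit introduces.
# Names taken from the diff: local_rank, barrier(), get_world_group().
# Everything else is a simplified assumption, not vLLM's implementation.
from typing import List, Optional

import torch.distributed as dist


class GroupCoordinator:
    """Bundles the device and CPU process groups of one parallel group,
    so callers use a single object instead of scattered helper functions."""

    def __init__(self, ranks: List[int], local_rank: int,
                 backend: str = "nccl"):
        # Assumes dist.init_process_group() has already been called.
        self.ranks = ranks
        self.local_rank = local_rank
        self.rank = dist.get_rank()
        # Device group for GPU collectives; a gloo-backed CPU group for
        # lightweight synchronization such as barriers.
        self.device_group = dist.new_group(ranks, backend=backend)
        self.cpu_group = dist.new_group(ranks, backend="gloo")

    def barrier(self):
        # Barrier on the CPU group, so synchronizing does not touch the
        # GPU; this is what lets the diff collapse the explicit
        # get_cpu_world_group()/dist.barrier() pair into one call.
        dist.barrier(group=self.cpu_group)


_WORLD: Optional[GroupCoordinator] = None


def init_world_group(local_rank: int) -> None:
    # Hypothetical helper; the real initialization lives in
    # vllm.distributed.parallel_state.
    global _WORLD
    _WORLD = GroupCoordinator(list(range(dist.get_world_size())), local_rank)


def get_world_group() -> GroupCoordinator:
    assert _WORLD is not None, "distributed environment is not initialized"
    return _WORLD

With a coordinator in place, the two call-site changes above become one attribute lookup each: get_world_group().local_rank replaces get_local_rank(), and get_world_group().barrier() replaces fetching the CPU world group and passing it to dist.barrier() by hand.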