[Core][Distributed] add same-node detection (#5369)

Author: youkaichao
Date: 2024-06-11 10:53:59 -07:00
Committer: GitHub
Parent: dcbf4286af
Commit: c4bd03c7c5
4 changed files with 87 additions and 1 deletion

File: vllm/distributed/device_communicators/custom_all_reduce.py

@@ -10,7 +10,7 @@ from vllm import _custom_ops as ops
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import (
-    get_local_rank, get_tensor_model_parallel_cpu_group)
+    get_local_rank, get_tensor_model_parallel_cpu_group, is_in_the_same_node)
 from vllm.logger import init_logger

 try:
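
The imported helper is_in_the_same_node comes from vllm.distributed.parallel_state, per the hunk above. As a rough illustration of the idea, the following is a minimal sketch, not the exact vLLM implementation (the function name is suffixed _sketch to make that explicit): same-node membership can be detected with a shared-memory handshake over a CPU (e.g. gloo) process group. Rank 0 creates a named shared-memory segment and broadcasts its name; every other rank tries to attach to it, which can only succeed on the same machine; an all-reduce of per-rank success flags then tells the whole group whether all ranks share the node.

# Illustrative sketch only (not the exact vLLM code): decide whether every
# rank of a CPU process group runs on the same node by testing whether all
# ranks can attach to a shared-memory segment created by rank 0.
import contextlib
from multiprocessing import shared_memory

import torch
import torch.distributed as dist


def is_in_the_same_node_sketch(pg: dist.ProcessGroup) -> bool:
    rank = dist.get_rank(group=pg)
    world_size = dist.get_world_size(group=pg)
    magic = b"same_node_check"
    # One flag per rank; flags[i] == 1 means rank i saw rank 0's segment.
    flags = torch.zeros(world_size, dtype=torch.int32)
    shm = None
    try:
        if rank == 0:
            # Rank 0 creates the segment and publishes its name to the group.
            shm = shared_memory.SharedMemory(create=True, size=128)
            shm.buf[:len(magic)] = magic
            dist.broadcast_object_list([shm.name], src=0, group=pg)
            flags[0] = 1
        else:
            # Other ranks receive the name and try to attach; this can only
            # succeed when they share /dev/shm (i.e. the node) with rank 0.
            recv = [None]
            dist.broadcast_object_list(recv, src=0, group=pg)
            with contextlib.suppress(OSError):
                shm = shared_memory.SharedMemory(name=recv[0])
                if bytes(shm.buf[:len(magic)]) == magic:
                    flags[rank] = 1
    finally:
        # Keep the segment alive until every rank has finished probing.
        dist.barrier(group=pg)
        if shm is not None:
            shm.close()
            if rank == 0:
                shm.unlink()
    # All ranks must have attached for the group to count as single-node.
    dist.all_reduce(flags, group=pg)
    return int(flags.sum().item()) == world_size

The second hunk below wires the check into CustomAllreduce.__init__: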
@@ -113,6 +113,13 @@ class CustomAllreduce:
         assert dist.get_backend(group) != dist.Backend.NCCL, (
             "CustomAllreduce should be attached to a non-NCCL group.")
+        if not is_in_the_same_node(group):
+            # No need to initialize custom allreduce for multi-node case.
+            logger.warning(
+                "Custom allreduce is disabled because this process group"
+                " spans across nodes.")
+            return
         rank = dist.get_rank(group=self.group)
         world_size = dist.get_world_size(group=self.group)
         if world_size == 1:
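
The early return is the point of the change: custom allreduce depends on intra-node GPU peer-to-peer access (hence the gpu_p2p_access_check import above), so a tensor-parallel group that spans nodes now logs a warning and skips initialization, falling back to the regular NCCL allreduce path instead of attempting a setup that cannot work. A quick way to exercise the new helper directly (hypothetical smoke-test script; the launch command and group choice are assumptions, not from this commit):

# same_node_test.py -- hypothetical smoke test, run with:
#   torchrun --nproc_per_node=2 same_node_test.py
import torch.distributed as dist
from vllm.distributed.parallel_state import is_in_the_same_node

dist.init_process_group(backend="gloo")  # CPU backend, matching vLLM's CPU-group usage
same = is_in_the_same_node(dist.group.WORLD)
print(f"rank {dist.get_rank()}: all ranks on the same node = {same}")
dist.destroy_process_group()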