[Core][Distributed] add same-node detection (#5369)
@@ -10,7 +10,7 @@ from vllm import _custom_ops as ops
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import (
-    get_local_rank, get_tensor_model_parallel_cpu_group)
+    get_local_rank, get_tensor_model_parallel_cpu_group, is_in_the_same_node)
 from vllm.logger import init_logger
 
 try:
@@ -113,6 +113,13 @@ class CustomAllreduce:
         assert dist.get_backend(group) != dist.Backend.NCCL, (
             "CustomAllreduce should be attached to a non-NCCL group.")
 
+        if not is_in_the_same_node(group):
+            # No need to initialize custom allreduce for multi-node case.
+            logger.warning(
+                "Custom allreduce is disabled because this process group"
+                " spans across nodes.")
+            return
+
         rank = dist.get_rank(group=self.group)
         world_size = dist.get_world_size(group=self.group)
         if world_size == 1:
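
For context, here is a minimal sketch (an illustration, not the actual vLLM helper) of how a same-node check over a CPU-backed process group could be written: each rank all-gathers a per-host identifier, and the group counts as single-node only if every identifier matches. The function name is_in_the_same_node_sketch and the boot-id heuristic are assumptions made for this example; the real is_in_the_same_node may rely on a different mechanism, such as testing actual shared-memory visibility between ranks.

import socket

import torch.distributed as dist


def _host_id() -> str:
    # The kernel boot id is shared by every process on the same host and
    # differs across hosts, so it is a more robust marker than a hostname,
    # which can collide across machines or inside containers.
    try:
        with open("/proc/sys/kernel/random/boot_id") as f:
            return f.read().strip()
    except OSError:
        return socket.gethostname()


def is_in_the_same_node_sketch(group: dist.ProcessGroup) -> bool:
    ids = [None] * dist.get_world_size(group=group)
    # all_gather_object needs a CPU-capable backend such as Gloo, which is
    # why the check runs on the tensor-parallel CPU group, not a NCCL group.
    dist.all_gather_object(ids, _host_id(), group=group)
    return len(set(ids)) == 1

Running the check on the CPU group is consistent with the assertion above: CustomAllreduce is attached to a non-NCCL group, so a collective like this can execute before any GPU communicator is set up.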