[Core][Distributed] add coordinator to reduce code duplication in tp and pp (#5293)
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -9,8 +9,7 @@ import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
-from vllm.distributed.parallel_state import (
-    get_local_rank, get_tensor_model_parallel_cpu_group, is_in_the_same_node)
+from vllm.distributed.parallel_state import is_in_the_same_node
 from vllm.logger import init_logger

 try:
@@ -86,8 +85,8 @@ class CustomAllreduce:

     # max_size: max supported allreduce size
     def __init__(self,
-                 group: Optional[ProcessGroup] = None,
-                 device: Optional[Union[int, str, torch.device]] = None,
+                 group: ProcessGroup,
+                 device: Union[int, str, torch.device],
                  max_size=8192 * 1024) -> None:
         """
         Args:
@@ -107,7 +106,6 @@ class CustomAllreduce:
             # e.g. in a non-cuda environment
             return

-        group = group or get_tensor_model_parallel_cpu_group()
         self.group = group

         assert dist.get_backend(group) != dist.Backend.NCCL, (
@@ -134,10 +132,7 @@ class CustomAllreduce:
                 world_size, str(CustomAllreduce._SUPPORTED_WORLD_SIZES))
             return

-        if device is None:
-            local_rank = get_local_rank()
-            device = torch.device(f"cuda:{local_rank}")
-        elif isinstance(device, int):
+        if isinstance(device, int):
             device = torch.device(f"cuda:{device}")
         elif isinstance(device, str):
             device = torch.device(device)
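For illustration only: with the `Optional` defaults removed, a caller must now supply the process group and device itself rather than letting the constructor fall back to the tensor-parallel CPU group and the local rank. A minimal sketch of such a caller, assuming torch.distributed is already initialized with one GPU per rank (the gloo group and the rank-as-device choice here are assumptions for the example, not taken from the commit):

    import torch
    import torch.distributed as dist

    from vllm.distributed.device_communicators.custom_all_reduce import (
        CustomAllreduce)

    # CustomAllreduce must be attached to a non-NCCL group; the CUDA
    # device is now always chosen explicitly by the caller.
    cpu_group = dist.new_group(backend="gloo")
    rank = dist.get_rank(cpu_group)  # single-node assumption: rank == local rank
    ca = CustomAllreduce(group=cpu_group,
                         device=torch.device(f"cuda:{rank}"))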
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -11,7 +11,6 @@ import torch.distributed as dist
 import torch.multiprocessing as mp

 import vllm.envs as envs
-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
 from vllm.logger import init_logger

 logger = init_logger(__name__)
@@ -162,7 +161,8 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
         f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
     )
     os.makedirs(os.path.dirname(path), exist_ok=True)
-    if ((not is_distributed or get_local_rank() == 0)
+    from vllm.distributed.parallel_state import get_world_group
+    if ((not is_distributed or get_world_group().local_rank == 0)
             and (not os.path.exists(path))):
         # only the local master process (with local_rank == 0) can
         # enter this block to calculate the cache
@@ -174,8 +174,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
         with open(path, "w") as f:
             json.dump(cache, f, indent=4)
     if is_distributed:
-        cpu_world_group = get_cpu_world_group()
-        dist.barrier(cpu_world_group)
+        get_world_group().barrier()
     logger.info("reading GPU P2P access cache from %s", path)
     with open(path, "r") as f:
         cache = json.load(f)
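The change keeps the same single-writer pattern: one designated process computes and writes the P2P cache file, every rank blocks on a barrier, then all ranks read the file. A standalone sketch of that pattern, simplified to global rank 0 as the writer (the real code uses the world group's local rank, and the helper name here is made up):

    import json
    import os

    import torch.distributed as dist

    def load_or_build_cache(path: str, build) -> dict:
        # Single-writer cache: one rank writes, everyone waits, all read.
        # Assumes `path` contains a directory component.
        is_distributed = dist.is_initialized()
        if ((not is_distributed or dist.get_rank() == 0)
                and not os.path.exists(path)):
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "w") as f:
                json.dump(build(), f, indent=4)
        if is_distributed:
            dist.barrier()  # ensure the writer finished before anyone reads
        with open(path, "r") as f:
            return json.load(f)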
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -9,7 +9,6 @@ from torch.distributed import ProcessGroup, ReduceOp
 from vllm.distributed.device_communicators.pynccl_wrapper import (
     NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum,
     ncclRedOpTypeEnum, ncclUniqueId)
-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
 from vllm.logger import init_logger

 logger = init_logger(__name__)
@@ -19,8 +18,8 @@ class PyNcclCommunicator:

     def __init__(
         self,
-        group: Optional[ProcessGroup] = None,
-        device: Optional[Union[int, str, torch.device]] = None,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
         library_path: Optional[str] = None,
     ):
         """
@@ -35,7 +34,6 @@ class PyNcclCommunicator:
         is bind to a unique device.
         """
         assert dist.is_initialized()
-        group = get_cpu_world_group() if group is None else group
         assert dist.get_backend(group) != dist.Backend.NCCL, (
             "PyNcclCommunicator should be attached to a non-NCCL group.")
         self.group = group
@@ -77,10 +75,7 @@ class PyNcclCommunicator:
         byte_list = tensor.tolist()
         for i, byte in enumerate(byte_list):
             self.unique_id.internal[i] = byte
-        if device is None:
-            local_rank = get_local_rank()
-            device = torch.device(f"cuda:{local_rank}")
-        elif isinstance(device, int):
+        if isinstance(device, int):
             device = torch.device(f"cuda:{device}")
         elif isinstance(device, str):
             device = torch.device(device)
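As with CustomAllreduce, PyNcclCommunicator now receives its group and device from the caller; an int device is still normalized to cuda:<int>. A minimal sketch of a caller under the same assumptions as above (initialized torch.distributed, one GPU per rank; the gloo group is an illustrative choice, not from the commit):

    import torch.distributed as dist

    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator

    # The group must use a non-NCCL backend: NCCL itself is driven through
    # the pynccl wrapper, while this group carries CPU-side coordination.
    cpu_group = dist.new_group(backend="gloo")
    comm = PyNcclCommunicator(group=cpu_group,
                              device=dist.get_rank(cpu_group))  # int -> cuda:<int>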