[BugFix] Support online dense model DP without overhead (#30739)
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: njhill <nickhill123@gmail.com>
This commit is contained in:
@@ -179,22 +179,20 @@ class Worker(WorkerBase):
|
||||
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||
|
||||
def init_device(self):
|
||||
device = self.device_config.device
|
||||
if isinstance(device, torch.device) and device.type == "cuda":
|
||||
if self.device_config.device_type == "cuda":
|
||||
# This env var set by Ray causes exceptions with graph building.
|
||||
os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
|
||||
parallel_config = self.parallel_config
|
||||
if (
|
||||
self.parallel_config.data_parallel_size > 1
|
||||
and self.parallel_config.data_parallel_size_local > 0
|
||||
and self.parallel_config.distributed_executor_backend
|
||||
not in ["ray", "external_launcher"]
|
||||
and self.vllm_config.parallel_config.data_parallel_backend != "ray"
|
||||
and self.vllm_config.parallel_config.nnodes_within_dp == 1
|
||||
parallel_config.distributed_executor_backend
|
||||
not in ("ray", "external_launcher")
|
||||
and parallel_config.data_parallel_backend != "ray"
|
||||
and parallel_config.nnodes_within_dp == 1
|
||||
):
|
||||
# Use local DP rank if available, otherwise use global DP rank.
|
||||
dp_local_rank = self.parallel_config.data_parallel_rank_local
|
||||
if dp_local_rank is None:
|
||||
dp_local_rank = self.parallel_config.data_parallel_rank
|
||||
dp_local_rank = self.parallel_config.data_parallel_index
|
||||
|
||||
tp_pp_world_size = (
|
||||
self.parallel_config.pipeline_parallel_size
|
||||
|
||||
Reference in New Issue
Block a user