[DP] support torchrun external launcher with Data Parallelism (#24899)

Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Author: Lucia Fang
Date:   2025-09-22 12:06:05 -07:00
Committed by: GitHub
parent 239ef0c1ac
commit 922979bfcc
6 changed files with 202 additions and 7 deletions


@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
+import os
 from dataclasses import field
 from typing import TYPE_CHECKING, Any, Literal, Optional, Union
@@ -351,6 +352,10 @@ class ParallelConfig:
         self.world_size = self.pipeline_parallel_size * \
             self.tensor_parallel_size
 
+        if self.distributed_executor_backend == "external_launcher":
+            logger.info("Using external launcher for distributed inference.")
+            self.world_size *= self.data_parallel_size
+
         if self.data_parallel_size_local > self.data_parallel_size:
             raise ValueError(
                 f"data_parallel_size_local ({self.data_parallel_size_local}) "
@@ -358,6 +363,13 @@
         if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
             # Data parallel was specified in the engine args.
+            if self.distributed_executor_backend == "external_launcher":
+                # For external launcher,
+                # we need to set the data parallel rank automatically
+                self.data_parallel_rank = int(os.environ["RANK"]) \
+                    // (self.world_size // self.data_parallel_size)
+                logger.info("Set data_parallel_rank to %d automatically.",
+                            self.data_parallel_rank)
             if not self._data_parallel_master_port_list:
                 self._data_parallel_master_port_list = get_open_ports_list(5)
             self.data_parallel_master_port = \
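The rank derivation added here is plain integer division over the global rank that torchrun exports: consecutive blocks of world_size // data_parallel_size ranks form one DP replica. A self-contained sketch with hypothetical values (RANK is normally set by torchrun, not by hand):

    import os

    os.environ.setdefault("RANK", "5")  # hypothetical; torchrun sets this
    world_size = 8                       # TP=2 * DP=4
    data_parallel_size = 4

    ranks_per_replica = world_size // data_parallel_size  # 2 (TP * PP)
    data_parallel_rank = int(os.environ["RANK"]) // ranks_per_replica
    print(data_parallel_rank)            # global rank 5 -> DP rank 2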
@@ -380,7 +392,6 @@
                 "be set when data_parallel_size > 1")
 
         if self.distributed_executor_backend == "external_launcher":
-            import os
             os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
             logger.info("Disabling V1 multiprocessing for external launcher.")