[Bugfix] Fix MessageQueue connect_ip for cross-node data parallelism (#35429)
Signed-off-by: Lu Fang <fanglu@fb.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
This commit is contained in:
@@ -44,6 +44,7 @@ from vllm.logger import init_logger
|
||||
from vllm.tracing import instrument, maybe_init_worker_tracer
|
||||
from vllm.utils.network_utils import (
|
||||
get_distributed_init_method,
|
||||
get_ip,
|
||||
get_loopback_ip,
|
||||
get_open_port,
|
||||
)
|
||||
@@ -128,11 +129,23 @@ class MultiprocExecutor(Executor):
|
||||
# For leader node within each dp rank,
|
||||
# each dp will have its own leader multiproc executor.
|
||||
max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
|
||||
mq_connect_ip = get_ip()
|
||||
logger.info(
|
||||
"DP group leader: node_rank=%d, node_rank_within_dp=%d, "
|
||||
"master_addr=%s, mq_connect_ip=%s (local), "
|
||||
"world_size=%d, local_world_size=%d",
|
||||
self.parallel_config.node_rank,
|
||||
self.parallel_config.node_rank_within_dp,
|
||||
self.parallel_config.master_addr,
|
||||
mq_connect_ip,
|
||||
self.world_size,
|
||||
self.local_world_size,
|
||||
)
|
||||
self.rpc_broadcast_mq = MessageQueue(
|
||||
self.world_size,
|
||||
self.local_world_size,
|
||||
max_chunk_bytes=max_chunk_bytes,
|
||||
connect_ip=self.parallel_config.master_addr,
|
||||
connect_ip=mq_connect_ip,
|
||||
)
|
||||
scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
|
||||
# Create workers
|
||||
|
||||
Reference in New Issue
Block a user