[Bugfix] Fix MessageQueue connect_ip for cross-node data parallelism (#35429)

Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
This commit is contained in:
Lucia Fang
2026-02-26 14:08:16 -08:00
committed by GitHub
parent d0105b84f0
commit 0f2f24c8b2
2 changed files with 93 additions and 1 deletions

View File

@@ -44,6 +44,7 @@ from vllm.logger import init_logger
from vllm.tracing import instrument, maybe_init_worker_tracer
from vllm.utils.network_utils import (
get_distributed_init_method,
get_ip,
get_loopback_ip,
get_open_port,
)
@@ -128,11 +129,23 @@ class MultiprocExecutor(Executor):
# For leader node within each dp rank,
# each dp will have its own leader multiproc executor.
max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
mq_connect_ip = get_ip()
logger.info(
"DP group leader: node_rank=%d, node_rank_within_dp=%d, "
"master_addr=%s, mq_connect_ip=%s (local), "
"world_size=%d, local_world_size=%d",
self.parallel_config.node_rank,
self.parallel_config.node_rank_within_dp,
self.parallel_config.master_addr,
mq_connect_ip,
self.world_size,
self.local_world_size,
)
self.rpc_broadcast_mq = MessageQueue(
self.world_size,
self.local_world_size,
max_chunk_bytes=max_chunk_bytes,
connect_ip=self.parallel_config.master_addr,
connect_ip=mq_connect_ip,
)
scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
# Create workers