[BugFix] Fix P/D with non-MoE DP (#33037)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 0cd259b2d8)
This commit is contained in:
Nick Hill
2026-01-27 08:03:47 -08:00
committed by khluu
parent 0d8ce320a2
commit 7779de34da
2 changed files with 18 additions and 11 deletions

View File

@@ -911,6 +911,17 @@ class EngineCoreProc(EngineCore):
set_process_title("EngineCore")
decorate_logs()
if data_parallel and vllm_config.kv_transfer_config is not None:
# Append the local_dp_rank to the engine_id so that each DP rank's
# kv_transfer_config carries its own engine_id.
vllm_config.kv_transfer_config.engine_id = (
f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
)
logger.debug(
"Setting kv_transfer_config.engine_id to %s",
vllm_config.kv_transfer_config.engine_id,
)
parallel_config.data_parallel_index = dp_rank
if data_parallel and vllm_config.model_config.is_moe:
# Set data parallel rank for this engine process.
@@ -1285,17 +1296,6 @@ class DPEngineCoreProc(EngineCoreProc):
assert local_dp_rank is not None
assert 0 <= local_dp_rank <= dp_rank < dp_size
if vllm_config.kv_transfer_config is not None:
# modify the engine_id and append the local_dp_rank to it to ensure
# that the kv_transfer_config is unique for each DP rank.
vllm_config.kv_transfer_config.engine_id = (
f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
)
logger.debug(
"Setting kv_transfer_config.engine_id to %s",
vllm_config.kv_transfer_config.engine_id,
)
self.dp_rank = dp_rank
self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()

View File

@@ -313,6 +313,13 @@ class CoreEngineActorManager:
dp_vllm_config.parallel_config.placement_group = pg
local_client = index < local_engine_count
if dp_size > 1 and dp_vllm_config.kv_transfer_config is not None:
# Append the local_index to the engine_id so that each DP rank's
# kv_transfer_config carries its own engine_id.
dp_vllm_config.kv_transfer_config.engine_id = (
f"{dp_vllm_config.kv_transfer_config.engine_id}_dp{local_index}"
)
# Ray XPU known issue: dpctl initializes the GPU runtime early, so
# setting device env vars in Ray actor's initialization method
# will not affect device selection. See: