[Bugfix] Fix inner_dp_world initialization order for multi-node TP (#35892)

Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Yongye Zhu
2026-03-06 01:04:44 -05:00
committed by GitHub
parent 27066d1b2b
commit 86e1060b17
3 changed files with 11 additions and 3 deletions

View File

@@ -608,7 +608,6 @@ class WorkerProc:
)
# Load model
self._init_message_queues(input_shm_handle, vllm_config)
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
if not is_eep_new_worker:
self.worker.init_device()
@@ -618,6 +617,10 @@ class WorkerProc:
)
self.worker.load_model()
# Initialize message queues only after init_device(), since multi-node setups
# (nnodes_within_dp > 1) require distributed groups to be initialized first.
self._init_message_queues(input_shm_handle, vllm_config)
# Enable environment variable cache (e.g. assume no more
# environment variable overrides after this point)
enable_envs_cache()