[Bugfix] Fix inner_dp_world initialization order for multi-node TP (#35892)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -608,7 +608,6 @@ class WorkerProc:
|
||||
)
|
||||
|
||||
# Load model
|
||||
-self._init_message_queues(input_shm_handle, vllm_config)
|
||||
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
|
||||
if not is_eep_new_worker:
|
||||
self.worker.init_device()
|
||||
@@ -618,6 +617,10 @@ class WorkerProc:
|
||||
)
|
||||
self.worker.load_model()
|
||||
|
||||
+# Initialize message queues after init_device() since multi-node setups
+# (nnodes_within_dp > 1) require distributed groups to be initialized
+self._init_message_queues(input_shm_handle, vllm_config)
|
||||
|
||||
# Enable environment variable cache (e.g. assume no more
|
||||
# environment variable overrides after this point)
|
||||
enable_envs_cache()
|
||||
|
||||
Reference in New Issue
Block a user