elastic_ep: Fix issues with repeated scale up/down cycles (#37131)
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
This commit is contained in:
@@ -602,13 +602,14 @@ class WorkerProc:
         )
 
         # Load model
-        self.worker.init_device()
-        # Update process title now that parallel groups are initialized
-        self.setup_proc_title_and_log_prefix(
-            enable_ep=vllm_config.parallel_config.enable_expert_parallel
-        )
+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
+        if not is_eep_new_worker:
+            self.worker.init_device()
+            # Update process title now that parallel groups are initialized
+            self.setup_proc_title_and_log_prefix(
+                enable_ep=vllm_config.parallel_config.enable_expert_parallel
+            )
         if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
             self.worker.elastic_ep_execute("load_model")
         else:
             self.worker.load_model()
 
         scheduler_config = vllm_config.scheduler_config
||||
Reference in New Issue
Block a user