elastic_ep: Fix issues with repeated scale up/down cycles (#37131)

Signed-off-by: Itay Alroy <ialroy@nvidia.com>
Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
commit c57d38d603 (parent e5ed6c6c13)
Author: Itay Alroy
Date:   2026-03-21 01:13:02 +02:00
Committed by: GitHub

10 changed files with 129 additions and 90 deletions

@@ -315,30 +315,12 @@ class Worker(WorkerBase):
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
-    def load_model(self) -> None:
-        dummy_weights = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1"
-        if dummy_weights:
-            (
-                expanded_physical_to_logical,
-                num_logical_experts,
-                old_num_physical_experts,
-            ) = self.elastic_ep_executor.receive_expert_mapping()
-            num_physical_experts = expanded_physical_to_logical.shape[1]
-            self.parallel_config.eplb_config.num_redundant_experts = (
-                num_physical_experts - num_logical_experts
-            )
+    def load_model(self, *, load_dummy_weights: bool = False) -> None:
         with (
             self._maybe_get_memory_pool_context(tag="weights"),
             set_current_vllm_config(self.vllm_config),
         ):
-            self.model_runner.load_model(load_dummy_weights=dummy_weights)
-        if dummy_weights:
-            self.model_runner.setup_eplb_from_mapping(
-                expanded_physical_to_logical, old_num_physical_experts
-            )
-            self.model_runner.eep_eplb_suppressed = True
+            self.model_runner.load_model(load_dummy_weights=load_dummy_weights)

     def update_config(self, overrides: dict[str, Any]) -> None:
         self.model_runner.update_config(overrides)
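
With this change, Worker.load_model no longer reads the VLLM_ELASTIC_EP_SCALE_UP_LAUNCH environment variable or receives the expert mapping itself; the caller now decides whether dummy weights are loaded via the new keyword-only flag. A minimal sketch of such a call site is shown below; the real caller lives in one of the other changed files of this commit, so the function name and the `worker` handle here are assumptions for illustration only.

    import os

    def maybe_load_with_dummy_weights(worker) -> None:
        # Hypothetical call site (the actual one is elsewhere in this commit).
        # On an elastic-EP scale-up launch, the freshly started rank loads
        # dummy weights first; real weights and the expert mapping are
        # installed afterwards by the elastic-EP machinery.
        load_dummy_weights = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1"
        worker.load_model(load_dummy_weights=load_dummy_weights)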