[V1][DP] More robust DP/EP dummy request coordination (#16277)

Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-22 19:12:15 -07:00
parent bc7c4d206b
commit 1e013fa388
4 changed files with 94 additions and 57 deletions
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -325,7 +325,7 @@ class EngineCoreProc(EngineCore):

        self.step_fn = (self.step if self.batch_queue is None else
                        self.step_with_batch_queue)
-        self.global_unfinished_reqs = False
+        self.engines_running = False

        # Background Threads and Queues for IO. These enable us to
        # overlap ZMQ socket IO with GPU since they release the GIL,
@@ -410,8 +410,7 @@ class EngineCoreProc(EngineCore):
        """Exits when an engine step needs to be performed."""

        waited = False
-        while not self.global_unfinished_reqs and not (
-                self.scheduler.has_requests()):
+        while not self.engines_running and not (self.scheduler.has_requests()):
            if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                logger.debug("EngineCore waiting for work.")
                waited = True
@@ -419,10 +418,7 @@ class EngineCoreProc(EngineCore):
            self._handle_client_request(*req)

        if waited:
-            logger.debug(
-                "EngineCore loop active - local unfinished: %s, finished: %s.",
-                self.scheduler.has_unfinished_requests(),
-                self.scheduler.has_finished_requests())
+            logger.debug("EngineCore loop active.")

        # Handle any more client requests.
        while not self.input_queue.empty():
@@ -446,10 +442,6 @@ class EngineCoreProc(EngineCore):
            self.add_request(request)
        elif request_type == EngineCoreRequestType.ABORT:
            self.abort_requests(request)
-        elif request_type == EngineCoreRequestType.START_DP:
-            if not self.global_unfinished_reqs:
-                logger.debug("EngineCore starting idle loop.")
-                self.global_unfinished_reqs = True
        elif request_type == EngineCoreRequestType.UTILITY:
            call_id, method_name, args = request
            output = UtilityOutput(call_id)
@@ -548,9 +540,6 @@ class EngineCoreProc(EngineCore):
                socket.send_multipart(buffers, copy=False)


-ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True)
-
-
 class DPEngineCoreProc(EngineCoreProc):
    """ZMQ-wrapper for running EngineCore in background process
    in a data parallel context."""
@@ -587,7 +576,9 @@ class DPEngineCoreProc(EngineCoreProc):
                for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) *
                               tp_size))

+        self.local_dp_rank = local_dp_rank
        self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
+        self.current_wave = 0

        # Initialize the engine after setting up environment.
        super().__init__(input_path, output_path, vllm_config, executor_class,
@@ -602,6 +593,31 @@ class DPEngineCoreProc(EngineCoreProc):
        if dp_group := getattr(self, "dp_group", None):
            stateless_destroy_torch_distributed_process_group(dp_group)

+    def add_request(self, request: EngineCoreRequest):
+        if request.current_wave != self.current_wave:
+            if request.current_wave > self.current_wave:
+                self.current_wave = request.current_wave
+            elif not self.engines_running:
+                # Request received for an already-completed wave, notify
+                # front-end that we need to start the next one.
+                self.output_queue.put_nowait(
+                    EngineCoreOutputs(start_wave=self.current_wave))
+
+        super().add_request(request)
+
+    def _handle_client_request(self, request_type: EngineCoreRequestType,
+                               request: Any) -> None:
+        if request_type == EngineCoreRequestType.START_DP_WAVE:
+            new_wave: int = request
+            if new_wave >= self.current_wave:
+                self.current_wave = new_wave
+                if not self.engines_running:
+                    logger.debug("EngineCore starting idle loop for wave %d.",
+                                 new_wave)
+                    self.engines_running = True
+        else:
+            super()._handle_client_request(request_type, request)
+
    def run_busy_loop(self):
        """Core busy loop of the EngineCore for data parallel case."""

@@ -628,7 +644,7 @@ class DPEngineCoreProc(EngineCoreProc):
                    # up-to-date state is returned in the engine outputs.
                    self._process_engine_step()

-                if not self.global_unfinished_reqs:
+                if not self.engines_running:
                    # All engines are idle.
                    continue

@@ -637,18 +653,23 @@ class DPEngineCoreProc(EngineCoreProc):
                self.execute_dummy_batch()

            # 3) All-reduce operation to determine global unfinished reqs.
-            self.global_unfinished_reqs = self._has_global_unfinished_reqs(
+            self.engines_running = self._has_global_unfinished_reqs(
                local_unfinished_reqs)

-            if not self.global_unfinished_reqs:
-                # Notify client that we are pausing the loop.
-                self.output_queue.put_nowait(ENGINE_PAUSED_OUTPUTS)
+            if not self.engines_running:
+                if self.local_dp_rank == 0:
+                    # Notify client that we are pausing the loop.
+                    logger.debug("Wave %d finished, pausing engine loop.",
+                                 self.current_wave)
+                    self.output_queue.put_nowait(
+                        EngineCoreOutputs(wave_complete=self.current_wave))
+                self.current_wave += 1

    def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:

-        # Optimization - only perform finish-sync all-reduce every 16 steps.
+        # Optimization - only perform finish-sync all-reduce every 24 steps.
        self.counter += 1
-        if self.counter != 16:
+        if self.counter != 24:
            return True
        self.counter = 0