[RL] Add Pause and Resume Generation for Asynchronous RL Training (#28037)

Signed-off-by: SamitHuang <285365963@qq.com> Signed-off-by: Samit <285365963@qq.com> Signed-off-by: samithuang <285365963@qq.com> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-11-20 19:01:03 +08:00
parent c9e093116c
commit 371b1d4c61
4 changed files with 182 additions and 0 deletions
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -149,6 +149,33 @@ class EngineClient(ABC):
        """Load a new LoRA adapter into the engine for future requests."""
        ...

+    @abstractmethod
+    async def pause_generation(
+        self,
+        *,
+        wait_for_inflight_requests: bool = False,
+        clear_cache: bool = True,
+    ) -> None:
+        """Pause new generation/encoding requests.
+
+        Args:
+            wait_for_inflight_requests: When ``True``, wait for in-flight
+                requests to finish before pausing. When ``False`` (default),
+                abort in-flight requests immediately.
+            clear_cache: Whether to clear KV and prefix caches after draining.
+        """
+        ...
+
+    @abstractmethod
+    async def resume_generation(self) -> None:
+        """Resume accepting new generation/encoding requests."""
+        ...
+
+    @abstractmethod
+    async def is_paused(self) -> bool:
+        """Return ``True`` if the engine is currently paused, else ``False``."""
+        ...
+
    async def scale_elastic_ep(
        self, new_data_parallel_size: int, drain_timeout: int = 300
    ) -> None: