[RL] Add Pause and Resume Generation for Asynchronous RL Training (#28037)

Signed-off-by: SamitHuang <285365963@qq.com>
Signed-off-by: Samit <285365963@qq.com>
Signed-off-by: samithuang <285365963@qq.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
Samit
2025-11-20 19:01:03 +08:00
committed by GitHub
parent c9e093116c
commit 371b1d4c61
4 changed files with 182 additions and 0 deletions

View File

@@ -149,6 +149,33 @@ class EngineClient(ABC):
"""Load a new LoRA adapter into the engine for future requests."""
...
@abstractmethod
async def pause_generation(
    self,
    *,
    wait_for_inflight_requests: bool = False,
    clear_cache: bool = True,
) -> None:
    """Stop taking new generation/encoding requests.

    Args:
        wait_for_inflight_requests: If ``True``, in-flight requests are
            allowed to finish before the pause takes effect. If ``False``
            (the default), in-flight requests are aborted immediately.
        clear_cache: Whether the KV and prefix caches are cleared after
            draining.
    """
    ...
@abstractmethod
async def resume_generation(self) -> None:
    """Start accepting generation/encoding requests again."""
    ...
@abstractmethod
async def is_paused(self) -> bool:
    """Report whether the engine is currently paused.

    Returns:
        ``True`` if the engine is currently paused, otherwise ``False``.
    """
    ...
async def scale_elastic_ep(
self, new_data_parallel_size: int, drain_timeout: int = 300
) -> None: