[Feat][RL] Pause and Resume with keep requests for single engine (#32351)

Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
Aaron Hao
2026-02-06 16:08:58 -08:00
committed by GitHub
parent 4a2d00eafd
commit 89a385d79f
8 changed files with 536 additions and 30 deletions

View File

@@ -3,7 +3,7 @@
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator, Iterable, Mapping
from typing import Any
from typing import TYPE_CHECKING, Any
from vllm.config import ModelConfig, VllmConfig
from vllm.distributed.weight_transfer.base import (
@@ -22,6 +22,9 @@ from vllm.tasks import SupportedTask
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.input_processor import InputProcessor
if TYPE_CHECKING:
from vllm.v1.engine import PauseMode
class EngineClient(ABC):
"""Protocol class for Clients to Engine"""
@@ -158,16 +161,22 @@ class EngineClient(ABC):
async def pause_generation(
self,
*,
mode: "PauseMode" = "abort",
wait_for_inflight_requests: bool = False,
clear_cache: bool = True,
) -> None:
"""Pause new generation/encoding requests.
Args:
wait_for_inflight_requests: When ``True`` waits for in-flight requests
to finish before pausing. When ``False`` (default), aborts in-flight
requests immediately.
clear_cache: Whether to clear KV and prefix caches after draining.
mode: How to handle in-flight requests:
- ``"abort"``: Abort all in-flight requests immediately
and return partial results with "abort" reason (default).
- ``"wait"``: Wait for in-flight requests to complete.
- ``"keep"``: Freeze requests in queue; they resume on
:meth:`resume_generation`.
wait_for_inflight_requests: DEPRECATED. Use ``mode="wait"`` instead.
clear_cache: DEPRECATED. Whether to clear KV and prefix caches
after draining.
"""
...