[Feat][RL] Pause and Resume with keep requests for single engine (#32351)

Signed-off-by: ahao-anyscale <ahao@anyscale.com> Signed-off-by: Aaron Hao <ahao@anyscale.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-06 16:08:58 -08:00
parent 4a2d00eafd
commit 89a385d79f
8 changed files with 536 additions and 30 deletions
--- a/examples/offline_inference/pause_resume.py
+++ b/examples/offline_inference/pause_resume.py
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test for pause/resume with keep mode.
+
+This test uses concurrent tasks to verify the engine truly stops generating
+during pause:
+1. Generator task: streams tokens and logs each arrival time since the first
+2. Controller task: sends pause/resume commands
+
+If the engine properly pauses, we should see a gap in token timestamps
+matching the pause duration.
+"""
+
+import asyncio
+import time
+
+from vllm import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.v1.engine.async_llm import AsyncLLM
+
+PAUSE_DURATION = 3.0  # seconds to sleep between pause and resume
+
+
+async def main():
+    """Pause a streaming request in keep mode and verify the token gap.
+
+    Raises AssertionError when no token follows the pause, the gap is under 90%
+    of PAUSE_DURATION, or the request is unfinished or lacks all 30 tokens.
+    """
+    # Create engine with a small model
+    engine_args = AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True)
+    engine = AsyncLLM.from_engine_args(engine_args)
+
+    prompt = "Write a story about a dragon. Once upon a time"
+    sampling_params = SamplingParams(max_tokens=30, ignore_eos=True)
+
+    # Track token arrival times
+    token_times: list[tuple[int, float]] = []  # (token_count, timestamp)
+    pause_token_idx: int = 0  # Index in token_times when pause occurred
+
+    async def generator_task():
+        """Generate tokens and record timestamps."""
+        async for output in engine.generate(
+            request_id="test-req",
+            prompt=prompt,
+            sampling_params=sampling_params,
+        ):
+            token_count = len(output.outputs[0].token_ids)
+            token_times.append((token_count, time.monotonic()))
+            print(
+                f"Token {token_count} arrived: "
+                f"T={token_times[-1][1] - token_times[0][1]:.3f}s"
+            )
+        return output
+
+    async def controller_task():
+        """Pause and resume the engine after some tokens generated."""
+        nonlocal pause_token_idx
+
+        # Wait for some tokens to be generated
+        while len(token_times) < 5:
+            await asyncio.sleep(0.01)
+
+        print(f"\nPausing engine (keep mode) at token {len(token_times)}")
+        await engine.pause_generation(mode="keep")
+        pause_token_idx = len(token_times)
+        print(f"Paused! Sleeping for {PAUSE_DURATION}s...")
+
+        # Sleep while paused - no tokens should be generated during this time
+        await asyncio.sleep(PAUSE_DURATION)
+
+        print("Resuming engine...")
+        await engine.resume_generation()
+        print("Resumed!\n")
+
+    # Run both tasks concurrently
+    gen_task = asyncio.create_task(generator_task())
+    ctrl_task = asyncio.create_task(controller_task())
+    try:
+        final_output, _ = await asyncio.gather(gen_task, ctrl_task)
+    finally:
+        # Shut down here so a failed check below cannot leak the engine.
+        engine.shutdown()
+
+    # Verify the pause actually stopped generation.
+    # The gap after the pause token should be approximately the sleep duration.
+    if pause_token_idx >= len(token_times):
+        raise AssertionError("No token arrived after the pause")
+    pause_gap = token_times[pause_token_idx][1] - token_times[pause_token_idx - 1][1]
+    print(
+        f"\nGap after pause (token {pause_token_idx - 1} -> {pause_token_idx}): "
+        f"{pause_gap:.3f}s"
+    )
+    if pause_gap < PAUSE_DURATION * 0.9:
+        print(
+            f"✗ Test failed! Expected ~{PAUSE_DURATION}s gap after pause, "
+            f"got {pause_gap:.3f}s"
+        )
+        raise AssertionError("Engine did not properly pause")
+    print(f"✓ Test passed! Engine paused for ~{pause_gap:.1f}s")
+
+    # Verify request completed
+    assert final_output.finished, "Request should have finished"
+    assert len(final_output.outputs[0].token_ids) == 30, "Should have all tokens"
+
+
+if __name__ == "__main__":
+    asyncio.run(main())  # run the async example to completion