examples/offline_inference/pause_resume.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test for pause/resume with keep mode.

This test uses concurrent tasks to verify the engine truly stops generating
during pause:
1. Generator task: continuously generates and logs time between tokens
2. Controller task: sends pause/resume commands

If the engine properly pauses, we should see a gap in token timestamps
matching the pause duration.
"""

import asyncio
import time

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM

PAUSE_DURATION = 3.0  # seconds


async def main():
    # Create engine with a small model
    engine_args = AsyncEngineArgs(
        model="facebook/opt-125m",
        enforce_eager=True,
    )
    engine = AsyncLLM.from_engine_args(engine_args)

    prompt = "Write a story about a dragon. Once upon a time"
    sampling_params = SamplingParams(max_tokens=30, ignore_eos=True)

    # Track token arrival times
    token_times: list[tuple[int, float]] = []  # (token_count, timestamp)
    pause_time: float = 0
    resume_time: float = 0
    pause_token_idx: int = 0  # Index in token_times when pause occurred

    async def generator_task():
        """Generate tokens and record timestamps."""
        async for output in engine.generate(
            request_id="test-req",
            prompt=prompt,
            sampling_params=sampling_params,
        ):
            token_count = len(output.outputs[0].token_ids)
            token_times.append((token_count, time.monotonic()))
            print(
                f"Token {token_count} arrived:"
                f"T={token_times[-1][1] - token_times[0][1]:.3f}s"
            )
        return output

    async def controller_task():
        """Pause and resume the engine after some tokens generated."""
        nonlocal pause_time, resume_time, pause_token_idx

        # Wait for some tokens to be generated
        while len(token_times) < 5:
            await asyncio.sleep(0.01)

        print(f"\nPausing engine (keep mode) at token {len(token_times)}")
        pause_time = time.monotonic()
        await engine.pause_generation(mode="keep")
        pause_token_idx = len(token_times)
        print(f"Paused! Sleeping for {PAUSE_DURATION}s...")

        # Sleep while paused - no tokens should be generated during this time
        await asyncio.sleep(PAUSE_DURATION)

        print("Resuming engine...")
        await engine.resume_generation()
        resume_time = time.monotonic()
        print("Resumed!\n")

    # Run both tasks concurrently
    gen_task = asyncio.create_task(generator_task())
    ctrl_task = asyncio.create_task(controller_task())

    final_output, _ = await asyncio.gather(gen_task, ctrl_task)

    # Verify the pause actually stopped generation.
    # The gap after the pause token should be approximately the sleep duration.
    pause_gap = token_times[pause_token_idx][1] - token_times[pause_token_idx - 1][1]
    print(
        f"\nGap after pause (token {pause_token_idx - 1} -> {pause_token_idx}): "
        f"{pause_gap:.3f}s"
    )
    if pause_gap >= PAUSE_DURATION * 0.9:
        print(f"✓ Test passed! Engine paused for ~{pause_gap:.1f}s")
    else:
        print(
            f"✗ Test failed! Expected ~{PAUSE_DURATION}s gap after pause, "
            f"got {pause_gap:.3f}s"
        )
        raise AssertionError("Engine did not properly pause")

    # Verify request completed
    assert final_output.finished, "Request should have finished"
    assert len(final_output.outputs[0].token_ids) == 30, "Should have all tokens"

    engine.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
[Feat][RL] Pause and Resume with keep requests for single engine (#32351) Signed-off-by: ahao-anyscale <ahao@anyscale.com> Signed-off-by: Aaron Hao <ahao@anyscale.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> 2026-02-06 16:08:58 -08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`"""Test for pause/resume with keep mode.`

			`This test uses concurrent tasks to verify the engine truly stops generating`
			`during pause:`
			`1. Generator task: continuously generates and logs time between tokens`
			`2. Controller task: sends pause/resume commands`

			`If the engine properly pauses, we should see a gap in token timestamps`
			`matching the pause duration.`
			`"""`

			`import asyncio`
			`import time`

			`from vllm import SamplingParams`
			`from vllm.engine.arg_utils import AsyncEngineArgs`
			`from vllm.v1.engine.async_llm import AsyncLLM`

			`PAUSE_DURATION = 3.0 # seconds`


			`async def main():`
			`# Create engine with a small model`
			`engine_args = AsyncEngineArgs(`
			`model="facebook/opt-125m",`
			`enforce_eager=True,`
			`)`
			`engine = AsyncLLM.from_engine_args(engine_args)`

			`prompt = "Write a story about a dragon. Once upon a time"`
			`sampling_params = SamplingParams(max_tokens=30, ignore_eos=True)`

			`# Track token arrival times`
			`token_times: list[tuple[int, float]] = [] # (token_count, timestamp)`
			`pause_time: float = 0`
			`resume_time: float = 0`
			`pause_token_idx: int = 0 # Index in token_times when pause occurred`

			`async def generator_task():`
			`"""Generate tokens and record timestamps."""`
			`async for output in engine.generate(`
			`request_id="test-req",`
			`prompt=prompt,`
			`sampling_params=sampling_params,`
			`):`
			`token_count = len(output.outputs[0].token_ids)`
			`token_times.append((token_count, time.monotonic()))`
			`print(`
			`f"Token {token_count} arrived:"`
			`f"T={token_times[-1][1] - token_times[0][1]:.3f}s"`
			`)`
			`return output`

			`async def controller_task():`
			`"""Pause and resume the engine after some tokens generated."""`
			`nonlocal pause_time, resume_time, pause_token_idx`

			`# Wait for some tokens to be generated`
			`while len(token_times) < 5:`
			`await asyncio.sleep(0.01)`

			`print(f"\nPausing engine (keep mode) at token {len(token_times)}")`
			`pause_time = time.monotonic()`
			`await engine.pause_generation(mode="keep")`
			`pause_token_idx = len(token_times)`
			`print(f"Paused! Sleeping for {PAUSE_DURATION}s...")`

			`# Sleep while paused - no tokens should be generated during this time`
			`await asyncio.sleep(PAUSE_DURATION)`

			`print("Resuming engine...")`
			`await engine.resume_generation()`
			`resume_time = time.monotonic()`
			`print("Resumed!\n")`

			`# Run both tasks concurrently`
			`gen_task = asyncio.create_task(generator_task())`
			`ctrl_task = asyncio.create_task(controller_task())`

			`final_output, _ = await asyncio.gather(gen_task, ctrl_task)`

			`# Verify the pause actually stopped generation.`
			`# The gap after the pause token should be approximately the sleep duration.`
			`pause_gap = token_times[pause_token_idx][1] - token_times[pause_token_idx - 1][1]`
			`print(`
			`f"\nGap after pause (token {pause_token_idx - 1} -> {pause_token_idx}): "`
			`f"{pause_gap:.3f}s"`
			`)`
			`if pause_gap >= PAUSE_DURATION * 0.9:`
			`print(f"✓ Test passed! Engine paused for ~{pause_gap:.1f}s")`
			`else:`
			`print(`
			`f"✗ Test failed! Expected ~{PAUSE_DURATION}s gap after pause, "`
			`f"got {pause_gap:.3f}s"`
			`)`
			`raise AssertionError("Engine did not properly pause")`

			`# Verify request completed`
			`assert final_output.finished, "Request should have finished"`
			`assert len(final_output.outputs[0].token_ids) == 30, "Should have all tokens"`

			`engine.shutdown()`


			`if __name__ == "__main__":`
			`asyncio.run(main())`