Files
vllm/tests/v1/entrypoints/openai/test_thinking_token_budget.py
Sungjae Lee 4731884796 [Feature] limit thinking tokens (hard limit) (#20859)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Signed-off-by: Sungjae Lee <sung-jae.lee@navercorp.com>
Signed-off-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-24 09:53:07 -07:00

88 lines
2.6 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""E2E tests for thinking_token_budget with reasoning models."""
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
# Model name passed both to the server fixture and to every request below.
MODEL_NAME = "Qwen/Qwen3-0.6B"

# Single-turn chat prompt shared by all requests in this module.
MESSAGES = [
    {
        "role": "user",
        "content": "What is 1+1? Be concise.",
    },
]

# Value sent as `thinking_token_budget` in the request's `extra_body`.
THINK_BUDGET = 5
@pytest.fixture(scope="module")
def server():
    """Launch one server for the whole module and yield it to the tests.

    ``RemoteOpenAIServer`` is entered as a context manager, configured with
    the ``qwen3`` reasoning parser and explicit ``<think>``/``</think>``
    delimiters. Because the fixture is module-scoped, the ``with`` block is
    only left when the fixture is torn down.
    """
    reasoning_config = '{"think_start_str": "<think>", "think_end_str": "</think>"}'
    # Flags are grouped as flag/value pairs; the order matches the CLI order
    # the server is started with.
    cli_args = [
        "--reasoning-parser", "qwen3",
        "--reasoning-config", reasoning_config,
        "--max-model-len", "2048",
        "--enforce-eager",
        "--no-async-scheduling",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client obtained from the running ``server``.

    The client is used as an async context manager, so the ``async with``
    block is exited once the requesting test has finished with it.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
    """Check that requests with and without a thinking budget both succeed.

    Two non-streaming chat completions are sent one after the other against
    the same server: the first carries ``thinking_token_budget`` in
    ``extra_body``, the second does not. Each response must contain either
    final content or reasoning text.
    """
    # Arguments common to both requests; only `extra_body` differs.
    shared_kwargs = {
        "model": MODEL_NAME,
        "messages": MESSAGES,
        "max_tokens": 100,
    }

    budgeted = await client.chat.completions.create(
        **shared_kwargs,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )
    unbudgeted = await client.chat.completions.create(**shared_kwargs)

    budgeted_msg = budgeted.choices[0].message
    unbudgeted_msg = unbudgeted.choices[0].message

    # `reasoning` is looked up with getattr because the attribute may be
    # absent from the message object.
    assert budgeted_msg.content or getattr(budgeted_msg, "reasoning", None)
    assert unbudgeted_msg.content or getattr(unbudgeted_msg, "reasoning", None)
@pytest.mark.asyncio
async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
    """Check that the streamed reasoning length equals the thinking budget.

    The request is streamed and every chunk whose delta carries a non-empty
    ``reasoning`` attribute is counted. The test relies on the server emitting
    one reasoning delta per generated token, so the number of such chunks is
    compared for equality with ``THINK_BUDGET``.
    """
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
        stream=True,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )

    # Keep only the chunks that carry reasoning text; deltas without a
    # `reasoning` attribute (or with an empty one) are skipped.
    reasoning_chunks = [
        chunk
        async for chunk in stream
        if getattr(chunk.choices[0].delta, "reasoning", None)
    ]
    reasoning_token_count = len(reasoning_chunks)

    assert reasoning_token_count == THINK_BUDGET, (
        f"reasoning tokens ({reasoning_token_count}) != "
        f"thinking_token_budget ({THINK_BUDGET})"
    )