# Files
# vllm/tests/v1/entrypoints/openai/test_thinking_token_budget.py
#
# 88 lines
# 2.6 KiB
# Python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""E2E tests for thinking_token_budget with reasoning models."""
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
# Small reasoning-capable model; keeps the E2E server cheap to launch.
MODEL_NAME = "Qwen/Qwen3-0.6B"
# Single short user turn so completions stay fast in CI.
MESSAGES = [{"role": "user", "content": "What is 1+1? Be concise."}]
# Deliberately tiny budget so the reasoning cutoff is easy to observe.
THINK_BUDGET = 5
@pytest.fixture(scope="module")
def server():
    """Launch one vLLM OpenAI-compatible server with reasoning enabled.

    Module-scoped so every test in this file shares a single server
    process instead of paying the startup cost per test.
    """
    cli_args = [
        "--reasoning-parser",
        "qwen3",
        "--reasoning-config",
        '{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}',
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--no-async-scheduling",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as api_client:
        yield api_client
@pytest.mark.asyncio
async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
    """Test that mixed requests (some with thinking_token_budget, some without)
    complete successfully without errors."""
    budgeted = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )
    unbudgeted = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
    )
    # Both requests must succeed and produce either visible content or
    # reasoning output (NOTE(review): attribute is `reasoning` here, while
    # the streaming test's docstring mentions `reasoning_content` — confirm
    # which field name this server version emits).
    for response in (budgeted, unbudgeted):
        message = response.choices[0].message
        assert message.content or getattr(message, "reasoning", None)
@pytest.mark.asyncio
async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
    """Test that thinking_token_budget limits the number of reasoning tokens.

    In streaming mode each reasoning delta corresponds to one token, so
    counting non-empty reasoning_content chunks gives the exact token count.
    """
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
        stream=True,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )
    # One non-empty reasoning delta per token (see docstring).
    observed = 0
    async for chunk in stream:
        if getattr(chunk.choices[0].delta, "reasoning", None):
            observed += 1
    assert observed == THINK_BUDGET, (
        f"reasoning tokens ({observed}) != "
        f"thinking_token_budget ({THINK_BUDGET})"
    )