[Feature] limit thinking tokens (hard limit) (#20859)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Signed-off-by: Sungjae Lee <sung-jae.lee@navercorp.com> Signed-off-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
87
tests/v1/entrypoints/openai/test_thinking_token_budget.py
Normal file
87
tests/v1/entrypoints/openai/test_thinking_token_budget.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""E2E tests for thinking_token_budget with reasoning models."""
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
# Small reasoning-capable model; kept tiny so the E2E server starts quickly.
MODEL_NAME = "Qwen/Qwen3-0.6B"
# Single-turn chat prompt shared by every request in this module.
MESSAGES = [{"role": "user", "content": "What is 1+1? Be concise."}]
# Hard cap on reasoning ("thinking") tokens, passed via extra_body below.
THINK_BUDGET = 5
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start one vLLM OpenAI-compatible server for the whole test module.

    The server is configured with the qwen3 reasoning parser and explicit
    think-tag delimiters so reasoning output can be separated from content.
    """
    cli_args = [
        "--reasoning-parser", "qwen3",
        "--reasoning-config",
        '{"think_start_str": "<think>", "think_end_str": "</think>"}',
        "--max-model-len", "2048",
        "--enforce-eager",
        "--no-async-scheduling",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as srv:
        yield srv
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client connected to the module-scoped server."""
    async with server.get_async_client() as c:
        yield c
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
    """Test that mixed requests (some with thinking_token_budget, some without)
    complete successfully without errors."""
    # First request carries a budget, second one does not — same order as a
    # mixed batch arriving at the server.
    for extra in ({"thinking_token_budget": THINK_BUDGET}, None):
        request_kwargs = dict(
            model=MODEL_NAME,
            messages=MESSAGES,
            max_tokens=100,
        )
        if extra is not None:
            request_kwargs["extra_body"] = extra
        response = await client.chat.completions.create(**request_kwargs)
        message = response.choices[0].message
        # Each reply must contain either visible content or a reasoning trace.
        assert message.content or getattr(message, "reasoning", None)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
    """Test that thinking_token_budget limits the number of reasoning tokens.

    In streaming mode each reasoning delta corresponds to one token, so
    counting chunks whose delta carries a non-empty reasoning field gives
    the exact reasoning-token count.
    """
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
        stream=True,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )

    seen_reasoning_tokens = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        # NOTE(review): the delta attribute here is "reasoning" (not
        # "reasoning_content") — confirm this matches the server's schema.
        if getattr(delta, "reasoning", None):
            seen_reasoning_tokens += 1

    assert seen_reasoning_tokens == THINK_BUDGET, (
        f"reasoning tokens ({seen_reasoning_tokens}) != "
        f"thinking_token_budget ({THINK_BUDGET})"
    )
|
||||
Reference in New Issue
Block a user