[Feature] limit thinking tokens (hard limit) (#20859)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Signed-off-by: Sungjae Lee <sung-jae.lee@navercorp.com> Signed-off-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
87
tests/v1/entrypoints/openai/test_thinking_token_budget.py
Normal file
87
tests/v1/entrypoints/openai/test_thinking_token_budget.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""E2E tests for thinking_token_budget with reasoning models."""
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
# Small reasoning-capable model; kept tiny so the E2E server starts quickly.
MODEL_NAME = "Qwen/Qwen3-0.6B"
# Single-turn chat prompt shared by every request in this module.
MESSAGES = [{"role": "user", "content": "What is 1+1? Be concise."}]
# Hard cap on reasoning ("thinking") tokens, passed via extra_body below.
THINK_BUDGET = 5
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start one vLLM OpenAI-compatible server for the whole test module.

    The server is configured with the qwen3 reasoning parser and explicit
    think-tag delimiters so reasoning output can be separated from content.
    """
    cli_args = [
        "--reasoning-parser", "qwen3",
        "--reasoning-config",
        '{"think_start_str": "<think>", "think_end_str": "</think>"}',
        "--max-model-len", "2048",
        "--enforce-eager",
        "--no-async-scheduling",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as srv:
        yield srv
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client connected to the module-scoped server."""
    async with server.get_async_client() as c:
        yield c
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
    """Test that mixed requests (some with thinking_token_budget, some without)
    complete successfully without errors."""
    # First request carries a budget, second one does not — same order as a
    # mixed batch arriving at the server.
    for extra in ({"thinking_token_budget": THINK_BUDGET}, None):
        request_kwargs = dict(
            model=MODEL_NAME,
            messages=MESSAGES,
            max_tokens=100,
        )
        if extra is not None:
            request_kwargs["extra_body"] = extra
        response = await client.chat.completions.create(**request_kwargs)
        message = response.choices[0].message
        # Each reply must contain either visible content or a reasoning trace.
        assert message.content or getattr(message, "reasoning", None)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
    """Test that thinking_token_budget limits the number of reasoning tokens.

    In streaming mode each reasoning delta corresponds to one token, so
    counting chunks whose delta carries a non-empty reasoning field gives
    the exact reasoning-token count.
    """
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
        stream=True,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )

    seen_reasoning_tokens = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        # NOTE(review): the delta attribute here is "reasoning" (not
        # "reasoning_content") — confirm this matches the server's schema.
        if getattr(delta, "reasoning", None):
            seen_reasoning_tokens += 1

    assert seen_reasoning_tokens == THINK_BUDGET, (
        f"reasoning tokens ({seen_reasoning_tokens}) != "
        f"thinking_token_budget ({THINK_BUDGET})"
    )
|
||||
Reference in New Issue
Block a user