diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index db2bc16ff..411c7ba48 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -9,6 +9,7 @@ from tests.reasoning.utils import (
run_reasoning_extraction,
run_reasoning_extraction_streaming,
)
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "qwen3"
@@ -58,12 +59,14 @@ WITH_THINK_STREAM = {
"content": "This is the rest",
}
-# --- No think tokens at all (thinking disabled) ---
+# --- No think tokens at all (thinking enabled, truncated) ---
+# With thinking enabled (default), no think tokens means the output was
+# truncated before </think> could be generated. All output is reasoning.
WITHOUT_THINK = {
"output": "This is the rest",
- "reasoning": None,
- "content": "This is the rest",
+ "reasoning": "This is the rest",
+ "content": None,
}
# In streaming, the parser cannot distinguish "thinking disabled" from
# "reasoning in progress" when no think tokens have appeared yet.
@@ -87,10 +90,12 @@ MULTILINE_REASONING = {
"reasoning": "This is a reasoning\nsection",
"content": "This is the rest\nThat",
}
+# Truncated output: <think> present but no </think> (thinking enabled).
+# Everything is reasoning because the output was cut off mid-thought.
ONLY_OPEN_TAG = {
"output": "This is a reasoning section",
- "reasoning": None,
- "content": "This is a reasoning section",
+ "reasoning": "This is a reasoning section",
+ "content": None,
}
ONLY_OPEN_TAG_STREAM = {
@@ -99,6 +104,20 @@ ONLY_OPEN_TAG_STREAM = {
"content": None,
}
+# Truncated output without a <think> prefix (Qwen3.5 style, where
+# <think> is in the prompt). No </think> means truncation; all is reasoning.
+TRUNCATED_NO_START_TOKEN = {
+ "output": "This is a reasoning section",
+ "reasoning": "This is a reasoning section",
+ "content": None,
+}
+
+TRUNCATED_NO_START_TOKEN_STREAM = {
+ "output": "This is a reasoning section",
+ "reasoning": "This is a reasoning section",
+ "content": None,
+}
+
TEST_CASES = [
pytest.param(
False,
@@ -170,6 +189,16 @@ TEST_CASES = [
ONLY_OPEN_TAG_STREAM,
id="only_open_tag_stream",
),
+ pytest.param(
+ False,
+ TRUNCATED_NO_START_TOKEN,
+ id="truncated_no_start_token",
+ ),
+ pytest.param(
+ True,
+ TRUNCATED_NO_START_TOKEN_STREAM,
+ id="truncated_no_start_token_stream",
+ ),
]
@@ -249,3 +278,46 @@ def test_reasoning_streaming_multi_token_deltas(
assert reconstructor.reasoning == expected_reasoning
assert (reconstructor.other_content or None) == expected_content
+
+
+# --- Tests for enable_thinking=False (thinking explicitly disabled) ---
+
+
+THINKING_DISABLED_CASES = [
+ pytest.param(
+ "This is plain content",
+ None,
+ "This is plain content",
+ id="thinking_disabled_plain_content",
+ ),
+ pytest.param(
+ "Some output without think tokens",
+ None,
+ "Some output without think tokens",
+ id="thinking_disabled_no_think_tokens",
+ ),
+]
+
+
+@pytest.mark.parametrize(
+ "output, expected_reasoning, expected_content", THINKING_DISABLED_CASES
+)
+def test_reasoning_thinking_disabled(
+ output: str,
+ expected_reasoning: str | None,
+ expected_content: str | None,
+ qwen3_tokenizer,
+):
+ """When enable_thinking=False, output without is all content."""
+ parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+ qwen3_tokenizer,
+ chat_template_kwargs={"enable_thinking": False},
+ )
+
+ reasoning, content = parser.extract_reasoning(
+ model_output=output,
+ request=ChatCompletionRequest(messages=[], model="test-model"),
+ )
+
+ assert reasoning == expected_reasoning
+ assert content == expected_content
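
Taken together, the non-streaming cases above pin down a small decision table. The following is a minimal standalone sketch of that table, illustrative only: `THINK_END` and `split_reasoning` are made-up names for this example, not vLLM attributes, and the leading-`<think>` stripping the real parser performs first is omitted.

```python
# Illustrative sketch of the non-streaming decision table exercised by
# the tests above. Names here (THINK_END, split_reasoning) are invented
# for the example; the real parser also strips a leading <think> first.
THINK_END = "</think>"

def split_reasoning(output: str, thinking_enabled: bool = True):
    """Return (reasoning, content) for a completed Qwen3-style output."""
    if THINK_END not in output:
        if not thinking_enabled:
            # enable_thinking=False: no reasoning was requested, so the
            # whole output is final content.
            return None, output
        # Thinking is on but </think> never appeared: generation was
        # truncated mid-thought, so the whole output is reasoning.
        return output, None
    reasoning, _, content = output.partition(THINK_END)
    return reasoning, content or None

assert split_reasoning("partial thought") == ("partial thought", None)
assert split_reasoning("answer", thinking_enabled=False) == (None, "answer")
assert split_reasoning("why</think>because") == ("why", "because")
```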
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 0c09d4099..df7b22a91 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -11,6 +11,7 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tokenizers import TokenizerLike
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
@@ -33,6 +34,14 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
it is stripped before extraction (non-streaming) or skipped (streaming).
"""
+ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+ super().__init__(tokenizer, *args, **kwargs)
+
+ chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+ # Qwen3 defaults to thinking enabled; only treat output as
+ # pure content when the user explicitly disables it.
+ self.thinking_enabled = chat_kwargs.get("enable_thinking", True)
+
@property
def start_token(self) -> str:
"""The token that starts reasoning content."""
@@ -54,8 +63,11 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
If <think> is present (e.g. from a different template), it is
stripped before extraction.
- When thinking is disabled (no </think> in output), returns
- (None, model_output) to indicate all output is content.
+ When thinking is explicitly disabled and no </think> appears,
+ returns (None, model_output); all output is content.
+ Otherwise (thinking enabled, default), a missing </think> means
+ the output was truncated and everything is reasoning:
+ returns (model_output, None).
Returns:
tuple[Optional[str], Optional[str]]: reasoning content and content
@@ -68,9 +80,12 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
)
if self.end_token not in model_output:
- # No end token means thinking is disabled or the model
- # did not produce reasoning. Treat everything as content.
- return None, model_output
+ if not self.thinking_enabled:
+ # Thinking explicitly disabled — treat everything as content.
+ return None, model_output
+ # Thinking enabled but no </think>: output was truncated.
+ # Everything generated so far is reasoning.
+ return model_output, None
# Extract reasoning content from the model output.
reasoning, _, content = model_output.partition(self.end_token)
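
For how `enable_thinking` reaches the parser in practice, a minimal construction sketch follows, mirroring the new test. The tokenizer source (`transformers.AutoTokenizer` and the `Qwen/Qwen3-0.6B` checkpoint) is an assumption for illustration; only the registry lookup and the `chat_template_kwargs` plumbing come from this diff.

```python
# Usage sketch for the new thinking_enabled flag. Tokenizer setup is an
# assumption for illustration; the parser calls mirror the test above.
from transformers import AutoTokenizer

from vllm.reasoning import ReasoningParserManager

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
parser_cls = ReasoningParserManager.get_reasoning_parser("qwen3")

# Default construction: thinking stays enabled, so output with no
# </think> is classified as truncated reasoning.
parser = parser_cls(tokenizer)
assert parser.thinking_enabled

# Forwarding the same chat_template_kwargs the server would pass flips
# the parser into "all content" mode for token-free output.
parser = parser_cls(tokenizer, chat_template_kwargs={"enable_thinking": False})
assert not parser.thinking_enabled
```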