diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index db2bc16ff..411c7ba48 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -9,6 +9,7 @@ from tests.reasoning.utils import (
     run_reasoning_extraction,
     run_reasoning_extraction_streaming,
 )
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 
 parser_name = "qwen3"
@@ -58,12 +59,14 @@ WITH_THINK_STREAM = {
     "content": "This is the rest",
 }
 
-# --- No think tokens at all (thinking disabled) ---
+# --- No think tokens at all (thinking enabled, truncated) ---
+# With thinking enabled (default), no think tokens means the output was
+# truncated before </think> could be generated. All output is reasoning.
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning": None,
-    "content": "This is the rest",
+    "reasoning": "This is the rest",
+    "content": None,
 }
 
 # In streaming, the parser cannot distinguish "thinking disabled" from
 # "reasoning in progress" when no think tokens have appeared yet.
@@ -87,10 +90,12 @@ MULTILINE_REASONING = {
     "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
 }
+# Truncated output: <think> present but no </think> (thinking enabled).
+# Everything is reasoning because the output was cut off mid-thought.
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning": None,
-    "content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
 }
 
 ONLY_OPEN_TAG_STREAM = {
@@ -99,6 +104,20 @@ ONLY_OPEN_TAG_STREAM = {
     "content": None,
 }
 
+# Truncated output without <think> prefix (Qwen3.5 style where
+# <think> is in the prompt). No </think> means truncation — all is reasoning.
+TRUNCATED_NO_START_TOKEN = {
+    "output": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
+TRUNCATED_NO_START_TOKEN_STREAM = {
+    "output": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
 TEST_CASES = [
     pytest.param(
         False,
@@ -170,6 +189,16 @@
         ONLY_OPEN_TAG_STREAM,
         id="only_open_tag_stream",
     ),
+    pytest.param(
+        False,
+        TRUNCATED_NO_START_TOKEN,
+        id="truncated_no_start_token",
+    ),
+    pytest.param(
+        True,
+        TRUNCATED_NO_START_TOKEN_STREAM,
+        id="truncated_no_start_token_stream",
+    ),
 ]
 
 
@@ -249,3 +278,46 @@ def test_reasoning_streaming_multi_token_deltas(
 
     assert reconstructor.reasoning == expected_reasoning
     assert (reconstructor.other_content or None) == expected_content
+
+
+# --- Tests for enable_thinking=False (thinking explicitly disabled) ---
+
+
+THINKING_DISABLED_CASES = [
+    pytest.param(
+        "This is plain content",
+        None,
+        "This is plain content",
+        id="thinking_disabled_plain_content",
+    ),
+    pytest.param(
+        "Some output without think tokens",
+        None,
+        "Some output without think tokens",
+        id="thinking_disabled_no_think_tokens",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "output, expected_reasoning, expected_content", THINKING_DISABLED_CASES
+)
+def test_reasoning_thinking_disabled(
+    output: str,
+    expected_reasoning: str | None,
+    expected_content: str | None,
+    qwen3_tokenizer,
+):
+    """When enable_thinking=False, output without <think> is all content."""
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        qwen3_tokenizer,
+        chat_template_kwargs={"enable_thinking": False},
+    )
+
+    reasoning, content = parser.extract_reasoning(
+        model_output=output,
+        request=ChatCompletionRequest(messages=[], model="test-model"),
+    )
+
+    assert reasoning == expected_reasoning
+    assert content == expected_content
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 0c09d4099..df7b22a91 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -11,6 +11,7 @@ from vllm.entrypoints.openai.responses.protocol import (
     ResponsesRequest,
 )
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tokenizers import TokenizerLike
 
 
 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
@@ -33,6 +34,14 @@
     it is stripped before extraction (non-streaming) or skipped (streaming).
     """
 
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+        # Qwen3 defaults to thinking enabled; only treat output as
+        # pure content when the user explicitly disables it.
+        self.thinking_enabled = chat_kwargs.get("enable_thinking", True)
+
     @property
     def start_token(self) -> str:
         """The token that starts reasoning content."""
@@ -54,8 +63,11 @@
         If <think> is present (e.g. from a different template),
         it is stripped before extraction.
 
-        When thinking is disabled (no </think> in output), returns
-        (None, model_output) to indicate all output is content.
+        When thinking is explicitly disabled and no </think> appears,
+        returns (None, model_output) — all output is content.
+        Otherwise (thinking enabled, default), a missing </think> means
+        the output was truncated and everything is reasoning:
+        returns (model_output, None).
 
         Returns:
             tuple[Optional[str], Optional[str]]: reasoning content and content
@@ -68,9 +80,12 @@
         )
 
         if self.end_token not in model_output:
-            # No end token means thinking is disabled or the model
-            # did not produce reasoning. Treat everything as content.
-            return None, model_output
+            if not self.thinking_enabled:
+                # Thinking explicitly disabled — treat everything as content.
+                return None, model_output
+            # Thinking enabled but no </think>: output was truncated.
+            # Everything generated so far is reasoning.
+            return model_output, None
 
         # Extract reasoning content from the model output.
         reasoning, _, content = model_output.partition(self.end_token)
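
For reviewers, a standalone sketch of the decision table this patch implements. Illustrative only: split_reasoning is a hypothetical helper that mirrors the patched extract_reasoning branch; it is not vLLM code, and the string literals echo the test fixtures above.

# Hypothetical mirror of the patched branch logic (not vLLM code).
def split_reasoning(
    output: str, thinking_enabled: bool, end_token: str = "</think>"
) -> tuple[str | None, str | None]:
    if end_token not in output:
        if not thinking_enabled:
            # enable_thinking=False: no think tokens expected, all content.
            return None, output
        # Thinking enabled but truncated before </think>: all reasoning.
        return output, None
    reasoning, _, content = output.partition(end_token)
    return reasoning, content

# Same expectations as WITHOUT_THINK and THINKING_DISABLED_CASES above.
assert split_reasoning("This is the rest", True) == ("This is the rest", None)
assert split_reasoning("This is plain content", False) == (None, "This is plain content")
assert split_reasoning("abc</think>xyz", True) == ("abc", "xyz")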