[Bugfix] Fix Qwen3/Qwen3.5 Reasoning Parser (#34779)

Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-21 23:15:35 -08:00
parent 2cbf9656ce
commit 40f88d8318
3 changed files with 233 additions and 47 deletions
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -4,46 +4,79 @@
 import pytest
 from transformers import AutoTokenizer

-from tests.reasoning.utils import run_reasoning_extraction
+from tests.reasoning.utils import (
+    StreamingReasoningReconstructor,
+    run_reasoning_extraction,
+    run_reasoning_extraction_streaming,
+)
 from vllm.reasoning import ReasoningParser, ReasoningParserManager

 parser_name = "qwen3"
 start_token = "<think>"
 end_token = "</think>"

-REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B"
+REASONING_MODEL_NAMES = [
+    "Qwen/Qwen3-0.6B",
+    "Qwen/Qwen3.5-397B-A17B",
+    "Qwen/Qwen3-4B-Thinking-2507",
+]


-@pytest.fixture(scope="module")
-def qwen3_tokenizer():
-    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+@pytest.fixture(scope="module", params=REASONING_MODEL_NAMES)
+def qwen3_tokenizer(request):
+    return AutoTokenizer.from_pretrained(request.param)


-# 带 <think></think>，非stream
+# --- <think> in prompt, only </think> in output (typical) ---
+
+WITHOUT_START_TOKEN = {
+    "output": "This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+}
+WITHOUT_START_TOKEN_STREAM = {
+    "output": "This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+}
+WITHOUT_START_TOKEN_COMPLETE_REASONING = {
+    "output": "This is a reasoning section</think>",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
+# --- <think> present in output (old template / edge case) ---
+
 WITH_THINK = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
 }
-# 带 <think></think>，stream
 WITH_THINK_STREAM = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
 }
-# 不带 <think></think>，非stream
+
+# --- No think tokens at all (thinking disabled) ---
+
 WITHOUT_THINK = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
 }
-# 不带 <think></think>，stream
+# In streaming, the parser cannot distinguish "thinking disabled" from
+# "reasoning in progress" when no think tokens have appeared yet.
+# It assumes reasoning. The serving layer handles the "thinking disabled"
+# case by checking prompt_is_reasoning_end_arr before calling the parser.
 WITHOUT_THINK_STREAM = {
    "output": "This is the rest",
-    "reasoning": None,
-    "content": "This is the rest",
+    "reasoning": "This is the rest",
+    "content": None,
 }

+# --- Edge cases ---
+
 COMPLETE_REASONING = {
    "output": "<think>This is a reasoning section</think>",
    "reasoning": "This is a reasoning section",
@@ -57,7 +90,7 @@ MULTILINE_REASONING = {
 ONLY_OPEN_TAG = {
    "output": "<think>This is a reasoning section",
    "reasoning": None,
-    "content": "<think>This is a reasoning section",
+    "content": "This is a reasoning section",
 }

 ONLY_OPEN_TAG_STREAM = {
@@ -67,6 +100,26 @@ ONLY_OPEN_TAG_STREAM = {
 }

 TEST_CASES = [
+    pytest.param(
+        False,
+        WITHOUT_START_TOKEN,
+        id="without_start_token",
+    ),
+    pytest.param(
+        True,
+        WITHOUT_START_TOKEN_STREAM,
+        id="without_start_token_stream",
+    ),
+    pytest.param(
+        False,
+        WITHOUT_START_TOKEN_COMPLETE_REASONING,
+        id="without_start_token_complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        WITHOUT_START_TOKEN_COMPLETE_REASONING,
+        id="without_start_token_complete_reasoning_stream",
+    ),
    pytest.param(
        False,
        WITH_THINK,
@@ -140,3 +193,59 @@ def test_reasoning(

    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
+
+
+# Multi-token delta tests: simulate real-world streaming where a single
+# delta can contain multiple tokens (e.g., speculative decoding).
+MULTI_TOKEN_DELTA_CASES = [
+    pytest.param(
+        # <think> grouped with following text in one delta
+        ["<think>This is a reasoning section", "</think>", "This is the rest"],
+        "This is a reasoning section",
+        "This is the rest",
+        id="start_token_grouped_with_text",
+    ),
+    pytest.param(
+        # </think> grouped with following content in one delta
+        ["reasoning section", "</think>This is the rest"],
+        "reasoning section",
+        "This is the rest",
+        id="end_token_grouped_with_content",
+    ),
+    pytest.param(
+        # <think> and </think> in the same delta, no content after
+        ["<think>reasoning</think>"],
+        "reasoning",
+        None,
+        id="start_and_end_in_one_delta_no_content",
+    ),
+    pytest.param(
+        # No start token, end grouped with content (Qwen3.5 style)
+        ["reasoning section", "</think>content"],
+        "reasoning section",
+        "content",
+        id="no_start_end_grouped_with_content",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "deltas, expected_reasoning, expected_content", MULTI_TOKEN_DELTA_CASES
+)
+def test_reasoning_streaming_multi_token_deltas(
+    deltas: list[str],
+    expected_reasoning: str | None,
+    expected_content: str | None,
+    qwen3_tokenizer,
+):
+    """Check multi-token deltas split into the expected reasoning and content."""
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        qwen3_tokenizer
+    )
+
+    reconstructor: StreamingReasoningReconstructor = run_reasoning_extraction_streaming(
+        parser, deltas
+    )
+
+    assert reconstructor.reasoning == expected_reasoning
+    assert (reconstructor.other_content or None) == expected_content  # falsy -> None
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -900,6 +900,17 @@ class OpenAIServingChat(OpenAIServing):
                        harmony_tools_streamed[i] |= tools_streamed_flag
                    # handle streaming deltas for tools with named tool_choice
                    elif tool_choice_function_name:
+                        # When the think end id is in prompt_token_ids,
+                        # i.e. {"enable_thinking": False}, mark reasoning as
+                        # ended BEFORE calling the parser to avoid a spurious
+                        # reasoning delta on the first chunk.
+                        if (
+                            reasoning_parser
+                            and not reasoning_end_arr[i]
+                            and prompt_is_reasoning_end_arr[i]
+                        ):
+                            reasoning_end_arr[i] = True
+
                        if (
                            reasoning_parser
                            and not reasoning_end_arr[i]
@@ -918,16 +929,11 @@ class OpenAIServingChat(OpenAIServing):
                                    output.token_ids,
                                )
                            )
-                            # When encountering think end id in delta_token_ids
-                            # or think end id in prompt_token_ids
-                            # i.e {"enable_thinking": False},
+                            # When encountering think end id in delta_token_ids,
                            # set reasoning status to end.
                            # Only keep 'content', remove 'reasoning'.
-                            if (
-                                reasoning_parser.is_reasoning_end(
-                                    as_list(output.token_ids)
-                                )
-                                or prompt_is_reasoning_end_arr[i]
+                            if reasoning_parser.is_reasoning_end(
+                                as_list(output.token_ids)
                            ):
                                reasoning_end_arr[i] = True
                                if delta_message and delta_message.content:
@@ -1116,14 +1122,23 @@ class OpenAIServingChat(OpenAIServing):

                    # when only reasoning
                    elif reasoning_parser:
-                        delta_message = reasoning_parser.extract_reasoning_streaming(
-                            previous_text,
-                            current_text,
-                            delta_text,
-                            previous_token_ids,
-                            current_token_ids,
-                            output.token_ids,
-                        )
+                        # When the think end id is already in prompt_token_ids,
+                        # i.e. {"enable_thinking": False}, reasoning has ended
+                        # before generation starts, so bypass the parser and
+                        # route all generated tokens directly as content.
+                        if prompt_is_reasoning_end_arr[i]:
+                            delta_message = DeltaMessage(content=delta_text)
+                        else:
+                            delta_message = (
+                                reasoning_parser.extract_reasoning_streaming(
+                                    previous_text,
+                                    current_text,
+                                    delta_text,
+                                    previous_token_ids,
+                                    current_token_ids,
+                                    output.token_ids,
+                                )
+                            )
                    # handle streaming just a content delta
                    else:
                        delta_message = DeltaMessage(content=delta_text)
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Sequence
+
 from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
 )
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
 )
@@ -12,13 +15,22 @@ from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser

 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
-    Reasoning parser for the Qwen3 model.
+    Reasoning parser for the Qwen3/Qwen3.5 model family.

-    The Qwen3 model uses <think>...</think> tokens to denote reasoning text
-    within its output. The model provides a strict switch to disable reasoning
-    output via the 'enable_thinking=False' parameter. This parser extracts the
-    reasoning content enclosed by <think> and </think> tokens from the model's
-    output.
+    The Qwen3 model family uses <think>...</think> tokens to denote reasoning
+    text. Starting with Qwen3.5, the chat template places <think> in the
+    prompt so only </think> appears in the generated output. The model
+    provides a strict switch to disable reasoning output via the
+    'enable_thinking=False' parameter.
+
+    When thinking is disabled, the template places <think>\\n\\n</think>\\n\\n
+    in the prompt. The serving layer detects this via prompt_is_reasoning_end
+    and routes deltas as content without calling the streaming parser.
+
+    NOTE: Models up to the 2507 release (e.g., Qwen/Qwen3-235B-A22B-Instruct-2507)
+    use an older chat template where the model generates <think> itself.
+    This parser handles both styles: if <think> appears in the generated output
+    it is stripped before extraction (non-streaming) or skipped (streaming).
    """

    @property
@@ -37,31 +49,27 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        """
        Extract reasoning content from the model output.

-        Qwen3 has stricter requirements - it needs both start and end tokens
-        to be present, unlike other models that work with just the end token.
+        The <think> token is placed in the prompt by the chat template,
+        so typically only </think> appears in the generated output.
+        If <think> is present (e.g. from a different template), it is
+        stripped before extraction.

-        For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning
-        - 'xyz' goes to content
+        When thinking is disabled (no </think> in output), returns
+        (None, model_output) to indicate all output is content.

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """

-        # Check if the model output contains both <think> and </think> tokens.
-        if self.start_token not in model_output or self.end_token not in model_output:
-            return None, model_output
-
-        # Check if the <think> is present in the model output, remove it
-        # if it is present.
+        # Strip <think> if present in the generated output.
        model_output_parts = model_output.partition(self.start_token)
        model_output = (
            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
        )

-        # Check if the model output contains the </think> tokens.
-        # If the end token is not found, return the model output as is.
        if self.end_token not in model_output:
+            # No end token: thinking is disabled, or the output stopped
+            # before </think>. Treat everything as content.
            return None, model_output

        # Extract reasoning content from the model output.
@@ -69,3 +77,57 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):

        final_content = content or None
        return reasoning, final_content
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Split one streaming delta into reasoning and content.
+
+        <think> is normally placed in the prompt by the chat template, so
+        text before </think> is reasoning and text after it is content. If
+        the delta contains <think>, the tag and any delta text preceding it
+        are dropped first. When thinking is disabled, the serving layer is
+        expected to route deltas as content without calling this method.
+
+        Returns None when nothing remains to emit, or when the end token id
+        is in delta_token_ids but its text is not in delta_text.
+        """
+        # Drop <think> (and anything before it in this delta) when the model
+        # generates the tag itself (old template / edge case).
+        if self.start_token_id in delta_token_ids:
+            start_idx = delta_text.find(self.start_token)
+            if start_idx >= 0:
+                delta_text = delta_text[start_idx + len(self.start_token) :]
+
+        if self.end_token_id in delta_token_ids:
+            # End token in this delta: split reasoning from content.
+            end_index = delta_text.find(self.end_token)
+            if end_index >= 0:
+                reasoning = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token) :]
+                if not reasoning and not content:
+                    return None
+                return DeltaMessage(
+                    reasoning=reasoning if reasoning else None,
+                    content=content if content else None,
+                )
+            # end_token_id in IDs but not in text (already stripped)
+            return None
+
+        # No end token in this delta.
+        if not delta_text:
+            # Nothing left to emit (e.g. the delta was only <think>).
+            return None
+        elif self.end_token_id in previous_token_ids:
+            # End token already passed: everything is content now.
+            return DeltaMessage(content=delta_text)
+        else:
+            # No end token yet: still in reasoning phase.
+            return DeltaMessage(reasoning=delta_text)