[Tool] Add adjust_request to reasoning parser, and Gemma4 fixes (#39027)

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-04-08 15:04:04 -04:00
parent e24e0a43a4
commit 8477fe427d
12 changed files with 878 additions and 16 deletions
--- a/tests/reasoning/test_gemma4_reasoning_parser.py
+++ b/tests/reasoning/test_gemma4_reasoning_parser.py
@@ -4,6 +4,9 @@
 import pytest

 from tests.reasoning.utils import run_reasoning_extraction
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
 from vllm.reasoning import ReasoningParser, ReasoningParserManager

 # Using mistral tokenizer as a generic mock since the actual model is not on HF
@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = {
    "is_reasoning_end": True,
 }

+THOUGHT_PREFIX = {
+    "output": "<|channel>thought\nActual reasoning here<channel|>Final answer",
+    "reasoning": "Actual reasoning here",
+    "content": "Final answer",
+    "is_reasoning_end": True,
+}
+THOUGHT_PREFIX_ONLY = {
+    "output": "<|channel>thought\n<channel|>",
+    "reasoning": "",
+    "content": None,
+    "is_reasoning_end": True,
+}
+THOUGHT_PREFIX_MULTILINE = {
+    "output": "<|channel>thought\nLine1\nLine2<channel|>Answer",
+    "reasoning": "Line1\nLine2",
+    "content": "Answer",
+    "is_reasoning_end": True,
+}
+# "thousand" starts like "thought" but diverges — exercises Case 2→3 in streaming.
+THOUGHT_PREFIX_DIVERGE = {
+    "output": "<|channel>thousand reasons<channel|>Done",
+    "reasoning": "thousand reasons",
+    "content": "Done",
+    "is_reasoning_end": True,
+}
+# A tool call with no preceding reasoning block: all output is content.
+TOOL_CALL_STARTED = {
+    "output": "<|tool_call>",
+    "reasoning": None,
+    "content": "<|tool_call>",
+    "is_reasoning_end": True,
+}
+
 TEST_CASES = [
    pytest.param(False, INVALID_SIMPLE_NONSTREAMING, id="invalid_simple"),
    pytest.param(True, INVALID_SIMPLE_STREAMING, id="invalid_simple_streaming"),
@@ -120,17 +156,22 @@ TEST_CASES = [
    pytest.param(False, EMPTY, id="empty"),
    pytest.param(False, NEW_LINE_NONSTREAMING, id="new_line"),
    pytest.param(True, NEW_LINE_STREAMING, id="new_line_streaming"),
+    pytest.param(False, THOUGHT_PREFIX, id="thought_prefix"),
+    pytest.param(True, THOUGHT_PREFIX, id="thought_prefix_streaming"),
+    pytest.param(False, THOUGHT_PREFIX_ONLY, id="thought_prefix_only"),
+    pytest.param(True, THOUGHT_PREFIX_ONLY, id="thought_prefix_only_streaming"),
+    pytest.param(False, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline"),
+    pytest.param(
+        True, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline_streaming"
+    ),
+    pytest.param(False, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge"),
+    pytest.param(True, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge_streaming"),
+    pytest.param(False, TOOL_CALL_STARTED, id="tool_call_started"),
+    pytest.param(True, TOOL_CALL_STARTED, id="tool_call_started_streaming"),
 ]


-@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
-def test_gemma4_reasoning(
-    streaming: bool,
-    param_dict: dict,
-    generic_tokenizer,
-):
-    output = param_dict["output"]
-
+def gemma4_encode_output(generic_tokenizer, output: str) -> list[int]:
    # Resolve token IDs dynamically from the real tokenizer
    vocab = generic_tokenizer.get_vocab()
    start_token_id = vocab["<|channel>"]
@@ -176,6 +217,18 @@ def test_gemma4_reasoning(
    else:
        output_tokens += _encode(output)

+    return output_tokens
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_gemma4_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    generic_tokenizer,
+):
+    output = param_dict["output"]
+    output_tokens = gemma4_encode_output(generic_tokenizer, output)
+
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        generic_tokenizer
    )
@@ -194,3 +247,29 @@ def test_gemma4_reasoning(
    # Test is_reasoning_end
    is_reasoning_end = parser.is_reasoning_end(output_tokens)
    assert is_reasoning_end == param_dict["is_reasoning_end"]
+
+
+def test_gemma4_adjust_request(generic_tokenizer):
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        generic_tokenizer
+    )
+
+    request = ChatCompletionRequest(messages=[], model="test-model")
+    assert request.skip_special_tokens is True
+
+    result = parser.adjust_request(request)
+    assert result.skip_special_tokens is False
+    assert result is request
+
+
+def test_gemma4_previous_turn_reasoning_is_reasoning_end(generic_tokenizer):
+    output = (
+        "<|channel>thought\n1st thought<channel|>1st content<turn|>\n"
+        "<|turn>user\nThanks<|turn>model\n"
+    )
+    output_tokens = gemma4_encode_output(generic_tokenizer, output)
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        generic_tokenizer
+    )
+    is_reasoning_end = parser.is_reasoning_end(output_tokens)
+    assert not is_reasoning_end