[Tool] adjust_request to reasoning parser, and Gemma4 fixes (#39027)

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Ben Browning
2026-04-08 15:04:04 -04:00
committed by GitHub
parent e24e0a43a4
commit 8477fe427d
12 changed files with 878 additions and 16 deletions

View File

@@ -4,6 +4,9 @@
import pytest
from tests.reasoning.utils import run_reasoning_extraction
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Using mistral tokenizer as a generic mock since the actual model is not on HF
@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = {
"is_reasoning_end": True,
}
# Fixture dicts for the Gemma4 reasoning-parser tests. Each maps a raw model
# output string to the expected reasoning/content split produced by the
# parser, plus the expected value of is_reasoning_end.

# Reasoning wrapped in the thought-channel markers, followed by final content.
THOUGHT_PREFIX = {
    "output": "<|channel>thought\nActual reasoning here<channel|>Final answer",
    "reasoning": "Actual reasoning here",
    "content": "Final answer",
    "is_reasoning_end": True,
}
# An empty thought channel with nothing after it: empty reasoning, no content.
THOUGHT_PREFIX_ONLY = {
    "output": "<|channel>thought\n<channel|>",
    "reasoning": "",
    "content": None,
    "is_reasoning_end": True,
}
# Newlines inside the thought channel must be preserved in the reasoning text.
THOUGHT_PREFIX_MULTILINE = {
    "output": "<|channel>thought\nLine1\nLine2<channel|>Answer",
    "reasoning": "Line1\nLine2",
    "content": "Answer",
    "is_reasoning_end": True,
}
# "thousand" starts like "thought" but diverges — exercises Case 2→3 in streaming.
THOUGHT_PREFIX_DIVERGE = {
    "output": "<|channel>thousand reasons<channel|>Done",
    "reasoning": "thousand reasons",
    "content": "Done",
    "is_reasoning_end": True,
}
# The model isn't reasoning if we're generating tool calls.
TOOL_CALL_STARTED = {
    "output": "<|tool_call>",
    "reasoning": None,
    "content": "<|tool_call>",
    "is_reasoning_end": True,
}
TEST_CASES = [
pytest.param(False, INVALID_SIMPLE_NONSTREAMING, id="invalid_simple"),
pytest.param(True, INVALID_SIMPLE_STREAMING, id="invalid_simple_streaming"),
@@ -120,17 +156,22 @@ TEST_CASES = [
pytest.param(False, EMPTY, id="empty"),
pytest.param(False, NEW_LINE_NONSTREAMING, id="new_line"),
pytest.param(True, NEW_LINE_STREAMING, id="new_line_streaming"),
pytest.param(False, THOUGHT_PREFIX, id="thought_prefix"),
pytest.param(True, THOUGHT_PREFIX, id="thought_prefix_streaming"),
pytest.param(False, THOUGHT_PREFIX_ONLY, id="thought_prefix_only"),
pytest.param(True, THOUGHT_PREFIX_ONLY, id="thought_prefix_only_streaming"),
pytest.param(False, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline"),
pytest.param(
True, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline_streaming"
),
pytest.param(False, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge"),
pytest.param(True, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge_streaming"),
pytest.param(False, TOOL_CALL_STARTED, id="tool_call_started"),
pytest.param(True, TOOL_CALL_STARTED, id="tool_call_started_streaming"),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_gemma4_reasoning(
streaming: bool,
param_dict: dict,
generic_tokenizer,
):
output = param_dict["output"]
def gemma4_encode_output(generic_tokenizer, output: str) -> list[int]:
# Resolve token IDs dynamically from the real tokenizer
vocab = generic_tokenizer.get_vocab()
start_token_id = vocab["<|channel>"]
@@ -176,6 +217,18 @@ def test_gemma4_reasoning(
else:
output_tokens += _encode(output)
return output_tokens
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_gemma4_reasoning(
streaming: bool,
param_dict: dict,
generic_tokenizer,
):
output = param_dict["output"]
output_tokens = gemma4_encode_output(generic_tokenizer, output)
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
generic_tokenizer
)
@@ -194,3 +247,29 @@ def test_gemma4_reasoning(
# Test is_reasoning_end
is_reasoning_end = parser.is_reasoning_end(output_tokens)
assert is_reasoning_end == param_dict["is_reasoning_end"]
def test_gemma4_adjust_request(generic_tokenizer):
    """adjust_request must turn off skip_special_tokens, mutating in place."""
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    req = ChatCompletionRequest(messages=[], model="test-model")
    # Sanity-check the protocol default before adjusting.
    assert req.skip_special_tokens is True

    adjusted = parser.adjust_request(req)
    # The parser needs the special channel tokens present in the output...
    assert adjusted.skip_special_tokens is False
    # ...and must hand back the same request object, not a copy.
    assert adjusted is req
def test_gemma4_previous_turn_reasoning_is_reasoning_end(generic_tokenizer):
    """A closed thought channel from an earlier turn must not mark the
    current (new) model turn as past its reasoning phase."""
    prior_turns = (
        "<|channel>thought\n1st thought<channel|>1st content<turn|>\n"
        "<|turn>user\nThanks<|turn>model\n"
    )
    token_ids = gemma4_encode_output(generic_tokenizer, prior_turns)
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        generic_tokenizer
    )
    # The new model turn has produced no <channel|> yet, so reasoning may
    # still be forthcoming.
    assert not parser.is_reasoning_end(token_ids)

View File

@@ -0,0 +1,345 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Gemma4 chat template rendering."""
from pathlib import Path
import jinja2.sandbox
import pytest
# Absolute path to the Gemma4 tool-calling chat template shipped under
# examples/ (three directory levels up from this test file).
TEMPLATE_PATH = (
    Path(__file__).resolve().parent.parent.parent
    / "examples"
    / "tool_chat_template_gemma4.jinja"
)
@pytest.fixture(scope="module")
def gemma4_template():
    """Compile the Gemma4 chat template once for the whole test module."""
    # Sandboxed environment mirrors how untrusted chat templates are rendered.
    sandbox = jinja2.sandbox.ImmutableSandboxedEnvironment()
    return sandbox.from_string(TEMPLATE_PATH.read_text())
def _render(template, messages, **kwargs):
"""Render the template with sensible defaults."""
kwargs.setdefault("bos_token", "<bos>")
kwargs.setdefault("add_generation_prompt", False)
return template.render(messages=messages, **kwargs)
class TestGemma4ChatTemplate:
    """Rendering tests for tool_chat_template_gemma4.jinja.

    NOTE(review): the expected markers asserted below (<|turn>, <|channel>,
    <|tool_call>, <|tool_response>, <|"|>, ...) mirror literals in the Jinja
    template itself — keep them byte-exact when editing.
    """

    def test_basic_multiturn_thinking_disabled(self, gemma4_template):
        """With enable_thinking=False (default), generation prompt ends with
        an empty thought channel to suppress thinking."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "How are you?"},
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert "<|turn>user\n" in result
        assert "<|turn>model\n" in result
        assert "Hello" in result
        assert "Hi there!" in result
        assert "How are you?" in result
        # The empty thought channel is what suppresses model thinking.
        assert result.rstrip("\n").endswith("<|channel>thought\n<channel|>")

    def test_basic_multiturn_thinking_enabled(self, gemma4_template):
        """With enable_thinking=True, generation prompt ends with model
        turn opener (no thought suppression)."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "How are you?"},
        ]
        result = _render(
            gemma4_template,
            messages,
            add_generation_prompt=True,
            enable_thinking=True,
        )
        assert "<|turn>user\n" in result
        assert "<|turn>model\n" in result
        assert "Hello" in result
        assert "Hi there!" in result
        assert "How are you?" in result
        assert result.rstrip("\n").endswith("<|turn>model")

    def test_system_message(self, gemma4_template):
        # System messages render as their own <|turn>system block.
        messages = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hi"},
        ]
        result = _render(gemma4_template, messages)
        assert "<|turn>system\n" in result
        assert "You are helpful." in result

    def test_thinking_enabled(self, gemma4_template):
        # enable_thinking=True injects a <|think|> marker (and a system turn).
        messages = [{"role": "user", "content": "Think about this"}]
        result = _render(
            gemma4_template,
            messages,
            add_generation_prompt=True,
            enable_thinking=True,
        )
        assert "<|think|>" in result
        assert "<|turn>system\n" in result

    def test_tool_declarations(self, gemma4_template):
        # Tool schemas render inside a <|tool>...<tool|> declaration block,
        # with string descriptions wrapped in <|"|> quoting markers.
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "city": {
                                "type": "string",
                                "description": "City name",
                            }
                        },
                        "required": ["city"],
                    },
                },
            }
        ]
        messages = [{"role": "user", "content": "What is the weather?"}]
        result = _render(
            gemma4_template,
            messages,
            tools=tools,
            add_generation_prompt=True,
        )
        assert "<|tool>" in result
        assert "declaration:get_weather" in result
        assert "<tool|>" in result
        assert '<|"|>City name<|"|>' in result

    def test_tool_calls_in_assistant(self, gemma4_template):
        # Assistant tool calls render as call:<name>{...} inside
        # <|tool_call>...<tool_call|>, string args quoted with <|"|>.
        messages = [
            {"role": "user", "content": "Weather in London?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert "<|tool_call>call:get_weather{" in result
        assert "}<tool_call|>" in result
        assert '<|"|>London<|"|>' in result

    def test_tool_responses_openai_style(self, gemma4_template):
        """role='tool' messages are formatted as <|tool_response> blocks
        with content dumped as-is."""
        messages = [
            {"role": "user", "content": "Weather?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call_1",
                "content": '{"temperature": 15, "condition": "sunny"}',
            },
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert "<|tool_response>" in result
        assert "response:get_weather{" in result
        assert "<tool_response|>" in result
        # Raw JSON content is passed through verbatim.
        assert '"temperature": 15' in result

    def test_tool_responses_legacy_style(self, gemma4_template):
        """tool_responses embedded on the assistant message."""
        messages = [
            {"role": "user", "content": "Weather?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
                "tool_responses": [
                    {
                        "name": "get_weather",
                        "response": {"temperature": 20},
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert "<|tool_response>" in result
        assert "response:get_weather{" in result
        assert "temperature:" in result

    def test_generation_prompt_not_after_tool_response(self, gemma4_template):
        """add_generation_prompt=True should NOT add <|turn>model when the
        last message type was tool_response (the model turn continues)."""
        messages = [
            {"role": "user", "content": "Weather?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call_1",
                "content": "sunny",
            },
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert not result.strip().endswith("<|turn>model\n")

    def test_reasoning_in_tool_chains(self, gemma4_template):
        """reasoning field on assistant with tool_calls after last user
        message emits <|channel>thought\\n...<channel|>."""
        messages = [
            {"role": "user", "content": "Calculate something"},
            {
                "role": "assistant",
                "content": "",
                "reasoning": "Let me think about this...",
                "tool_calls": [
                    {
                        "function": {
                            "name": "calculator",
                            "arguments": {"expr": "2+2"},
                        },
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert "<|channel>thought\n" in result
        assert "Let me think about this..." in result
        assert "<channel|>" in result

    def test_reasoning_not_before_last_user(self, gemma4_template):
        """reasoning on assistant BEFORE the last user message is dropped."""
        messages = [
            {"role": "user", "content": "First"},
            {
                "role": "assistant",
                "content": "Response",
                "reasoning": "Old reasoning that should be dropped",
                "tool_calls": [
                    {
                        "function": {
                            "name": "fn",
                            "arguments": {},
                        },
                    }
                ],
            },
            {"role": "user", "content": "Second"},
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert "Old reasoning" not in result

    def test_strip_thinking_in_model_content(self, gemma4_template):
        """<|channel>...<channel|> in model content is stripped by the
        strip_thinking macro."""
        messages = [
            {"role": "user", "content": "Hi"},
            {
                "role": "assistant",
                "content": ("<|channel>internal thought<channel|>Visible answer"),
            },
        ]
        result = _render(gemma4_template, messages)
        assert "internal thought" not in result
        assert "Visible answer" in result

    def test_multi_turn_tool_chain(self, gemma4_template):
        """assistant->tool->assistant->tool produces exactly one
        <|turn>model (later assistants continue the same turn)."""
        messages = [
            {"role": "user", "content": "Do two things"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "c1",
                        "function": {"name": "step1", "arguments": {}},
                    },
                ],
            },
            {"role": "tool", "tool_call_id": "c1", "content": "result1"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "c2",
                        "function": {"name": "step2", "arguments": {}},
                    },
                ],
            },
            {"role": "tool", "tool_call_id": "c2", "content": "result2"},
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert result.count("<|turn>model\n") == 1

    def test_format_argument_types(self, gemma4_template):
        """Strings wrapped in <|"|>, booleans as true/false, numbers bare."""
        messages = [
            {"role": "user", "content": "Test"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "function": {
                            "name": "test_fn",
                            "arguments": {
                                "name": "Alice",
                                "active": True,
                                "count": 42,
                            },
                        },
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert '<|"|>Alice<|"|>' in result
        assert "active:true" in result
        assert "count:42" in result

View File

@@ -114,6 +114,19 @@ class TestParseGemma4Args:
result = _parse_gemma4_args("key:")
assert result == {"key": ""}
def test_empty_value_partial_withheld(self):
    """Key with no value is withheld in partial mode to avoid premature emission."""
    # Both with and without a trailing space after the colon, a valueless
    # key must not be emitted while the arguments are still streaming in.
    for fragment in ("key:", "key: "):
        assert _parse_gemma4_args(fragment, partial=True) == {}
def test_empty_value_after_other_keys_partial_withheld(self):
    """Trailing key with no value is withheld; earlier keys are kept."""
    parsed = _parse_gemma4_args('name:<|"|>test<|"|>,flag:', partial=True)
    # "flag" has no value yet, so only the completed "name" pair survives.
    assert parsed == {"name": "test"}
class TestParseGemma4Array:
def test_string_array(self):
@@ -636,3 +649,30 @@ class TestStreamingExtraction:
' <meta charset="UTF-8">\n'
' <meta name="viewport" content="width=device-width">\n'
)
def test_streaming_trailing_bare_bool_not_duplicated(self, parser, mock_request):
    """Trailing bare boolean must not be streamed twice."""
    # The bare boolean arrives split across two deltas: "replace_all:"
    # followed by "false}" — the regression was emitting it twice.
    deltas = [
        "<|tool_call>",
        "call:Edit{",
        'file_path:<|"|>src/env.py<|"|>,',
        'old_string:<|"|>old_val<|"|>,',
        'new_string:<|"|>new_val<|"|>,',
        "replace_all:",
        "false}",
        "<tool_call|>",
    ]
    streamed = self._simulate_streaming(parser, mock_request, deltas)
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"

    expected = {
        "file_path": "src/env.py",
        "old_string": "old_val",
        "new_string": "new_val",
        "replace_all": False,
    }
    assert json.loads(args_text) == expected
    # A duplicated trailing bool would repeat the key in the raw stream.
    assert args_text.count("replace_all") == 1