[Tool] Add adjust_request to reasoning parser, and Gemma4 fixes (#39027)
Signed-off-by: Ben Browning <bbrownin@redhat.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -4,6 +4,9 @@
|
||||
import pytest
|
||||
|
||||
from tests.reasoning.utils import run_reasoning_extraction
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionRequest,
|
||||
)
|
||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||
|
||||
# Using mistral tokenizer as a generic mock since the actual model is not on HF
|
||||
@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = {
|
||||
"is_reasoning_end": True,
|
||||
}
|
||||
|
||||
# A leading "thought" channel: the reasoning span is extracted and the
# remainder after the closing marker becomes the visible content.
THOUGHT_PREFIX = {
    "output": "<|channel>thought\nActual reasoning here<channel|>Final answer",
    "reasoning": "Actual reasoning here",
    "content": "Final answer",
    "is_reasoning_end": True,
}
# An empty thought block: reasoning is the empty string and nothing
# follows the closing marker, so content is None.
THOUGHT_PREFIX_ONLY = {
    "output": "<|channel>thought\n<channel|>",
    "reasoning": "",
    "content": None,
    "is_reasoning_end": True,
}
# Reasoning that spans multiple lines inside the thought channel.
THOUGHT_PREFIX_MULTILINE = {
    "output": "<|channel>thought\nLine1\nLine2<channel|>Answer",
    "reasoning": "Line1\nLine2",
    "content": "Answer",
    "is_reasoning_end": True,
}
# "thousand" starts like "thought" but diverges — exercises Case 2→3 in streaming.
THOUGHT_PREFIX_DIVERGE = {
    "output": "<|channel>thousand reasons<channel|>Done",
    "reasoning": "thousand reasons",
    "content": "Done",
    "is_reasoning_end": True,
}
# The model isn't reasoning if we're generating tool calls.
TOOL_CALL_STARTED = {
    "output": "<|tool_call>",
    "reasoning": None,
    "content": "<|tool_call>",
    "is_reasoning_end": True,
}
|
||||
|
||||
TEST_CASES = [
|
||||
pytest.param(False, INVALID_SIMPLE_NONSTREAMING, id="invalid_simple"),
|
||||
pytest.param(True, INVALID_SIMPLE_STREAMING, id="invalid_simple_streaming"),
|
||||
@@ -120,17 +156,22 @@ TEST_CASES = [
|
||||
pytest.param(False, EMPTY, id="empty"),
|
||||
pytest.param(False, NEW_LINE_NONSTREAMING, id="new_line"),
|
||||
pytest.param(True, NEW_LINE_STREAMING, id="new_line_streaming"),
|
||||
pytest.param(False, THOUGHT_PREFIX, id="thought_prefix"),
|
||||
pytest.param(True, THOUGHT_PREFIX, id="thought_prefix_streaming"),
|
||||
pytest.param(False, THOUGHT_PREFIX_ONLY, id="thought_prefix_only"),
|
||||
pytest.param(True, THOUGHT_PREFIX_ONLY, id="thought_prefix_only_streaming"),
|
||||
pytest.param(False, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline"),
|
||||
pytest.param(
|
||||
True, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline_streaming"
|
||||
),
|
||||
pytest.param(False, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge"),
|
||||
pytest.param(True, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge_streaming"),
|
||||
pytest.param(False, TOOL_CALL_STARTED, id="tool_call_started"),
|
||||
pytest.param(True, TOOL_CALL_STARTED, id="tool_call_started_streaming"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
|
||||
def test_gemma4_reasoning(
|
||||
streaming: bool,
|
||||
param_dict: dict,
|
||||
generic_tokenizer,
|
||||
):
|
||||
output = param_dict["output"]
|
||||
|
||||
def gemma4_encode_output(generic_tokenizer, output: str) -> list[int]:
|
||||
# Resolve token IDs dynamically from the real tokenizer
|
||||
vocab = generic_tokenizer.get_vocab()
|
||||
start_token_id = vocab["<|channel>"]
|
||||
@@ -176,6 +217,18 @@ def test_gemma4_reasoning(
|
||||
else:
|
||||
output_tokens += _encode(output)
|
||||
|
||||
return output_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
|
||||
def test_gemma4_reasoning(
|
||||
streaming: bool,
|
||||
param_dict: dict,
|
||||
generic_tokenizer,
|
||||
):
|
||||
output = param_dict["output"]
|
||||
output_tokens = gemma4_encode_output(generic_tokenizer, output)
|
||||
|
||||
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
|
||||
generic_tokenizer
|
||||
)
|
||||
@@ -194,3 +247,29 @@ def test_gemma4_reasoning(
|
||||
# Test is_reasoning_end
|
||||
is_reasoning_end = parser.is_reasoning_end(output_tokens)
|
||||
assert is_reasoning_end == param_dict["is_reasoning_end"]
|
||||
|
||||
|
||||
def test_gemma4_adjust_request(generic_tokenizer):
    """adjust_request must flip skip_special_tokens off, mutating in place.

    The reasoning parser relies on seeing the channel marker tokens in the
    decoded stream, so it has to turn off the default special-token
    stripping on the incoming request.
    """
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    request = ChatCompletionRequest(messages=[], model="test-model")
    # Sanity-check the protocol default before the parser touches it.
    assert request.skip_special_tokens is True

    adjusted = parser.adjust_request(request)
    # Special tokens are now kept, and the same request object is
    # returned (mutated in place, not copied).
    assert adjusted.skip_special_tokens is False
    assert adjusted is request
|
||||
|
||||
|
||||
def test_gemma4_previous_turn_reasoning_is_reasoning_end(generic_tokenizer):
    """A closed thought block from an earlier turn must not end reasoning.

    The prompt below contains a complete thought/content pair followed by a
    new user turn and the start of a fresh model turn; the parser should
    report that reasoning has NOT ended for the current turn.
    """
    prior_turns = (
        "<|channel>thought\n1st thought<channel|>1st content<turn|>\n"
        "<|turn>user\nThanks<|turn>model\n"
    )
    token_ids = gemma4_encode_output(generic_tokenizer, prior_turns)

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    # Only the current model turn counts toward reasoning state.
    assert not parser.is_reasoning_end(token_ids)
|
||||
|
||||
Reference in New Issue
Block a user