2025-04-30 00:32:40 +08:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2025-06-03 11:20:17 -07:00
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
2025-04-30 00:32:40 +08:00
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
from transformers import AutoTokenizer
|
|
|
|
|
|
2026-02-21 23:15:35 -08:00
|
|
|
from tests.reasoning.utils import (
|
|
|
|
|
StreamingReasoningReconstructor,
|
|
|
|
|
run_reasoning_extraction,
|
|
|
|
|
run_reasoning_extraction_streaming,
|
|
|
|
|
)
|
2026-02-27 04:30:45 +08:00
|
|
|
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
|
2025-04-30 00:32:40 +08:00
|
|
|
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
|
|
|
|
|
|
|
|
|
# Registry name of the reasoning parser under test.
parser_name = "qwen3"
# Delimiters the qwen3 chat template places around the reasoning section.
start_token = "<think>"
end_token = "</think>"

# Model names that parametrize the module-scoped qwen3_tokenizer fixture below.
REASONING_MODEL_NAMES = [
    "Qwen/Qwen3-0.6B",
    "Qwen/Qwen3.5-397B-A17B",
    "Qwen/Qwen3-4B-Thinking-2507",
]
@pytest.fixture(scope="module", params=REASONING_MODEL_NAMES)
def qwen3_tokenizer(request):
    """Load the tokenizer for each parametrized Qwen3-family model.

    Module scope keeps each tokenizer download/load to once per test run.
    """
    model_name = request.param
    return AutoTokenizer.from_pretrained(model_name)
# --- <think> in prompt, only </think> in output (typical) ---
WITHOUT_START_TOKEN = dict(
    output="This is a reasoning section</think>This is the rest",
    reasoning="This is a reasoning section",
    content="This is the rest",
)
WITHOUT_START_TOKEN_STREAM = dict(
    output="This is a reasoning section</think>This is the rest",
    reasoning="This is a reasoning section",
    content="This is the rest",
)
WITHOUT_START_TOKEN_COMPLETE_REASONING = dict(
    output="This is a reasoning section</think>",
    reasoning="This is a reasoning section",
    content=None,
)
# --- <think> present in output (old template / edge case) ---
WITH_THINK = dict(
    output="<think>This is a reasoning section</think>This is the rest",
    reasoning="This is a reasoning section",
    content="This is the rest",
)
WITH_THINK_STREAM = dict(
    output="<think>This is a reasoning section</think>This is the rest",
    reasoning="This is a reasoning section",
    content="This is the rest",
)
# --- No think tokens at all (thinking enabled, truncated) ---
# With thinking enabled (default), no think tokens means the output was
# truncated before </think> could be generated. All output is reasoning.
WITHOUT_THINK = dict(
    output="This is the rest",
    reasoning="This is the rest",
    content=None,
)
# In streaming, the parser cannot distinguish "thinking disabled" from
# "reasoning in progress" when no think tokens have appeared yet.
# It assumes reasoning. The serving layer handles the "thinking disabled"
# case by checking prompt_is_reasoning_end_arr before calling the parser.
WITHOUT_THINK_STREAM = dict(
    output="This is the rest",
    reasoning="This is the rest",
    content=None,
)
# --- Edge cases ---
COMPLETE_REASONING = dict(
    output="<think>This is a reasoning section</think>",
    reasoning="This is a reasoning section",
    content=None,
)
MULTILINE_REASONING = dict(
    output="<think>This is a reasoning\nsection</think>This is the rest\nThat",
    reasoning="This is a reasoning\nsection",
    content="This is the rest\nThat",
)
# Truncated output: <think> present but no </think> (thinking enabled).
# Everything is reasoning because the output was cut off mid-thought.
ONLY_OPEN_TAG = dict(
    output="<think>This is a reasoning section",
    reasoning="This is a reasoning section",
    content=None,
)
ONLY_OPEN_TAG_STREAM = dict(
    output="<think>This is a reasoning section",
    reasoning="This is a reasoning section",
    content=None,
)
# Truncated output without <think> prefix (Qwen3.5 style where <think>
# is in the prompt). No </think> means truncation — all is reasoning.
TRUNCATED_NO_START_TOKEN = dict(
    output="This is a reasoning section",
    reasoning="This is a reasoning section",
    content=None,
)
TRUNCATED_NO_START_TOKEN_STREAM = dict(
    output="This is a reasoning section",
    reasoning="This is a reasoning section",
    content=None,
)
# (streaming?, case-dict, pytest id) triples expanded into pytest.param
# objects, in the same order as the original hand-written list.
TEST_CASES = [
    pytest.param(streaming, case, id=case_id)
    for streaming, case, case_id in [
        (False, WITHOUT_START_TOKEN, "without_start_token"),
        (True, WITHOUT_START_TOKEN_STREAM, "without_start_token_stream"),
        (
            False,
            WITHOUT_START_TOKEN_COMPLETE_REASONING,
            "without_start_token_complete_reasoning",
        ),
        (
            True,
            WITHOUT_START_TOKEN_COMPLETE_REASONING,
            "without_start_token_complete_reasoning_stream",
        ),
        (False, WITH_THINK, "with_think"),
        (True, WITH_THINK_STREAM, "with_think_stream"),
        (False, WITHOUT_THINK, "without_think"),
        (True, WITHOUT_THINK_STREAM, "without_think_stream"),
        (False, COMPLETE_REASONING, "complete_reasoning"),
        (True, COMPLETE_REASONING, "complete_reasoning_stream"),
        (False, MULTILINE_REASONING, "multiline_reasoning"),
        (True, MULTILINE_REASONING, "multiline_reasoning_stream"),
        (False, ONLY_OPEN_TAG, "only_open_tag"),
        (True, ONLY_OPEN_TAG_STREAM, "only_open_tag_stream"),
        (False, TRUNCATED_NO_START_TOKEN, "truncated_no_start_token"),
        (True, TRUNCATED_NO_START_TOKEN_STREAM, "truncated_no_start_token_stream"),
    ]
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    qwen3_tokenizer,
):
    """Run one extraction case through the qwen3 parser, batch or streaming."""
    # Re-detokenize token-by-token so the streaming path sees realistic
    # per-token text deltas rather than one big string.
    raw_tokens = qwen3_tokenizer.tokenize(param_dict["output"])
    output_tokens: list[str] = []
    for tok in raw_tokens:
        output_tokens.append(qwen3_tokenizer.convert_tokens_to_string([tok]))

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(qwen3_tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )

    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
# Multi-token delta tests: simulate real-world streaming where a single
# delta can contain multiple tokens (e.g., speculative decoding).
MULTI_TOKEN_DELTA_CASES = [
    pytest.param(deltas, expected_reasoning, expected_content, id=case_id)
    for deltas, expected_reasoning, expected_content, case_id in [
        # <think> grouped with following text in one delta
        (
            ["<think>This is a reasoning section", "</think>", "This is the rest"],
            "This is a reasoning section",
            "This is the rest",
            "start_token_grouped_with_text",
        ),
        # </think> grouped with following content in one delta
        (
            ["reasoning section", "</think>This is the rest"],
            "reasoning section",
            "This is the rest",
            "end_token_grouped_with_content",
        ),
        # <think> and </think> in the same delta, no content after
        (
            ["<think>reasoning</think>"],
            "reasoning",
            None,
            "start_and_end_in_one_delta_no_content",
        ),
        # No start token, end grouped with content (Qwen3.5 style)
        (
            ["reasoning section", "</think>content"],
            "reasoning section",
            "content",
            "no_start_end_grouped_with_content",
        ),
    ]
]
@pytest.mark.parametrize(
    "deltas, expected_reasoning, expected_content", MULTI_TOKEN_DELTA_CASES
)
def test_reasoning_streaming_multi_token_deltas(
    deltas: list[str],
    expected_reasoning: str | None,
    expected_content: str | None,
    qwen3_tokenizer,
):
    """Test that multi-token deltas don't leak <think> into reasoning."""
    make_parser = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = make_parser(qwen3_tokenizer)

    reconstructor: StreamingReasoningReconstructor = run_reasoning_extraction_streaming(
        parser, deltas
    )

    assert reconstructor.reasoning == expected_reasoning
    # Normalize "" to None before comparing against the expected content.
    observed_content = reconstructor.other_content or None
    assert observed_content == expected_content
# --- Tests for enable_thinking=False (thinking explicitly disabled) ---
# With thinking disabled the expected reasoning is always None and the
# entire output is expected back as content, so cases are just (text, id).
THINKING_DISABLED_CASES = [
    pytest.param(text, None, text, id=case_id)
    for text, case_id in [
        ("This is plain content", "thinking_disabled_plain_content"),
        ("Some output without think tokens", "thinking_disabled_no_think_tokens"),
    ]
]
@pytest.mark.parametrize(
    "output, expected_reasoning, expected_content", THINKING_DISABLED_CASES
)
def test_reasoning_thinking_disabled(
    output: str,
    expected_reasoning: str | None,
    expected_content: str | None,
    qwen3_tokenizer,
):
    """When enable_thinking=False, output without </think> is all content."""
    make_parser = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = make_parser(
        qwen3_tokenizer,
        chat_template_kwargs={"enable_thinking": False},
    )

    request = ChatCompletionRequest(messages=[], model="test-model")
    reasoning, content = parser.extract_reasoning(
        model_output=output,
        request=request,
    )

    assert (reasoning, content) == (expected_reasoning, expected_content)