diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 08a0dd69e..3315c0949 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -18,6 +18,7 @@ vLLM currently supports the following reasoning models: | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ | | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ | | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | +| [Holo2 series](https://huggingface.co/collections/Hcompany/holo2) | `holo2` | `json`, `regex` | ✅ | | [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | | [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) | `minimax_m2_append_think` | `json`, `regex` | ✅ | @@ -28,6 +29,7 @@ vLLM currently supports the following reasoning models: IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. DeepSeek-V3.1 tool calling is supported in non-thinking mode. + Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`. 
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the Holo2 reasoning parser.

Holo2's chat template emits the opening ``<think>`` tag itself, so model
output contains only ``reasoning</think>content``.  When reasoning is
disabled via ``thinking=False`` the whole output is plain content.
"""
import pytest
from transformers import AutoTokenizer

from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser

REASONING_MODEL_NAME = "HCompany/Holo2-4B"


@pytest.fixture(scope="module")
def tokenizer():
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)


@pytest.mark.parametrize(
    "thinking,expected_parser_type",
    [
        (True, DeepSeekR1ReasoningParser),
        (False, IdentityReasoningParser),
    ],
)
def test_parser_selection(tokenizer, thinking, expected_parser_type):
    # The Holo2 parser is a thin switch: it delegates to the DeepSeek-R1
    # parser when thinking is on and to the identity parser when it is off.
    parser = Holo2ReasoningParser(
        tokenizer,
        chat_template_kwargs={
            "thinking": thinking,
        },
    )

    assert isinstance(parser._parser, expected_parser_type)


def test_holo2_default_parser_is_deepseekr1(tokenizer):
    # Without chat_template_kwargs, Holo2 must default to thinking enabled.
    parser = Holo2ReasoningParser(tokenizer)

    assert isinstance(parser._parser, DeepSeekR1ReasoningParser)


def test_holo2_supports_structured_output(tokenizer):
    # Structured output manager uses the reasoning parser to check if the
    # reasoning content is ended before applying the grammar. The main function
    # used is is_reasoning_end. This test checks if the parser is able to
    # correctly identify the end of the reasoning content.

    # important to not pass chat_template_kwargs here as it is done in the
    # StructuredOutputManager
    parser = Holo2ReasoningParser(tokenizer)

    # NOTE(review): the original text lost the tag literal to extraction
    # mangling; encoding "</think>" is the only reading under which [0] is
    # valid and the assertions below make sense.
    end_token_id = tokenizer.encode("</think>", add_special_tokens=False)[0]

    assert parser.is_reasoning_end([1, 2, 4, end_token_id])
    assert not parser.is_reasoning_end([1, 2, 4])
    assert parser.is_reasoning_end([1, 2, 4, end_token_id, 5])


# thinking is True, non-streaming
WITH_THINK = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# thinking is True, streaming
WITH_THINK_STREAM = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# thinking is False, non-streaming
THINKING_DISABLED = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# thinking is False, streaming
THINKING_DISABLED_STREAM = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# thinking is False but the model output </think>, non-streaming.
# The identity parser passes the text through unchanged, tag included.
THINKING_DISABLED_WITH_CLOSE_TAG = {
    "output": "</think>This is the rest",
    "reasoning": None,
    "content": "</think>This is the rest",
}
# thinking is False but the model output </think>, streaming
THINKING_DISABLED_WITH_CLOSE_TAG_STREAM = {
    "output": "some text</think>This is the rest",
    "reasoning": None,
    "content": "some text</think>This is the rest",
}
COMPLETE_REASONING = {
    "output": "This is a reasoning section</think>",
    "reasoning": "This is a reasoning section",
    "content": None,
}

TEST_CASES = [
    pytest.param(
        False,
        WITH_THINK,
        None,
        id="with_think",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        None,
        id="with_think_stream",
    ),
    pytest.param(
        False,
        WITH_THINK,
        {"thinking": True},
        id="with_think_enabled",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        {"thinking": True},
        id="with_think_stream_enabled",
    ),
    pytest.param(
        False,
        THINKING_DISABLED,
        {"thinking": False},
        id="thinking_disabled",
    ),
    pytest.param(
        True,
        THINKING_DISABLED_STREAM,
        {"thinking": False},
        id="thinking_disabled_stream",
    ),
    pytest.param(
        False,
        THINKING_DISABLED_WITH_CLOSE_TAG,
        {"thinking": False},
        id="thinking_disabled_with_close_tag",
    ),
    pytest.param(
        True,
        THINKING_DISABLED_WITH_CLOSE_TAG_STREAM,
        {"thinking": False},
        id="thinking_disabled_with_close_tag_stream",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        None,
        id="complete_reasoning",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        None,
        id="complete_reasoning_stream",
    ),
]


@pytest.mark.parametrize("streaming, param_dict, chat_template_kwargs", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    chat_template_kwargs: dict | None,
    tokenizer,
):
    # Re-detokenize token-by-token so the streaming path sees realistic deltas.
    output = tokenizer.tokenize(param_dict["output"])
    output_tokens: list[str] = [
        tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    # Resolve via the registry to also cover the "holo2" registration.
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser("holo2")(
        tokenizer,
        chat_template_kwargs=chat_template_kwargs,
    )

    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )

    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import (
    ReasoningParser,
)
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
from vllm.tokenizers import TokenizerLike

logger = init_logger(__name__)


class Holo2ReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Holo2 models, which are based on Qwen3.

    The Holo2 model uses <think>...</think> tokens to denote reasoning text,
    but the opening <think> tag is emitted by the chat template, so this
    parser extracts the reasoning content up to </think> in the model's
    output.

    The model provides a switch to enable or disable reasoning
    output via the 'thinking=False' parameter.

    Chat template args:
        - thinking: Whether to enable reasoning output (default: True)

    Parsing rules on model output:
        - thinking == False
            -> Model output is treated as purely the content |content|
        - thinking == True
            -> Model output is |reasoning_content|</think>|content|
    """

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
        # Deepseek V3 and Holo2 are similar. However, Holo2 models think by
        # default. This parser (without user-specified chat template args) is
        # instantiated once for all requests in the structured output manager,
        # so it is important that the default for 'thinking' is True.
        enable_thinking = bool(chat_kwargs.get("thinking", True))

        if enable_thinking:
            # Reasoning on: split output on </think> like DeepSeek-R1.
            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
        else:
            # Reasoning off: the whole output is content, no reasoning part.
            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Delegate: True once the reasoning section has ended."""
        return self._parser.is_reasoning_end(input_ids)

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Delegate: token ids that belong to the content section."""
        return self._parser.extract_content_ids(input_ids)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Delegate: split ``model_output`` into (reasoning, content)."""
        return self._parser.extract_reasoning(model_output, request)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Delegate: incremental (streaming) reasoning/content extraction."""
        return self._parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )