diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
index 47f841540..e9d33ba9b 100644
--- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
+++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
@@ -23,7 +23,7 @@ class TestGptOssStructuralTagsIntegration:
         """Create a mock tokenizer."""
         tokenizer = Mock()
         tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
-        tokenizer.vocab = {"<|end|>": 6}
+        tokenizer.get_vocab = Mock(return_value={"<|end|>": 6})
         return tokenizer
 
     @pytest.fixture
diff --git a/tests/v1/structured_output/test_gptoss_structural_tags.py b/tests/v1/structured_output/test_gptoss_structural_tags.py
index fafa9d8ed..fb1eae53d 100644
--- a/tests/v1/structured_output/test_gptoss_structural_tags.py
+++ b/tests/v1/structured_output/test_gptoss_structural_tags.py
@@ -25,7 +25,7 @@ class TestGptOssReasoningParser:
         """Create a mock tokenizer for testing."""
         tokenizer = Mock()
         tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
-        tokenizer.vocab = {"<|end|>": 6}
+        tokenizer.get_vocab = Mock(return_value={"<|end|>": 6})
         return tokenizer
 
     @pytest.fixture
diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 717d9cf53..0a22494d0 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -41,7 +41,6 @@ EXCLUDE = [
     # TODO: Remove these entries after fixing mypy errors.
     "vllm/benchmarks",
     "vllm/config",
-    "vllm/reasoning",
 ]
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 83c3e6b90..5271a3070 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -6,7 +6,7 @@ import os
 from abc import abstractmethod
 from collections.abc import Callable, Iterable, Sequence
 from functools import cached_property
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 from vllm.entrypoints.mcp.tool_server import ToolServer
 from vllm.logger import init_logger
@@ -14,21 +14,10 @@ from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import import_from_path
 
 if TYPE_CHECKING:
-    from vllm.entrypoints.openai.chat_completion.protocol import (
-        ChatCompletionRequest,
-    )
-    from vllm.entrypoints.openai.engine.protocol import (
-        DeltaMessage,
-    )
-    from vllm.entrypoints.openai.responses.protocol import (
-        ResponsesRequest,
-    )
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
     from vllm.tokenizers import TokenizerLike
-else:
-    ChatCompletionRequest = Any
-    DeltaMessage = Any
-    ResponsesRequest = Any
-    TokenizerLike = Any
 
 logger = init_logger(__name__)
@@ -41,7 +30,7 @@ class ReasoningParser:
     It is used to extract reasoning content from the model output.
     """
 
-    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+    def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
         self.model_tokenizer = tokenizer
 
     @cached_property
@@ -127,7 +116,7 @@ class ReasoningParser:
     def extract_reasoning(
         self,
         model_output: str,
-        request: ChatCompletionRequest | ResponsesRequest,
+        request: "ChatCompletionRequest | ResponsesRequest",
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from a complete model-generated string.
 
         Used for non-streaming responses where we have the entire model
         response available before sending to the client.
 
         Parameters:
-        model_output: str
-            The model-generated string to extract reasoning content from.
-
-        request: ChatCompletionRequest
-            The request object that was used to generate the model_output.
+        model_output: The model-generated string to extract reasoning content from.
+        request: The request object that was used to generate the model_output.
 
         Returns:
-        tuple[Optional[str], Optional[str]]
             A tuple containing the reasoning content and the content.
         """
@@ -156,7 +141,7 @@ class ReasoningParser:
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> DeltaMessage | None:
+    ) -> "DeltaMessage | None":
         """
         Instance method that should be implemented for extracting reasoning
         from an incomplete response; for use when handling reasoning calls and
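The abs_reasoning_parsers.py hunks above are the template for most of this diff: the request/response protocol imports move under `if TYPE_CHECKING:` and every annotation that uses them becomes a string, so the names are only resolved by the type checker, the module avoids runtime import cycles, and the old `else: ... = Any` fallback is no longer needed. A minimal standalone sketch of the pattern (module and class names hypothetical):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by static type checkers such as mypy; at runtime
        # TYPE_CHECKING is False, so no import (and no import cycle) happens.
        from heavy_or_cyclic_module import Request  # hypothetical name

    class Parser:
        # Quoted annotation: "Request" is never evaluated at runtime.
        def parse(self, request: "Request") -> str:
            return type(request).__name__

Runtime behavior is unchanged while mypy can still check the signatures, which is what allows "vllm/reasoning" to be dropped from the exclude list in tools/pre_commit/mypy.py above.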
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index 5b1c0111c..a8bb33d2c 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -4,22 +4,15 @@
 from abc import abstractmethod
 from collections.abc import Iterable, Sequence
 from itertools import islice
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.tokenizers import TokenizerLike
 
 if TYPE_CHECKING:
-    from vllm.entrypoints.openai.chat_completion.protocol import (
-        ChatCompletionRequest,
-    )
-    from vllm.entrypoints.openai.responses.protocol import (
-        ResponsesRequest,
-    )
-else:
-    ChatCompletionRequest = Any
-    ResponsesRequest = Any
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
 
 
 class BaseThinkingReasoningParser(ReasoningParser):
@@ -58,13 +51,15 @@ class BaseThinkingReasoningParser(ReasoningParser):
         if not self.start_token or not self.end_token:
             raise ValueError("start_token and end_token must be defined in subclasses")
 
-        self.start_token_id = self.vocab.get(self.start_token)
-        self.end_token_id = self.vocab.get(self.end_token)
-        if self.start_token_id is None or self.end_token_id is None:
+        start_token_id = self.vocab.get(self.start_token)
+        end_token_id = self.vocab.get(self.end_token)
+        if start_token_id is None or end_token_id is None:
             raise RuntimeError(
                 f"{self.__class__.__name__} reasoning parser could not locate "
                 "think start/end tokens in the tokenizer!"
             )
+        self.start_token_id: int = start_token_id
+        self.end_token_id: int = end_token_id
 
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         start_token_id = self.start_token_id
@@ -152,7 +147,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
         return DeltaMessage(content=delta_text)
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
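The constructor rewrite in basic_parsers.py is a mypy narrowing idiom rather than a behavior change: `dict.get()` returns `int | None`, and assigning that directly to the attributes would make `start_token_id`/`end_token_id` optional at every use site. Binding to locals, raising while they may still be `None`, and only then storing them gives the attributes a plain `int` type. A sketch of the idiom, with a hypothetical vocab:

    class ThinkParserSketch:
        def __init__(self, vocab: dict[str, int]) -> None:
            start = vocab.get("<think>")  # inferred as int | None
            end = vocab.get("</think>")
            if start is None or end is None:
                raise RuntimeError("think start/end tokens missing from vocab")
            # After the check, mypy has narrowed both locals to int.
            self.start_token_id: int = start
            self.end_token_id: int = end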
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index c2efe6500..d2f7f50a3 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -2,19 +2,21 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Sequence
+from typing import TYPE_CHECKING
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
-from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
 
 from .identity_reasoning_parser import IdentityReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 logger = init_logger(__name__)
@@ -32,6 +34,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
         enable_thinking = bool(chat_kwargs.get("enable_thinking", False))
         thinking = thinking or enable_thinking
 
+        self._parser: ReasoningParser
         if thinking:
             self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
         else:
@@ -49,7 +52,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
         return self._parser.extract_content_ids(input_ids)
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         return self._parser.extract_reasoning(model_output, request)
 
@@ -61,7 +64,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> DeltaMessage | None:
+    ) -> "DeltaMessage | None":
         return self._parser.extract_reasoning_streaming(
             previous_text,
             current_text,
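The one-line `self._parser: ReasoningParser` declaration in deepseek_v3_reasoning_parser.py exists for a similar reason: without it, mypy infers the attribute's type from the first assignment (`DeepSeekR1ReasoningParser`) and then rejects the `IdentityReasoningParser` assignment in the other branch. Declaring the attribute as the common base up front makes both branches valid. An illustrative sketch with placeholder classes:

    class Base: ...
    class A(Base): ...
    class B(Base): ...

    class Wrapper:
        def __init__(self, flag: bool) -> None:
            # Without this declaration mypy infers "A" from the first
            # assignment and flags the "B()" branch as incompatible.
            self._impl: Base
            self._impl = A() if flag else B()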
diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py
index 3f04876b6..593eba4ec 100644
--- a/vllm/reasoning/ernie45_reasoning_parser.py
+++ b/vllm/reasoning/ernie45_reasoning_parser.py
@@ -2,16 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
+from typing import TYPE_CHECKING
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 logger = init_logger(__name__)
@@ -46,20 +48,12 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                 "constructor during construction."
             )
 
-        self.start_token_id = self.vocab.get(self.start_token)
-        self.end_token_id = self.vocab.get(self.end_token)
         self.response_start_token_id = self.vocab.get(self.response_start_token)
         self.response_end_token_id = self.vocab.get(self.response_end_token)
         self.newline_token_id = self.vocab.get(self.newline_token)
 
         self.parser_token_ids = [self.end_token_id, self.response_end_token_id]
 
-        if self.start_token_id is None or self.end_token_id is None:
-            raise RuntimeError(
-                "Ernie45 reasoning parser could not locate think start/end "
-                "tokens in the tokenizer!"
-            )
-
     def extract_reasoning_streaming(
         self,
         previous_text: str,
@@ -144,7 +138,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
         return DeltaMessage(reasoning=delta_text)
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index 599392e36..c5628a2bf 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -2,18 +2,20 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 from collections.abc import Sequence
+from typing import TYPE_CHECKING
 
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.mcp.tool_server import ToolServer
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 logger = init_logger(__name__)
 
 no_func_reaonsing_tag = {
@@ -78,7 +80,7 @@ class GptOssReasoningParser(ReasoningParser):
         self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>")
         # We also need to check for the <|end|> token to avoid false positives from
         # previous messages in multi-turn conversations.
-        self.eom_token_id = self.model_tokenizer.vocab["<|end|>"]
+        self.eom_token_id = self.vocab["<|end|>"]
         self.reasoning_max_num_between_tokens = 20
 
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
@@ -148,7 +150,7 @@ class GptOssReasoningParser(ReasoningParser):
     def extract_reasoning(
         self,
         model_output: str,
-        request: ChatCompletionRequest,
+        request: "ChatCompletionRequest | ResponsesRequest",
     ) -> tuple[str | None, str | None]:
         raise NotImplementedError(
             "gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used."  # noqa: E501
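gptoss_reasoning_parser.py now reads `<|end|>` through `self.vocab` instead of `self.model_tokenizer.vocab`. `vocab` here is presumably the base parser's cached property backed by the tokenizer's `get_vocab()` (a `TokenizerLike` method), which is why both test fixtures at the top of this diff switched from setting a `vocab` attribute to mocking `get_vocab`. A rough sketch of that relationship, assuming the property delegates as described:

    from functools import cached_property
    from unittest.mock import Mock

    class ParserSketch:
        def __init__(self, tokenizer) -> None:
            self.model_tokenizer = tokenizer

        @cached_property
        def vocab(self) -> dict[str, int]:
            # Assumed shape of the real property: delegate to get_vocab().
            return self.model_tokenizer.get_vocab()

    tokenizer = Mock()
    tokenizer.get_vocab = Mock(return_value={"<|end|>": 6})
    assert ParserSketch(tokenizer).vocab["<|end|>"] == 6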
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 5cae16f74..2d8052f61 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -2,17 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
+from typing import TYPE_CHECKING
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 logger = init_logger(__name__)
@@ -53,7 +55,7 @@
         )
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
index ae3b86a89..f833f8f32 100644
--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -2,17 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
+from typing import TYPE_CHECKING
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 logger = init_logger(__name__)
@@ -65,8 +67,8 @@ class HunyuanA13BReasoningParser(ReasoningParser):
         self.fast_think_ids = [14023, 771, 1363, 524, 27963, 397, 27, 9399, 397]
 
         # when state change, send out all the buffered text in last state
-        self.buffered_text = []
-        self.buffered_ids = []
+        self.buffered_text: list[str] = []
+        self.buffered_ids: list[int] = []
 
         self.current_state = "reasoning"
         self.all_states = ["reasoning", "response"]
@@ -76,7 +78,7 @@
         # this sequence only for the think start, it has two way to start.
         self.expected_sequence_side = self.think_start_ids_fast
         self.sequence_index = 0
-        self.token_buffer = []
+        self.token_buffer: list[int] = []
         self.text_buffer = ""
 
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
@@ -90,7 +92,7 @@
         return []
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
index 3c76901a3..b02a9d318 100644
--- a/vllm/reasoning/identity_reasoning_parser.py
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -2,16 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Sequence
+from typing import TYPE_CHECKING
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 logger = init_logger(__name__)
@@ -59,7 +61,7 @@
         return None
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         # No reasoning separation: return None for reasoning,
         # and full model_output as content
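The `buffered_text: list[str] = []` style annotations added in hunyuan_a13b_reasoning_parser.py above address mypy's var-annotated error: an empty literal carries no element type, so the checker needs an explicit annotation. For example:

    class StreamStateSketch:
        def __init__(self) -> None:
            # Without annotations, mypy reports e.g.
            # 'Need type annotation for "buffered_ids"'.
            self.buffered_text: list[str] = []
            self.buffered_ids: list[int] = []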
diff --git a/vllm/reasoning/kimi_k2_reasoning_parser.py b/vllm/reasoning/kimi_k2_reasoning_parser.py
index 8dd1a76e5..8ee05ffd2 100644
--- a/vllm/reasoning/kimi_k2_reasoning_parser.py
+++ b/vllm/reasoning/kimi_k2_reasoning_parser.py
@@ -1,17 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
+from typing import TYPE_CHECKING
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 
 class KimiK2ReasoningParser(ReasoningParser):
     """
@@ -39,6 +41,7 @@ class KimiK2ReasoningParser(ReasoningParser):
         thinking = bool(chat_kwargs.get("thinking", True))
 
         # If thinking is not enabled, use identity parser to fall through
+        self._identity_parser: IdentityReasoningParser | None
         if not thinking:
             self._identity_parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
         else:
@@ -62,10 +65,6 @@ class KimiK2ReasoningParser(ReasoningParser):
                 "tokens in the tokenizer!"
             )
 
-    def _is_identity_mode(self) -> bool:
-        """Check if parser is in identity mode (no reasoning extraction)."""
-        return self._identity_parser is not None
-
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         """
         Check if the reasoning content ends in the input_ids.
 
         The reasoning content ends
         1. The end token ()
         2. The tool section start token (<|tool_calls_section_begin|>)
         """
-        if self._is_identity_mode():
+        if self._identity_parser is not None:
             return self._identity_parser.is_reasoning_end(input_ids)
 
         start_token_id = self._start_token_id
@@ -95,29 +94,32 @@ class KimiK2ReasoningParser(ReasoningParser):
         return False
 
     def is_reasoning_end_streaming(
-        self, input_ids: Sequence[int], delta_ids: Sequence[int]
+        self, input_ids: Sequence[int], delta_ids: Iterable[int]
     ) -> bool:
         """
         Check if the reasoning content ends in the input_ids on a decode step.
         """
-        if self._is_identity_mode():
+        if self._identity_parser is not None:
             return self._identity_parser.is_reasoning_end_streaming(
                 input_ids, delta_ids
             )
 
+        # Materialize iterable for membership checks
+        delta_ids_set = set(delta_ids)
+
         # Check for explicit end token or implicit tool section start in delta
-        if self._end_token_id in delta_ids:
+        if self._end_token_id in delta_ids_set:
             return True
 
         return (
             self._tool_section_start_token_id is not None
-            and self._tool_section_start_token_id in delta_ids
+            and self._tool_section_start_token_id in delta_ids_set
         )
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
         Extract content token ids from the input_ids.
         """
-        if self._is_identity_mode():
+        if self._identity_parser is not None:
             return self._identity_parser.extract_content_ids(input_ids)
 
         if self._end_token_id in input_ids:
@@ -145,12 +147,12 @@ class KimiK2ReasoningParser(ReasoningParser):
         return []
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
         """
-        if self._is_identity_mode():
+        if self._identity_parser is not None:
             return self._identity_parser.extract_reasoning(model_output, request)
 
         # thinking does not require a think start token but consume it if present
@@ -189,7 +191,7 @@ class KimiK2ReasoningParser(ReasoningParser):
         """
         Extract reasoning content from a delta message during streaming.
         """
-        if self._is_identity_mode():
+        if self._identity_parser is not None:
             return self._identity_parser.extract_reasoning_streaming(
                 previous_text,
                 current_text,
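Widening `delta_ids` to `Iterable[int]` in kimi_k2_reasoning_parser.py means the argument may be a single-pass generator, and the method performs two membership tests; the second test on an exhausted generator would silently see nothing. Copying into a set first makes both checks correct and O(1). A self-contained sketch of the same logic:

    from collections.abc import Iterable

    def reasoning_ended(
        delta_ids: Iterable[int], end_id: int, tool_id: int | None
    ) -> bool:
        # Materialize once: a generator can only be iterated a single time.
        delta_set = set(delta_ids)
        if end_id in delta_set:
            return True
        return tool_id is not None and tool_id in delta_set

    # Safe even when the caller passes a generator expression:
    assert reasoning_ended((i for i in (3, 7)), end_id=7, tool_id=None)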
""" - if self._is_identity_mode(): + if self._identity_parser is not None: return self._identity_parser.extract_reasoning_streaming( previous_text, current_text, diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py index e4deaed41..b2f3db5bb 100644 --- a/vllm/reasoning/minimax_m2_reasoning_parser.py +++ b/vllm/reasoning/minimax_m2_reasoning_parser.py @@ -2,21 +2,20 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence +from typing import TYPE_CHECKING -from vllm.entrypoints.openai.chat_completion.protocol import ( - ChatCompletionRequest, -) from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ) -from vllm.entrypoints.openai.responses.protocol import ( - ResponsesRequest, -) from vllm.logger import init_logger from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser from vllm.tokenizers import TokenizerLike +if TYPE_CHECKING: + from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest + from vllm.entrypoints.openai.responses.protocol import ResponsesRequest + logger = init_logger(__name__) @@ -114,6 +113,6 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser): return DeltaMessage(content=delta_text) def extract_reasoning( - self, model_output: str, request: ChatCompletionRequest | ResponsesRequest + self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest" ) -> tuple[str | None, str | None]: return None, "" + model_output diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index c085ba4e4..7117716b6 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -3,18 +3,17 @@ from collections.abc import Sequence from functools import cached_property +from typing import TYPE_CHECKING -from vllm.entrypoints.openai.chat_completion.protocol import ( - ChatCompletionRequest, -) -from vllm.entrypoints.openai.responses.protocol import ( - ResponsesRequest, -) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser from vllm.tokenizers.mistral import MistralTokenizer +if TYPE_CHECKING: + from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest + from vllm.entrypoints.openai.responses.protocol import ResponsesRequest + logger = init_logger(__name__) @@ -113,7 +112,7 @@ class MistralReasoningParser(BaseThinkingReasoningParser): return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :] def extract_reasoning( - self, model_output: str, request: ChatCompletionRequest | ResponsesRequest + self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest" ) -> tuple[str | None, str | None]: """ Extract reasoning content from the model output. 
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index 3808b475e..9697b5004 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -8,20 +8,15 @@
 from typing import TYPE_CHECKING
 
 import regex as re
 
-if TYPE_CHECKING:
-    from vllm.tokenizers import TokenizerLike
-
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
-from vllm.entrypoints.openai.engine.protocol import (
-    DeltaMessage,
-)
-from vllm.entrypoints.openai.responses.protocol import (
-    ResponsesRequest,
-)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+    from vllm.tokenizers import TokenizerLike
+
 logger = init_logger(__name__)
@@ -256,15 +251,15 @@ class Olmo3ReasoningParser(ReasoningParser):
     def extract_reasoning(
         self,
         model_output: str,
-        request: ChatCompletionRequest | ResponsesRequest,
+        request: "ChatCompletionRequest | ResponsesRequest",
     ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
 
         Args:
-            model_output (str): Output of the model to be parsed.
-            request (ChatCompletionRequest | ResponsesRequest): Request being
+            model_output: Output of the model to be parsed.
+            request: Request being
                 processed.
 
         Returns:
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index df7b22a91..9a54aa759 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -2,16 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
+from typing import TYPE_CHECKING
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
-from vllm.entrypoints.openai.responses.protocol import (
-    ResponsesRequest,
-)
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
-from vllm.tokenizers import TokenizerLike
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+    from vllm.tokenizers import TokenizerLike
 
 
 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
@@ -34,7 +33,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
     it is stripped before extraction (non-streaming) or skipped (streaming).
     """
 
-    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+    def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
         super().__init__(tokenizer, *args, **kwargs)
 
         chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
@@ -53,7 +52,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         return ""
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index d932ba8b6..5837f0673 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -3,17 +3,19 @@
 
 from collections.abc import Iterable, Sequence
 from itertools import islice
+from typing import TYPE_CHECKING
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 logger = init_logger(__name__)
@@ -37,12 +39,13 @@ class Step3ReasoningParser(ReasoningParser):
                 "constructor during construction."
             )
 
-        self.think_end_token_id = self.vocab.get(self.think_end_token)
-        if self.think_end_token_id is None:
+        think_end_token_id = self.vocab.get(self.think_end_token)
+        if think_end_token_id is None:
             raise RuntimeError(
                 "Step3 reasoning parser could not locate think end "
                 "token in the tokenizer!"
             )
+        self.think_end_token_id: int = think_end_token_id
 
     def extract_reasoning_streaming(
         self,
@@ -82,7 +85,7 @@ class Step3ReasoningParser(ReasoningParser):
         return DeltaMessage(reasoning=delta_text)
 
     def extract_reasoning(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
    ) -> tuple[str | None, str | None]:
         # Check if the model output contains the </think> token
         if self.think_end_token not in model_output:
@@ -94,10 +97,7 @@ class Step3ReasoningParser(ReasoningParser):
             reasoning = model_output[:end_index]
 
             # Content after </think> token
-            content = model_output[end_index + len(self.think_end_token) :]
-
-            if len(content) == 0:
-                content = None
+            content = model_output[end_index + len(self.think_end_token) :] or None
 
         return reasoning, content
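The step3_reasoning_parser.py simplification relies on `""` being falsy: `rest or None` returns `None` exactly when the remainder after the end token is empty, matching the old `if len(content) == 0` branch. A sketch of the idiom (it assumes the marker is known to be present, as in the original code path):

    def split_after(text: str, marker: str) -> tuple[str, str | None]:
        end = text.index(marker)
        # "" or None -> None; any non-empty remainder is returned as-is.
        return text[:end], text[end + len(marker):] or None

    assert split_after("abc</think>", "</think>") == ("abc", None)
    assert split_after("abc</think>xyz", "</think>") == ("abc", "xyz")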
diff --git a/vllm/reasoning/step3p5_reasoning_parser.py b/vllm/reasoning/step3p5_reasoning_parser.py
index 25e9cdb99..23a08cbe5 100644
--- a/vllm/reasoning/step3p5_reasoning_parser.py
+++ b/vllm/reasoning/step3p5_reasoning_parser.py
@@ -2,17 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Sequence
+from typing import TYPE_CHECKING
 
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
 from vllm.entrypoints.openai.engine.protocol import DeltaMessage
-from vllm.entrypoints.openai.responses.protocol import (
-    ResponsesRequest,
-)
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 from vllm.tokenizers import TokenizerLike
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+
 
 class Step3p5ReasoningParser(BaseThinkingReasoningParser):
     """
@@ -50,7 +49,7 @@ class Step3p5ReasoningParser(BaseThinkingReasoningParser):
         self, input_ids: Sequence[int], delta_ids: Iterable[int]
     ) -> bool:
         # Only examine newly generated tokens; they may contain multiple ids.
-        return self._is_reasoning_end_from_ids(delta_ids)
+        return self._is_reasoning_end_from_ids(tuple(delta_ids))
 
     def _is_reasoning_end_from_ids(self, input_ids: Sequence[int]) -> bool:
         # Scan backwards to find the last special token, or .
@@ -96,7 +95,7 @@ class Step3p5ReasoningParser(BaseThinkingReasoningParser):
     def extract_reasoning(
         self,
         model_output: str,
-        request: ChatCompletionRequest | ResponsesRequest,
+        request: "ChatCompletionRequest | ResponsesRequest",
     ) -> tuple[str | None, str | None]:
         reasoning, content = super().extract_reasoning(model_output, request)
         if reasoning is not None:
diff --git a/vllm/tokenizers/grok2.py b/vllm/tokenizers/grok2.py
index 3b984152e..61fa1107e 100644
--- a/vllm/tokenizers/grok2.py
+++ b/vllm/tokenizers/grok2.py
@@ -4,7 +4,7 @@
 
 import functools
 import json
-from collections.abc import Collection, Set
+from collections.abc import Collection, Sequence, Set
 from pathlib import Path
 from typing import Any, Literal, overload
@@ -348,7 +348,9 @@ class Grok2Tokenizer(TokenizerLike):
         tokens = self._maybe_truncate(tokens, max_length)
         return tokens
 
-    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
+    def decode(
+        self, ids: Sequence[int] | int, skip_special_tokens: bool = False
+    ) -> str:
         if isinstance(ids, int):
             ids = [ids]
         if skip_special_tokens:
@@ -371,7 +373,7 @@ class Grok2Tokenizer(TokenizerLike):
         return [self._token_to_id.get(token, self._unk_token_id) for token in tokens]
 
     def convert_ids_to_tokens(
-        self, ids: list[int], skip_special_tokens: bool = False
+        self, ids: Sequence[int], skip_special_tokens: bool = False
     ) -> list[str]:
         tokens = []
         for token_id in ids:
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index 49b4272ee..95335c983 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Sequence
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, cast, overload
@@ -434,7 +435,9 @@ class MistralTokenizer(TokenizerLike):
             return_dict=False,
         )
 
-    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
+    def decode(
+        self, ids: Sequence[int] | int, skip_special_tokens: bool = False
+    ) -> str:
         # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
         # is in, directly call self.transformers_tokenizer.decode(...).
         if isinstance(ids, int):
@@ -512,7 +515,7 @@ class MistralTokenizer(TokenizerLike):
 
     def convert_ids_to_tokens(
         self,
-        ids: list[int],
+        ids: Sequence[int],
         skip_special_tokens: bool = False,
     ) -> list[str]:
         if not skip_special_tokens:
diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py
index 6f091379e..74b32e60d 100644
--- a/vllm/tokenizers/protocol.py
+++ b/vllm/tokenizers/protocol.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Sequence
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Protocol, overload
@@ -116,12 +117,14 @@ class TokenizerLike(Protocol):
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         raise NotImplementedError
 
-    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
+    def decode(
+        self, ids: Sequence[int] | int, skip_special_tokens: bool = False
+    ) -> str:
         raise NotImplementedError
 
     def convert_ids_to_tokens(
         self,
-        ids: list[int],
+        ids: Sequence[int],
         skip_special_tokens: bool = False,
     ) -> list[str]:
         raise NotImplementedError
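The tokenizer changes all widen parameter types from `list[int]` to `Sequence[int]`, the safe direction for inputs: implementations still receive something indexable and iterable, while callers may now pass tuples, ranges, or other read-only sequences without copying into a list first. The `tuple(delta_ids)` call in step3p5_reasoning_parser.py above is the complementary caller-side fix, turning a bare `Iterable` into a concrete `Sequence`. A minimal illustration:

    from collections.abc import Sequence

    def decode_sketch(ids: Sequence[int] | int) -> str:
        # Accepting Sequence admits list, tuple, and range alike.
        if isinstance(ids, int):
            ids = [ids]
        return " ".join(str(i) for i in ids)

    assert decode_sketch([1, 2]) == decode_sketch((1, 2)) == "1 2"
    assert decode_sketch(range(1, 3)) == "1 2"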