diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 971f353a3..90e65d059 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -49,6 +49,7 @@ FILES = [
     "vllm/v1/metrics",
     "vllm/v1/pool",
     "vllm/v1/sample",
+    "vllm/v1/structured_output",
     "vllm/v1/worker",
 ]
 
@@ -64,7 +65,6 @@ SEPARATE_GROUPS = [
     # v1 related
     "vllm/v1/kv_offload",
     "vllm/v1/spec_decode",
-    "vllm/v1/structured_output",
 ]
 
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index f81267dab..bd13ecf02 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -51,7 +51,7 @@ class ReasoningParser:
         return self.model_tokenizer.get_vocab()
 
     @abstractmethod
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         """
         Check if the reasoning content ends in the input_ids.
 
@@ -68,7 +68,7 @@ class ReasoningParser:
         """
 
     def is_reasoning_end_streaming(
-        self, input_ids: list[int], delta_ids: list[int]
+        self, input_ids: Sequence[int], delta_ids: Sequence[int]
     ) -> bool:
         """
         Check if the reasoning content ends in the input_ids on a
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index a854ea79a..18bf96d78 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -65,7 +65,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
                 "think start/end tokens in the tokenizer!"
             )
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         start_token_id = self.start_token_id
         end_token_id = self.end_token_id
 
@@ -77,7 +77,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
         return False
 
     def is_reasoning_end_streaming(
-        self, input_ids: list[int], delta_ids: list[int]
+        self, input_ids: Sequence[int], delta_ids: Sequence[int]
     ) -> bool:
         end_token_id = self.end_token_id
         return end_token_id in delta_ids
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index 2db39e16c..7b31d88b7 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -41,7 +41,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
         return self._parser.is_reasoning_end(input_ids)
 
     def is_reasoning_end_streaming(
-        self, input_ids: list[int], delta_ids: list[int]
+        self, input_ids: Sequence[int], delta_ids: Sequence[int]
     ) -> bool:
         return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
 
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index 0b54020b2..186c4e5c7 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -78,7 +78,7 @@ class GptOssReasoningParser(ReasoningParser):
         self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>")
         self.reasoning_max_num_between_tokens = 20
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         end_token_ids_prefix = self.reasoning_end_token_ids_prefix
         end_token_ids_suffix = self.reasoning_end_token_ids_suffix
         assert len(end_token_ids_prefix) > 0, "reasoning_end_token_ids_prefix is empty"
diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py
index b0bda0979..bf2e6f139 100644
--- a/vllm/reasoning/holo2_reasoning_parser.py
+++ b/vllm/reasoning/holo2_reasoning_parser.py
@@ -61,7 +61,7 @@ class Holo2ReasoningParser(ReasoningParser):
         return self._parser.is_reasoning_end(input_ids)
 
     def is_reasoning_end_streaming(
-        self, input_ids: list[int], delta_ids: list[int]
+        self, input_ids: Sequence[int], delta_ids: Sequence[int]
     ) -> bool:
         return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
 
diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
index 05e4d5862..ae3b86a89 100644
--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -79,7 +79,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
         self.token_buffer = []
         self.text_buffer = ""
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         return self.current_state == "response"
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
index ad4e0c8ff..e1106362d 100644
--- a/vllm/reasoning/identity_reasoning_parser.py
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -31,12 +31,12 @@ class IdentityReasoningParser(ReasoningParser):
                 "constructor during construction."
             )
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         # Always return True, since we never treat reasoning specially
         return True
 
     def is_reasoning_end_streaming(
-        self, input_ids: list[int], delta_ids: list[int]
+        self, input_ids: Sequence[int], delta_ids: Sequence[int]
     ) -> bool:
         return True
 
diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
index 3d7942be5..d0333a76b 100644
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -88,7 +88,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
         super().__init__(tokenizer, *args, **kwargs)
         self.end_token_id = self.vocab.get("")
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         end_token_id = self.end_token_id
         return any(input_id == end_token_id for input_id in reversed(input_ids))
 
diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py
index 05dcbac5c..790f4b736 100644
--- a/vllm/reasoning/mistral_reasoning_parser.py
+++ b/vllm/reasoning/mistral_reasoning_parser.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Sequence
 from functools import cached_property
 
 from vllm.entrypoints.openai.chat_completion.protocol import (
@@ -65,7 +66,7 @@ class MistralReasoningParser(BaseThinkingReasoningParser):
 
         return SpecialTokens.end_think
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         has_eot_token = False
 
         for id in input_ids[::-1]:
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index 89db4881e..9126c4d77 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -242,7 +242,7 @@ class Olmo3ReasoningParser(ReasoningParser):
             think_start=self.think_start, think_end=self.think_end
         )
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         text = self.model_tokenizer.decode(input_ids)
         return self.think_end in text
 
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index 47131b2bc..4758246ac 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -100,11 +100,11 @@ class Step3ReasoningParser(ReasoningParser):
 
         return reasoning, content
 
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         return self.think_end_token_id in input_ids
 
     def is_reasoning_end_streaming(
-        self, input_ids: list[int], delta_ids: list[int]
+        self, input_ids: Sequence[int], delta_ids: Sequence[int]
     ) -> bool:
         end_token_id = self.think_end_token_id
         return end_token_id in delta_ids
diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py
index ea1a10f6a..988cf7c27 100644
--- a/vllm/v1/attention/backends/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
+
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
@@ -17,15 +19,15 @@ elif current_platform.is_xpu():
     from vllm._ipex_ops import ipex_ops
 
     reshape_and_cache_flash = ipex_ops.reshape_and_cache_flash
-    flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func
-    get_scheduler_metadata = ipex_ops.get_scheduler_metadata
+    flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func  # type: ignore[assignment]
+    get_scheduler_metadata = ipex_ops.get_scheduler_metadata  # type: ignore[assignment]
 elif current_platform.is_rocm():
     try:
-        from flash_attn import flash_attn_varlen_func  # noqa: F401
+        from flash_attn import flash_attn_varlen_func  # type: ignore[no-redef]
     except ImportError:
 
-        def flash_attn_varlen_func(*args, **kwargs):
+        def flash_attn_varlen_func(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-redef,misc]
             raise ImportError(
                 "ROCm platform requires upstream flash-attn "
                 "to be installed. Please install flash-attn first."
             )
diff --git a/vllm/v1/attention/backends/mla/aiter_triton_mla.py b/vllm/v1/attention/backends/mla/aiter_triton_mla.py
index b164bb7b2..5b6ecb65c 100644
--- a/vllm/v1/attention/backends/mla/aiter_triton_mla.py
+++ b/vllm/v1/attention/backends/mla/aiter_triton_mla.py
@@ -49,7 +49,7 @@ class AiterTritonMLAImpl(AiterMLAImpl):
     def _flash_attn_varlen_diff_headdims(
         self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
     ):
-        result = self.flash_attn_varlen_func(
+        result = self.flash_attn_varlen_func(  # type: ignore[call-arg]
             q,
             k,
             v,
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 46ca97cac..3abf8ad30 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -230,7 +230,7 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
     def _flash_attn_varlen_diff_headdims(
         self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
     ):
-        output = self.flash_attn_varlen_func(
+        output = self.flash_attn_varlen_func(  # type: ignore[call-arg]
             q=q,
             k=k,
             v=v,
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 4c1d38110..931739245 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -294,7 +294,7 @@ class StructuredOutputManager:
             assert request.structured_output_request is not None
             if request.structured_output_request.reasoning_ended is None:
                 request.structured_output_request.reasoning_ended = (
-                    self.reasoner.is_reasoning_end(request.prompt_token_ids)
+                    self.reasoner.is_reasoning_end(request.prompt_token_ids or [])
                 )
             return request.structured_output_request.reasoning_ended
         return True
@@ -323,8 +323,9 @@ class StructuredOutputManager:
 
             # Check if reasoning ends in *this* step
             delta_from = request.num_computed_tokens - request.num_output_placeholders
+            all_token_ids = request.all_token_ids
             if self.reasoner.is_reasoning_end_streaming(
-                request.all_token_ids, request.all_token_ids[delta_from:]
+                all_token_ids, all_token_ids[delta_from:]
             ):
                 # Reasoning just ended, so we shouldn't advance til
                 # next pass
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 727a67333..6d1e4b7f7 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -284,6 +284,9 @@ def serialize_guidance_grammar(
 def validate_guidance_grammar(
     sampling_params: SamplingParams, tokenizer: llguidance.LLTokenizer | None = None
 ) -> None:
+    # if structured output is not enabled, there is nothing to validate
+    if sampling_params.structured_outputs is None:
+        return
     tp, grm = get_structured_output_key(sampling_params.structured_outputs)
     guidance_grm = serialize_guidance_grammar(tp, grm)
     err = llguidance.LLMatcher.validate_grammar(guidance_grm, tokenizer)
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index 9dd506880..617132577 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -69,7 +69,7 @@ class XgrammarBackend(StructuredOutputBackend):
                 if idx < vocab_size:
                     encoded_vocab[idx] = token
             stop_token_ids = [self.tokenizer.eos_token_id]
-            backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
+            backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()  # type: ignore[attr-defined]
             metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
             tokenizer_info = xgr.TokenizerInfo(
                 encoded_vocab=encoded_vocab,