diff --git a/vllm/config/model.py b/vllm/config/model.py
index a730aa8ad..f080803f4 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1579,14 +1579,14 @@ class ModelConfig:
 
     @property
     def is_hybrid(self) -> bool:
+        if not self._model_info.is_hybrid:
+            return False
         # Handle granite-4.0-micro case which uses hybrid config but does not
         # actually contain any non-attention layers.
         layer_types = getattr(self.hf_config, "layer_types", None)
-        if layer_types is not None and all(
+        return layer_types is None or not all(
             layer == "attention" for layer in layer_types
-        ):
-            return False
-        return self._model_info.is_hybrid
+        )
 
     @property
     def has_noops(self) -> bool:
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index e9eaaa492..fcc4ad826 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -2005,7 +2005,6 @@ class OpenAIServingResponses(OpenAIServing):
             return event
 
         async with AsyncExitStack() as exit_stack:
-            processer = None
             if self.use_harmony:
                 # TODO: in streaming, we noticed this bug:
                 # https://github.com/vllm-project/vllm/issues/25697
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index ca2e27fa4..79ba31dc0 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -44,11 +44,8 @@ class RenderConfig:
     def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> int | None:
         """Validate and normalize `truncate_prompt_tokens` parameter."""
         truncate_prompt_tokens = self.truncate_prompt_tokens
-        if truncate_prompt_tokens is None:
-            return None
-
-        if truncate_prompt_tokens == 0:
-            return 0
+        if truncate_prompt_tokens is None or truncate_prompt_tokens == 0:
+            return truncate_prompt_tokens
 
         if truncate_prompt_tokens < 0:
             truncate_prompt_tokens = model_config.max_model_len
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 0372b06d0..43b5fa5ad 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -686,11 +686,7 @@ class InputPreprocessor:
         mm_uuids: MultiModalUUIDDict | None = None,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
-        res = self._preprocess(
-            prompt,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
-        )
+        res = self._preprocess(prompt, tokenization_kwargs, mm_uuids=mm_uuids)
 
         if self.mm_processor_cache and self.mm_cache_stats is not None:
             delta = self.mm_processor_cache.make_stats(delta=True)
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 5617e04a8..afd782870 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -171,10 +171,7 @@ class PlaceholderRange:
 
     @cached_property
     def embeds_cumsum(self) -> torch.Tensor | None:
-        if self.is_embed is None:
-            return None
-
-        return self.is_embed.cumsum(dim=0)
+        return None if self.is_embed is None else self.is_embed.cumsum(dim=0)
 
     @cached_property
     def get_num_embeds(self) -> int:
@@ -308,13 +305,7 @@ def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> boo
     Equality check between
     [`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
     """
-    for k in a:
-        if k not in b:
-            return False
-        if not nested_tensors_equal(a[k], b[k]):
-            return False
-
-    return True
+    return all(k in b and nested_tensors_equal(a[k], b[k]) for k in a)
 
 
 @dataclass
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 5f8883c16..7fbce2da8 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -487,10 +487,8 @@ class EngineCore:
         request_ids = []
         while not self.aborts_queue.empty():
             ids = self.aborts_queue.get_nowait()
-            if isinstance(ids, str):
-                # Should be a list here, but also handle string just in case.
-                ids = (ids,)
-            request_ids.extend(ids)
+            # Should be a list here, but also handle string just in case.
+            request_ids.extend((ids,) if isinstance(ids, str) else ids)
 
         # More efficient to abort all as a single batch.
         self.abort_requests(request_ids)
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index ea152446e..7cee1ead7 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -618,7 +618,7 @@ class InputProcessor:
         tokenizer = self.tokenizer
 
         if tokenizer is not None:
-            max_input_id = max(prompt_ids or [], default=0)
+            max_input_id = max(prompt_ids or (), default=0)
 
             # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
             # self.model_config.get_vocab_size() is the model’s vocab size.
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index e8717e151..13b332533 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -339,10 +339,7 @@ class RequestState:
             stop_reason=stop_reason if finished else None,
         )
 
-    def _new_pooling_output(
-        self,
-        pooling_output: torch.Tensor,
-    ) -> PoolingOutput:
+    def _new_pooling_output(self, pooling_output: torch.Tensor) -> PoolingOutput:
         return PoolingOutput(data=pooling_output)
 
 
@@ -695,9 +692,7 @@ class OutputProcessor:
         assert req_state.stats is not None
         iteration_stats.update_from_finished_request(
             finish_reason=finish_reason,
-            num_prompt_tokens=length_from_prompt_token_ids_or_embeds(
-                req_state.prompt_token_ids, req_state.prompt_embeds
-            ),
+            num_prompt_tokens=req_state.prompt_len,
             max_tokens_param=req_state.max_tokens_param,
             req_stats=req_state.stats,
             num_cached_tokens=req_state.num_cached_tokens,
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 649875fe8..7b5c28eeb 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -695,7 +695,7 @@ class WorkerProc:
         worker = None
         # tuple[Connection, Connection]
         reader, ready_writer = kwargs.pop("ready_pipe")
-        death_pipe = kwargs.pop("death_pipe", None)
+        death_pipe: Connection | None = kwargs.pop("death_pipe", None)
         shutdown_event = threading.Event()
         # Start death monitoring thread if death_pipe is provided
         if death_pipe is not None:
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index f33059b80..5aaef8eb6 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -211,8 +211,7 @@ class Request:
 
     def get_num_encoder_embeds(self, input_id: int) -> int:
         assert input_id < len(self.mm_features)
-        num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
-        return num_embeds
+        return self.mm_features[input_id].mm_position.get_num_embeds
 
     def record_event(
         self,
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index c7655fe04..7fab7050c 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import itertools
 import multiprocessing
+from collections.abc import Iterable
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING
@@ -172,7 +174,7 @@ class StructuredOutputManager:
 
     def _fill_bitmasks(
         self,
-        batch: list[tuple[StructuredOutputGrammar, int, bool]],
+        batch: Iterable[tuple[StructuredOutputGrammar, int, bool]],
     ) -> None:
         assert self._grammar_bitmask is not None
         for grammar, index, apply_bitmask in batch:
@@ -265,16 +267,16 @@
             apply_bitmask = self.should_fill_bitmask(request)
 
             state_advancements = 0
-            req_tokens = scheduled_spec_decode_tokens.get(req_id, [])
-            for i, token in enumerate(req_tokens + [None]):
+            req_tokens = scheduled_spec_decode_tokens.get(req_id, ())
+            for token in itertools.chain(req_tokens, (None,)):
                 self._fill_bitmasks(
-                    [
+                    (
                         (
                             structured_output_request.grammar,
                             cumulative_index,
                             apply_bitmask,
-                        )
-                    ]
+                        ),
+                    )
                 )
 
                 if (
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index 94ae36a1a..b921a71b3 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -28,12 +28,9 @@ class StructuredOutputRequest:
         if sampling_params is None:
            return None
         params = sampling_params.structured_outputs
-        if params:
-            if params.all_constraints_none():
-                return None
-            else:
-                return StructuredOutputRequest(params=params)
-        return None
+        if not params or params.all_constraints_none():
+            return None
+        return StructuredOutputRequest(params=params)
 
     def _check_grammar_completion(self) -> bool:
         # NOTE: We have to lazy import to gate circular imports
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 08b595845..14bbd6578 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -829,7 +829,7 @@ class InputBatch:
            presence_penalties=self.presence_penalties[:num_reqs],
            repetition_penalties=self.repetition_penalties[:num_reqs],
            output_token_ids=output_token_ids,
-            spec_token_ids=cast(list[list[int]], self.spec_token_ids),
+            spec_token_ids=self.spec_token_ids,
            no_penalties=self.no_penalties,
            allowed_token_ids_mask=allowed_token_ids_mask,
            bad_words_token_ids=self.bad_words_token_ids,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 17dfcae59..3a35e1b67 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1026,7 +1026,7 @@ class GPUModelRunner(
         each sequence, and a shifting is done during the next iteration based
         on the number of accepted tokens.
         """
-        if not self.model_config.is_hybrid or not self.speculative_config:
+        if not self.speculative_config or not self.model_config.is_hybrid:
             return
 
         # Find the number of accepted tokens for each sequence.