diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py
index 4f3dbdf29..4170de173 100644
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -20,7 +20,6 @@ def _build_input_processor(
 ) -> InputProcessor:
     model_config = ModelConfig(
         model="Qwen/Qwen2.5-VL-3B-Instruct",
-        skip_tokenizer_init=True,
         max_model_len=128,
         mm_processor_cache_gb=mm_cache_gb,
     )
diff --git a/vllm/multimodal/encoder_budget.py b/vllm/multimodal/encoder_budget.py
index 821c9e9b5..c51bb255d 100644
--- a/vllm/multimodal/encoder_budget.py
+++ b/vllm/multimodal/encoder_budget.py
@@ -62,6 +62,7 @@ class MultiModalBudget:
         processor = mm_registry.create_processor(model_config, cache=cache)
 
         self.cache = cache
+        self.processor = processor
 
         mm_config = model_config.get_multimodal_config()
         enable_mm_embeds = mm_config is not None and mm_config.enable_mm_embeds
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index 47180ee59..0e52e2d20 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -72,13 +72,15 @@ class InputProcessor:
         self.mm_registry = mm_registry
         self.mm_processor_cache = mm_registry.processor_cache_from_config(vllm_config)
 
-        self.mm_encoder_cache_size: int | None = None
-        if (
-            mm_registry.supports_multimodal_inputs(model_config)
-            and not model_config.skip_tokenizer_init
-        ):
+        self.supports_mm_inputs = mm_registry.supports_multimodal_inputs(model_config)
+        self.mm_encoder_cache_size = 0
+        self.skip_prompt_length_check = False
+        if self.supports_mm_inputs:
             mm_budget = MultiModalBudget(vllm_config, mm_registry)
             self.mm_encoder_cache_size = mm_budget.encoder_cache_size
+            self.skip_prompt_length_check = (
+                mm_budget.processor.info.skip_prompt_length_check
+            )
             mm_budget.reset_cache()  # Not used anymore
 
         self.input_preprocessor = InputPreprocessor(
@@ -670,76 +672,25 @@ class InputProcessor:
             resumable=resumable,
         )
 
-    def _validate_model_inputs(
-        self, encoder_inputs: SingletonInputs | None, decoder_inputs: SingletonInputs
-    ):
-        if encoder_inputs is not None:
-            self._validate_model_input(encoder_inputs, prompt_type="encoder")
-
-        self._validate_model_input(decoder_inputs, prompt_type="decoder")
-
-    def _validate_model_input(
+    def _validate_prompt_len(
         self,
-        prompt_inputs: SingletonInputs,
-        *,
+        prompt_len: int,
         prompt_type: Literal["encoder", "decoder"],
     ):
+        if self.skip_prompt_length_check:
+            return
+
+        if prompt_len == 0 and prompt_type == "decoder":
+            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
         model_config = self.model_config
-
-        prompt_ids = (
-            None
-            if prompt_inputs["type"] == "embeds"
-            else prompt_inputs["prompt_token_ids"]
+        max_prompt_len = (
+            model_config.max_model_len
+            if prompt_type == "decoder"
+            else self.mm_encoder_cache_size
         )
-        prompt_embeds = (
-            prompt_inputs["prompt_embeds"]
-            if prompt_inputs["type"] == "embeds"
-            else None
-        )
-        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
-        if not prompt_ids:
-            if prompt_type == "encoder" and model_config.is_multimodal_model:
-                pass  # Mllama may have empty encoder inputs for text-only data
-            elif prompt_inputs["type"] == "embeds":
-                pass  # Prompt embeds should not have prompt_ids.
-            else:
-                raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        tokenizer = self.tokenizer
-        if tokenizer is not None:
-            max_input_id = max(prompt_ids or (), default=0)
-
-            # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
-            # self.model_config.get_vocab_size() is the model’s vocab size.
-            # For Qwen3 models, the language model has extra tokens that do
-            # not exist in the tokenizer, and vice versa for multimodal
-            # placeholder tokens in some multimodal models.
-            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399  # noqa: E501
-            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421  # noqa: E501
-
-            # Here we take the max of the two to determine if a token id is
-            # truly out-of-vocabulary.
-            if max_input_id > max(
-                tokenizer.max_token_id, self.model_config.get_vocab_size() - 1
-            ):
-                raise ValueError(f"Token id {max_input_id} is out of vocabulary")
-
-        max_prompt_len = self.model_config.max_model_len
         if prompt_len > max_prompt_len:
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                model_cls = mm_registry._get_model_cls(model_config)
-                factories = model_cls._processor_factory
-                ctx = mm_registry._create_processing_ctx(
-                    model_config,
-                    tokenizer=tokenizer,
-                )
-                mm_info = factories.info(ctx)
-
-                if mm_info.skip_prompt_length_check:
-                    return
-
-            if model_config.is_multimodal_model:
+            if self.supports_mm_inputs:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
@@ -757,17 +708,7 @@ class InputProcessor:
                     f"longer than the maximum model length of {max_prompt_len}. "
                     f"{suggestion}"
                 )
-
-            # TODO: Find out how many placeholder tokens are there so we can
-            # check that chunked prefill does not truncate them
-            # max_batch_len = self.scheduler_config.max_num_batched_tokens
-
-        if (
-            prompt_len == max_prompt_len
-            and prompt_type == "decoder"
-            and not model_config.is_multimodal_model
-            and self.model_config.runner_type != "pooling"
-        ):
+        elif prompt_len == max_prompt_len and model_config.runner_type == "generate":
             suggestion = (
                 "Make sure that `max_model_len` is no smaller than the "
                 "number of text tokens (prompt + requested output tokens)."
@@ -778,11 +719,29 @@ class InputProcessor:
                 f"model length of {max_prompt_len}. {suggestion}"
             )
 
-        if (
-            prompt_type == "decoder"
-            and prompt_inputs["type"] == "multimodal"
-            and self.mm_encoder_cache_size is not None
-        ):
+    def _validate_model_input(
+        self,
+        prompt_inputs: SingletonInputs,
+        prompt_type: Literal["encoder", "decoder"],
+    ) -> None:
+        model_config = self.model_config
+        tokenizer = self.tokenizer
+
+        prompt_ids = (
+            None
+            if prompt_inputs["type"] == "embeds"
+            else prompt_inputs["prompt_token_ids"]
+        )
+        prompt_embeds = (
+            prompt_inputs["prompt_embeds"]
+            if prompt_inputs["type"] == "embeds"
+            else None
+        )
+
+        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
+        self._validate_prompt_len(prompt_len, prompt_type)
+
+        if prompt_inputs["type"] == "multimodal":
             decoder_mm_positions = prompt_inputs["mm_placeholders"]
             for modality, mm_positions in decoder_mm_positions.items():
                 for mm_position in mm_positions:
@@ -797,6 +756,33 @@ class InputProcessor:
                             f"by setting --limit-mm-per-prompt at startup."
                         )
 
+        if prompt_ids and tokenizer is not None:
+            max_input_id = max(prompt_ids, default=0)
+
+            # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
+            # self.model_config.get_vocab_size() is the model’s vocab size.
+            # For Qwen3 models, the language model has extra tokens that do
+            # not exist in the tokenizer, and vice versa for multimodal
+            # placeholder tokens in some multimodal models.
+            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399  # noqa: E501
+            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421  # noqa: E501
+
+            # Here we take the max of the two to determine if a token id is
+            # truly out-of-vocabulary.
+            model_vocab_size = model_config.get_vocab_size()
+            if max_input_id > max(tokenizer.max_token_id, model_vocab_size - 1):
+                raise ValueError(f"Token id {max_input_id} is out of vocabulary")
+
+    def _validate_model_inputs(
+        self,
+        encoder_inputs: SingletonInputs | None,
+        decoder_inputs: SingletonInputs,
+    ):
+        if encoder_inputs is not None:
+            self._validate_model_input(encoder_inputs, prompt_type="encoder")
+
+        self._validate_model_input(decoder_inputs, prompt_type="decoder")
+
     def stat_mm_cache(self) -> MultiModalCacheStats | None:
         return self.input_preprocessor.stat_mm_cache()
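
For reference, the sketch below (illustrative only, not part of this diff) mirrors the validation order that the refactored `_validate_prompt_len` follows: the length check is skipped when the model's processor sets `skip_prompt_length_check`, empty decoder prompts are rejected, and the length bound is `max_model_len` for decoder prompts versus the multimodal encoder cache size for encoder prompts. All names here are simplified stand-ins, not the vLLM API.

```python
# Illustrative sketch only; a simplified stand-in, not the vLLM implementation.
from typing import Literal


def validate_prompt_len(
    prompt_len: int,
    prompt_type: Literal["encoder", "decoder"],
    *,
    skip_prompt_length_check: bool,
    max_model_len: int,
    mm_encoder_cache_size: int,
) -> None:
    # Models whose processor opts out of the length check return early.
    if skip_prompt_length_check:
        return

    # Empty decoder prompts are always rejected.
    if prompt_len == 0 and prompt_type == "decoder":
        raise ValueError("The decoder prompt cannot be empty")

    # Decoder prompts are bounded by the context window; encoder prompts by
    # the encoder cache size derived from the multimodal budget.
    max_prompt_len = (
        max_model_len if prompt_type == "decoder" else mm_encoder_cache_size
    )
    if prompt_len > max_prompt_len:
        raise ValueError(
            f"The {prompt_type} prompt (length {prompt_len}) is longer than "
            f"the maximum allowed length of {max_prompt_len}."
        )
```

For example, `validate_prompt_len(4096, "decoder", skip_prompt_length_check=False, max_model_len=2048, mm_encoder_cache_size=0)` raises a `ValueError` analogous to the one produced by the refactored method.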