[Misc] Clean up validation logic in input processor (#34144)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-11 11:29:29 +08:00
parent 066c6da6a0
commit b5dcb372e4
3 changed files with 72 additions and 86 deletions
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -20,7 +20,6 @@ def _build_input_processor(
 ) -> InputProcessor:
    model_config = ModelConfig(
        model="Qwen/Qwen2.5-VL-3B-Instruct",
-        skip_tokenizer_init=True,
        max_model_len=128,
        mm_processor_cache_gb=mm_cache_gb,
    )
--- a/vllm/multimodal/encoder_budget.py
+++ b/vllm/multimodal/encoder_budget.py
@@ -62,6 +62,7 @@ class MultiModalBudget:
            processor = mm_registry.create_processor(model_config, cache=cache)

            self.cache = cache
+            self.processor = processor
            mm_config = model_config.get_multimodal_config()
            enable_mm_embeds = mm_config is not None and mm_config.enable_mm_embeds

--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -72,13 +72,15 @@ class InputProcessor:
        self.mm_registry = mm_registry
        self.mm_processor_cache = mm_registry.processor_cache_from_config(vllm_config)

-        self.mm_encoder_cache_size: int | None = None
-        if (
-            mm_registry.supports_multimodal_inputs(model_config)
-            and not model_config.skip_tokenizer_init
-        ):
+        self.supports_mm_inputs = mm_registry.supports_multimodal_inputs(model_config)
+        self.mm_encoder_cache_size = 0
+        self.skip_prompt_length_check = False
+        if self.supports_mm_inputs:
            mm_budget = MultiModalBudget(vllm_config, mm_registry)
            self.mm_encoder_cache_size = mm_budget.encoder_cache_size
+            self.skip_prompt_length_check = (
+                mm_budget.processor.info.skip_prompt_length_check
+            )
            mm_budget.reset_cache()  # Not used anymore

        self.input_preprocessor = InputPreprocessor(
@@ -670,76 +672,25 @@ class InputProcessor:
            resumable=resumable,
        )

-    def _validate_model_inputs(
-        self, encoder_inputs: SingletonInputs | None, decoder_inputs: SingletonInputs
-    ):
-        if encoder_inputs is not None:
-            self._validate_model_input(encoder_inputs, prompt_type="encoder")
-
-        self._validate_model_input(decoder_inputs, prompt_type="decoder")
-
-    def _validate_model_input(
+    def _validate_prompt_len(
        self,
-        prompt_inputs: SingletonInputs,
-        *,
+        prompt_len: int,
        prompt_type: Literal["encoder", "decoder"],
    ):
+        if self.skip_prompt_length_check:
+            return
+
+        if prompt_len == 0 and prompt_type == "decoder":
+            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
        model_config = self.model_config
-
-        prompt_ids = (
-            None
-            if prompt_inputs["type"] == "embeds"
-            else prompt_inputs["prompt_token_ids"]
+        max_prompt_len = (
+            model_config.max_model_len
+            if prompt_type == "decoder"
+            else self.mm_encoder_cache_size
        )
-        prompt_embeds = (
-            prompt_inputs["prompt_embeds"]
-            if prompt_inputs["type"] == "embeds"
-            else None
-        )
-        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
-        if not prompt_ids:
-            if prompt_type == "encoder" and model_config.is_multimodal_model:
-                pass  # Mllama may have empty encoder inputs for text-only data
-            elif prompt_inputs["type"] == "embeds":
-                pass  # Prompt embeds should not have prompt_ids.
-            else:
-                raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        tokenizer = self.tokenizer
-        if tokenizer is not None:
-            max_input_id = max(prompt_ids or (), default=0)
-
-            # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
-            # self.model_config.get_vocab_size() is the model’s vocab size.
-            # For Qwen3 models, the language model has extra tokens that do
-            # not exist in the tokenizer, and vice versa for multimodal
-            # placeholder tokens in some multimodal models.
-            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
-            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501
-
-            # Here we take the max of the two to determine if a token id is
-            # truly out-of-vocabulary.
-            if max_input_id > max(
-                tokenizer.max_token_id, self.model_config.get_vocab_size() - 1
-            ):
-                raise ValueError(f"Token id {max_input_id} is out of vocabulary")
-
-        max_prompt_len = self.model_config.max_model_len
        if prompt_len > max_prompt_len:
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                model_cls = mm_registry._get_model_cls(model_config)
-                factories = model_cls._processor_factory
-                ctx = mm_registry._create_processing_ctx(
-                    model_config,
-                    tokenizer=tokenizer,
-                )
-                mm_info = factories.info(ctx)
-
-                if mm_info.skip_prompt_length_check:
-                    return
-
-            if model_config.is_multimodal_model:
+            if self.supports_mm_inputs:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
@@ -757,17 +708,7 @@ class InputProcessor:
                f"longer than the maximum model length of {max_prompt_len}. "
                f"{suggestion}"
            )
-
-            # TODO: Find out how many placeholder tokens are there so we can
-            # check that chunked prefill does not truncate them
-            # max_batch_len = self.scheduler_config.max_num_batched_tokens
-
-        if (
-            prompt_len == max_prompt_len
-            and prompt_type == "decoder"
-            and not model_config.is_multimodal_model
-            and self.model_config.runner_type != "pooling"
-        ):
+        elif prompt_len == max_prompt_len and model_config.runner_type == "generate":
            suggestion = (
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens (prompt + requested output tokens)."
@@ -778,11 +719,29 @@ class InputProcessor:
                f"model length of {max_prompt_len}. {suggestion}"
            )

-        if (
-            prompt_type == "decoder"
-            and prompt_inputs["type"] == "multimodal"
-            and self.mm_encoder_cache_size is not None
-        ):
+    def _validate_model_input(
+        self,
+        prompt_inputs: SingletonInputs,
+        prompt_type: Literal["encoder", "decoder"],
+    ) -> None:
+        model_config = self.model_config
+        tokenizer = self.tokenizer
+
+        prompt_ids = (
+            None
+            if prompt_inputs["type"] == "embeds"
+            else prompt_inputs["prompt_token_ids"]
+        )
+        prompt_embeds = (
+            prompt_inputs["prompt_embeds"]
+            if prompt_inputs["type"] == "embeds"
+            else None
+        )
+
+        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
+        self._validate_prompt_len(prompt_len, prompt_type)
+
+        if prompt_inputs["type"] == "multimodal":
            decoder_mm_positions = prompt_inputs["mm_placeholders"]
            for modality, mm_positions in decoder_mm_positions.items():
                for mm_position in mm_positions:
@@ -797,6 +756,33 @@ class InputProcessor:
                            f"by setting --limit-mm-per-prompt at startup."
                        )

+        if prompt_ids and tokenizer is not None:
+            max_input_id = max(prompt_ids, default=0)
+
+            # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
+            # self.model_config.get_vocab_size() is the model’s vocab size.
+            # For Qwen3 models, the language model has extra tokens that do
+            # not exist in the tokenizer, and vice versa for multimodal
+            # placeholder tokens in some multimodal models.
+            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
+            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501
+
+            # Here we take the max of the two to determine if a token id is
+            # truly out-of-vocabulary.
+            model_vocab_size = model_config.get_vocab_size()
+            if max_input_id > max(tokenizer.max_token_id, model_vocab_size - 1):
+                raise ValueError(f"Token id {max_input_id} is out of vocabulary")
+
+    def _validate_model_inputs(
+        self,
+        encoder_inputs: SingletonInputs | None,
+        decoder_inputs: SingletonInputs,
+    ):
+        if encoder_inputs is not None:
+            self._validate_model_input(encoder_inputs, prompt_type="encoder")
+
+        self._validate_model_input(decoder_inputs, prompt_type="decoder")
+
    def stat_mm_cache(self) -> MultiModalCacheStats | None:
        return self.input_preprocessor.stat_mm_cache()