[Renderer] Move InputPreprocessor into Renderer (1/2) (#34510)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-15 02:14:21 +08:00
parent b3c14229b0
commit 73391a1baa
39 changed files with 456 additions and 458 deletions
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -194,14 +194,23 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
        *,
        mm_uuids: MultiModalUUIDDict | None = None,
    ) -> MultiModalInputs:
-        if prompt and mm_items:
-            raise ValueError(
-                "Siglip accepts text-only or image-only inputs, not both! "
-                "Image-only inputs means passing an image with an empty text "
-                "prompt."
-            )
-
        if mm_items:
+            if isinstance(prompt, str):
+                if len(prompt) > 0:
+                    raise ValueError(
+                        "SigLIP accepts text-only or image-only inputs, not both! "
+                        "You must pass an image with an empty text prompt."
+                    )
+            else:
+                special_tokens = self.info.get_tokenizer().all_special_ids
+                if all(tok in special_tokens for tok in prompt):
+                    prompt = []
+                else:
+                    raise ValueError(
+                        "SigLIP accepts text-only or image-only inputs, not both! "
+                        "You must pass an image with an empty token prompt."
+                    )
+
            # For multi-modal data, the prompt after processing should
            # only contain the image token
            tokenization_kwargs = {