[Multimodal][Core] Optimize multimodal preprocessing cache by hashing image bytes instead of pixel values (#29621)

Signed-off-by: Rahul Steiger <rasteiger@ethz.ch> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-12-02 14:49:02 +01:00
parent 68ffbca7e4
commit 60c3d413af
8 changed files with 95 additions and 19 deletions
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -8,6 +8,7 @@ import pytest
 import pytest_asyncio
 from transformers import AutoProcessor

+from vllm.multimodal.base import MediaWithBytes
 from vllm.multimodal.utils import encode_image_base64, fetch_image

 from ...utils import RemoteOpenAIServer
@@ -111,7 +112,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
            "content": f"{placeholder}{content}",
        }
    ]
-    images = [fetch_image(image_url)]
+    image = fetch_image(image_url)
+    # Unwrap MediaWithBytes if present
+    if isinstance(image, MediaWithBytes):
+        image = image.media
+    images = [image]

    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True