[Multimodal][Core] Optimize multimodal preprocessing cache by hashing image bytes instead of pixel values (#29621)

Signed-off-by: Rahul Steiger <rasteiger@ethz.ch>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
ImaGoodFella
2025-12-02 14:49:02 +01:00
committed by GitHub
parent 68ffbca7e4
commit 60c3d413af
8 changed files with 95 additions and 19 deletions

View File

@@ -9,6 +9,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
@@ -62,7 +63,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder = "<|image_1|> "
prompt = f"{placeholder}{content}"
images = [fetch_image(image_url)]
image = fetch_image(image_url)
# Unwrap MediaWithBytes if present
if isinstance(image, MediaWithBytes):
image = image.media
images = [image]
inputs = processor(prompt, images, return_tensors="pt")
return inputs.input_ids.shape[1]