[Multimodal][Core] Optimize multimodal preprocessing cache by hashing image bytes instead of pixel values (#29621)
Signed-off-by: Rahul Steiger <rasteiger@ethz.ch> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -9,6 +9,7 @@ from transformers import AutoProcessor
|
||||
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
|
||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||
@@ -62,7 +63,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
|
||||
placeholder = "<|image_1|> "
|
||||
prompt = f"{placeholder}{content}"
|
||||
images = [fetch_image(image_url)]
|
||||
image = fetch_image(image_url)
|
||||
# Unwrap MediaWithBytes if present
|
||||
if isinstance(image, MediaWithBytes):
|
||||
image = image.media
|
||||
images = [image]
|
||||
inputs = processor(prompt, images, return_tensors="pt")
|
||||
return inputs.input_ids.shape[1]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user