[Multimodal][Core] Optimize multimodal preprocessing cache by hashing image bytes instead of pixel values (#29621)

Signed-off-by: Rahul Steiger <rasteiger@ethz.ch>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
ImaGoodFella
2025-12-02 14:49:02 +01:00
committed by GitHub
parent 68ffbca7e4
commit 60c3d413af
8 changed files with 95 additions and 19 deletions

View File

@@ -59,6 +59,7 @@ from vllm.distributed import (
)
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
@@ -1389,7 +1390,11 @@ class LocalAssetServer:
return f"{self.base_url}/{name}"
def get_image_asset(self, name: str) -> Image.Image:
return fetch_image(self.url_for(name))
image = fetch_image(self.url_for(name))
# Unwrap MediaWithBytes if present
if isinstance(image, MediaWithBytes):
image = image.media
return image
@pytest.fixture(scope="session")