[Multimodal] Generate mm_hash based on request metadata when caching is turned off (#23690)

Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-27 13:24:31 -07:00
parent 0585a9e73c
commit 8bf6266a17
12 changed files with 179 additions and 24 deletions
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -290,6 +290,7 @@ class DeepseekVL2MultiModalProcessor(
        mm_data_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
    ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
        # The processor logic is different for len(images) <= 2 vs > 2
        # Since the processing cache assumes that the processor output is
@@ -301,6 +302,7 @@ class DeepseekVL2MultiModalProcessor(
                mm_data_items=mm_data_items,
                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                tokenization_kwargs=tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
            )

        return super()._cached_apply_hf_processor(
@@ -308,6 +310,7 @@ class DeepseekVL2MultiModalProcessor(
            mm_data_items=mm_data_items,
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
            tokenization_kwargs=tokenization_kwargs,
+            mm_hash_overrides=mm_hash_overrides,
        )