[Core] Store only the keys for multi-modal data in P0 (#22198)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-08-07 16:45:04 +08:00
parent 289b18e670
commit 766bc8162c
17 changed files with 325 additions and 234 deletions
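
For context: the P0-side `MirroredProcessingCache` previously mirrored the fully processed multi-modal inputs, while the new `MultiModalInputCacheClient` tracks only their content hashes so that already-sent payloads can be elided. A minimal sketch of that key-only pattern follows, assuming an LRU of hashes; the names (`KeyOnlyCache`, `MAX_KEYS`) are hypothetical stand-ins, not vLLM's actual implementation.

# Minimal sketch of a key-only client cache, assuming an LRU of hashes.
# KeyOnlyCache and MAX_KEYS are hypothetical, not vLLM's API.
from collections import OrderedDict
from typing import Any, Optional

MAX_KEYS = 256

class KeyOnlyCache:
    """P0 side: remember which hashes were already sent to the engine core."""

    def __init__(self) -> None:
        self._keys: OrderedDict[str, None] = OrderedDict()

    def get_and_update(
        self,
        inputs: list[Any],
        hashes: list[str],
    ) -> list[Optional[Any]]:
        out: list[Optional[Any]] = []
        for item, key in zip(inputs, hashes):
            if key in self._keys:
                # Already sent once: transmit only the key (None placeholder).
                self._keys.move_to_end(key)
                out.append(None)
            else:
                self._keys[key] = None
                if len(self._keys) > MAX_KEYS:
                    self._keys.popitem(last=False)  # evict the oldest key
                out.append(item)  # first time: send the full payload
        return out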


@@ -19,7 +19,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_cache import MirroredProcessingCache
+from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient
 from vllm.v1.structured_output.backend_guidance import (
     validate_guidance_grammar)
 from vllm.v1.structured_output.backend_outlines import (
@@ -50,11 +50,8 @@ class Processor:
             self.tokenizer,
             mm_registry)
-        self.mm_input_cache_client = MirroredProcessingCache(self.model_config)
-
-        # Multi-modal hasher (for images)
-        self.use_hash = self.mm_input_cache_client.use_cache or \
-            self.cache_config.enable_prefix_caching
+        self.mm_input_cache_client = MultiModalInputCacheClient(
+            self.model_config)
 
     @property
     def mm_registry(self):
@@ -256,11 +253,13 @@ class Processor:
         # 1. Tokenize text prompt, with LoRA request if one exists.
         # 2. For multimodal models with a merged preprocessor, preprocess
         #    multimodal data and expand prompt token ids accordingly.
+        return_mm_hashes = (self.model_config.processor_return_mm_hashes
+                            or bool(self.cache_config.enable_prefix_caching))
         processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            return_mm_hashes=self.use_hash,
+            return_mm_hashes=return_mm_hashes,
         )
 
         from vllm.platforms import current_platform
         current_platform.validate_request(
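
With `self.use_hash` removed from the constructor, the decision to request hashes is now made per request. A hedged restatement of that flag, using only the config attributes visible in the hunk above (`needs_mm_hashes` is a hypothetical helper):

# Hashes are needed when the processor is configured to return them,
# or when prefix caching will hash blocks containing multi-modal items.
def needs_mm_hashes(model_config, cache_config) -> bool:
    return (model_config.processor_return_mm_hashes
            or bool(cache_config.enable_prefix_caching))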
@@ -312,7 +311,7 @@ class Processor:
             sorted_mm_hashes,
         ) = merge_and_sort_multimodal_metadata(
             decoder_inputs["mm_placeholders"],
-            decoder_inputs["mm_hashes"] if self.use_hash else None,
+            decoder_inputs["mm_hashes"] if return_mm_hashes else None,
         )
 
         # The output of merged multi-modal processor (`decoder_mm_inputs`)
@@ -339,7 +338,7 @@ class Processor:
         ]
 
         if sorted_mm_hashes is not None:
-            sorted_mm_inputs = self.mm_input_cache_client.get_and_update_p0(
+            sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
                 orig_sorted_mm_inputs, sorted_mm_hashes)
         else:
             sorted_mm_inputs = orig_sorted_mm_inputs
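
A usage sketch for the `KeyOnlyCache` stand-in from above, mirroring the renamed `get_and_update` call: on a repeat request the full payload is replaced by a placeholder and only the key travels to the engine core. All values here are hypothetical:

cache = KeyOnlyCache()
inputs = [{"pixel_values": "<tensor>"}]  # stand-in for a processed image input
hashes = ["sha256:abc123"]               # stand-in for its content hash

first = cache.get_and_update(inputs, hashes)   # miss -> [full payload] is sent
second = cache.get_and_update(inputs, hashes)  # hit  -> [None]; key-only resend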