[Core] Use key-only cache for BaseMultiModalProcessor (#23018)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-08-27 14:19:13 +08:00
committed by GitHub
parent 8dbf6ed7be
commit 69244e67e6
29 changed files with 954 additions and 394 deletions

View File

@@ -22,6 +22,7 @@ from vllm.logger import init_logger
from vllm.logging_utils.dump_input import dump_engine_exception
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import receiver_cache_from_config
from vllm.tasks import POOLING_TASKS, SupportedTask
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
@@ -38,7 +39,6 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
EngineCoreRequestType,
ReconfigureDistributedRequest, ReconfigureRankType,
UtilityOutput, UtilityResult)
from vllm.v1.engine.mm_input_cache import MultiModalInputCacheServer
from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import KVCacheConfig
@@ -128,8 +128,9 @@ class EngineCore:
)
self.use_spec_decode = vllm_config.speculative_config is not None
self.mm_input_cache_server = MultiModalInputCacheServer(
vllm_config.model_config, MULTIMODAL_REGISTRY)
self.mm_registry = mm_registry = MULTIMODAL_REGISTRY
self.mm_receiver_cache = receiver_cache_from_config(
vllm_config, mm_registry)
# Setup batch queue for pipeline parallelism.
# Batch queue for scheduled batches. This enables us to asynchronously
@@ -370,7 +371,8 @@ class EngineCore:
logger.warning("Resetting the multi-modal cache when requests are "
"in progress may lead to desynced internal caches.")
self.mm_input_cache_server.reset()
if self.mm_receiver_cache is not None:
self.mm_receiver_cache.clear_cache()
def reset_prefix_cache(self):
self.scheduler.reset_prefix_cache()
@@ -435,10 +437,11 @@ class EngineCore:
assert request.mm_kwargs is not None
# Note on thread safety: no race condition.
# `mm_input_cache_server` is reset at the end of LLMEngine init,
# `mm_receiver_cache` is reset at the end of LLMEngine init,
# and will only be accessed in the input processing thread afterwards.
request.mm_kwargs = self.mm_input_cache_server.get_and_update(
request.mm_kwargs, request.mm_hashes)
if self.mm_receiver_cache is not None:
request.mm_kwargs = self.mm_receiver_cache.get_and_update(
request.mm_kwargs, request.mm_hashes)
req = Request.from_engine_core_request(request,
self.request_block_hasher)