[Bugfix] Avoid transferring cached multi-modal items from P0 to P1 (#16273)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-04-09 15:51:27 +08:00
committed by GitHub
parent 24f6b9a713
commit e484e02857
5 changed files with 65 additions and 22 deletions

View File

@@ -31,7 +31,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
EngineCoreRequestType, UtilityOutput)
-from vllm.v1.engine.mm_input_cache import MMInputCacheServer
+from vllm.v1.engine.mm_input_cache import MirroredProcessingCache
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.outputs import ModelRunnerOutput
@@ -105,7 +105,7 @@ class EngineCore:
)
# Setup MM Input Mapper.
-        self.mm_input_cache_server = MMInputCacheServer(
+        self.mm_input_cache_server = MirroredProcessingCache(
vllm_config.model_config)
# Setup batch queue for pipeline parallelism.
@@ -173,7 +173,7 @@ class EngineCore:
# anything that has a hash must have a HIT cache entry here
# as well.
assert request.mm_inputs is not None
-            request.mm_inputs = self.mm_input_cache_server.get_and_update(
+            request.mm_inputs = self.mm_input_cache_server.get_and_update_p1(
request.mm_inputs, request.mm_hashes)
req = Request.from_engine_core_request(request)