[Core] Shared memory based object store for Multimodal data caching and IPC (#20452)

Signed-off-by: donglu <donglu@cohere.com>
2025-09-12 10:54:17 -04:00
parent 9f04d9d55f
commit a5b84f1cbf
17 changed files with 1487 additions and 27 deletions
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -14,6 +14,7 @@ from enum import Enum, auto
 from functools import partial
 from multiprocessing.connection import Connection
 from multiprocessing.process import BaseProcess
+from multiprocessing.synchronize import Lock as LockType
 from threading import Thread
 from typing import Any, Callable, Optional, Union, cast

@@ -31,10 +32,13 @@ from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
 from vllm.executor.multiproc_worker_utils import (
    set_multiprocessing_worker_envs)
 from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.cache import worker_receiver_cache_from_config
 from vllm.utils import (decorate_logs, get_distributed_init_method,
                        get_loopback_ip, get_mp_context, get_open_port,
                        set_process_title)
 from vllm.v1.executor.abstract import Executor, FailureCallback
+from vllm.v1.executor.utils import get_and_update_mm_cache
 from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
                             ModelRunnerOutput)
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -81,6 +85,8 @@ class MultiprocExecutor(Executor):
        scheduler_output_handle = self.rpc_broadcast_mq.export_handle()

        # Create workers
+        context = get_mp_context()
+        shared_worker_lock = context.Lock()
        unready_workers: list[UnreadyWorkerProcHandle] = []
        success = False
        try:
@@ -92,6 +98,7 @@ class MultiprocExecutor(Executor):
                        rank=rank,
                        distributed_init_method=distributed_init_method,
                        input_shm_handle=scheduler_output_handle,
+                        shared_worker_lock=shared_worker_lock,
                    ))

            # Workers must be created before wait_for_ready to avoid
@@ -380,6 +387,7 @@ class WorkerProc:
        rank: int,
        distributed_init_method: str,
        input_shm_handle: Handle,
+        shared_worker_lock: LockType,
    ):
        self.rank = rank
        wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank)
@@ -416,6 +424,10 @@ class WorkerProc:
                name="WorkerAsyncOutputCopy")
            self.async_output_copy_thread.start()

+        # Initialize multimodal receiver cache if needed
+        self.mm_receiver_cache = worker_receiver_cache_from_config(
+            vllm_config, MULTIMODAL_REGISTRY, shared_worker_lock)
+
        # Initialize device
        self.worker.init_device()

@@ -428,11 +440,12 @@ class WorkerProc:

    @staticmethod
    def make_worker_process(
-            vllm_config: VllmConfig,
-            local_rank: int,
-            rank: int,
-            distributed_init_method: str,
-            input_shm_handle,  # Receive SchedulerOutput
+        vllm_config: VllmConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        input_shm_handle,  # Receive SchedulerOutput
+        shared_worker_lock: LockType,
    ) -> UnreadyWorkerProcHandle:
        context = get_mp_context()
        # (reader, writer)
@@ -449,6 +462,7 @@ class WorkerProc:
            "input_shm_handle": input_shm_handle,
            "ready_pipe": (reader, writer),
            "death_pipe": death_reader,
+            "shared_worker_lock": shared_worker_lock,
        }
        # Run EngineCore busy loop in background process.
        proc = context.Process(target=WorkerProc.worker_main,
@@ -646,6 +660,10 @@ class WorkerProc:
                    func = getattr(self.worker, method)
                elif isinstance(method, bytes):
                    func = partial(cloudpickle.loads(method), self.worker)
+                # execute_model: get/update args from mm receiver cache, if any
+                if self.mm_receiver_cache is not None \
+                    and func.__name__ == "execute_model":
+                    get_and_update_mm_cache(self.mm_receiver_cache, args)
                output = func(*args, **kwargs)
            except Exception as e:
                # Notes have been introduced in python 3.11