Add tensor IPC transfer mechanism for multimodal data (#32104)

Signed-off-by: Brandon Pelfrey <bpelfrey@nvidia.com>
Signed-off-by: Brandon Pelfrey <brandonpelfrey@gmail.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-03-21 13:10:20 -07:00
parent 61e381dcf0
commit 80b70884eb
13 changed files with 1430 additions and 25 deletions
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -10,6 +10,7 @@ from dataclasses import dataclass
 from enum import Enum, auto
 from multiprocessing import Process, connection
 from multiprocessing.process import BaseProcess
+from multiprocessing.queues import Queue
 from typing import TYPE_CHECKING
 from unittest.mock import patch

@@ -95,6 +96,7 @@ class CoreEngineProcManager:
        executor_class: type[Executor],
        log_stats: bool,
        client_handshake_address: str | None = None,
+        tensor_queue: Queue | None = None,
    ):
        context = get_mp_context()
        common_kwargs = {
@@ -103,6 +105,7 @@ class CoreEngineProcManager:
            "handshake_address": handshake_address,
            "executor_class": executor_class,
            "log_stats": log_stats,
+            "tensor_queue": tensor_queue,
        }

        if client_handshake_address:
@@ -864,6 +867,7 @@ def launch_core_engines(
        CoreEngineProcManager | CoreEngineActorManager | None,
        DPCoordinator | None,
        EngineZmqAddresses,
+        Queue | None,
    ]
 ]:
    """Launch engine and DP coordinator processes as needed."""
@@ -878,6 +882,14 @@ def launch_core_engines(

    offline_mode = local_start_index is not None

+    # Create a single tensor IPC queue for sharing multimodal tensors between
+    # API servers and engine core. Only one queue is created (and yielded to
+    # the caller) because we only support DP=1 for this data flow. The queue
+    # stays None unless mm_tensor_ipc is "torch_shm".
+    tensor_queue: Queue | None = None
+    multimodal_config = vllm_config.model_config.multimodal_config
+    if multimodal_config is not None and multimodal_config.mm_tensor_ipc == "torch_shm":
+        tensor_queue = get_mp_context().Queue()
+
    # Run the DP Coordinator process with rank 0 when in online DP mode.
    # The coordinator is needed for:
    # 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing
@@ -913,7 +925,7 @@ def launch_core_engines(
            log_stats=log_stats,
        )

-        yield engine_actor_manager, coordinator, addresses
+        yield engine_actor_manager, coordinator, addresses, tensor_queue
        return

    if offline_mode:
@@ -975,11 +987,12 @@ def launch_core_engines(
                local_engine_count=local_engine_count,
                start_index=dp_rank,
                local_start_index=local_start_index or 0,
+                tensor_queue=tensor_queue,
            )
        else:
            local_engine_manager = None

-        yield local_engine_manager, coordinator, addresses
+        yield local_engine_manager, coordinator, addresses, tensor_queue

        # Now wait for engines to start.
        wait_for_engine_startup(