Add tensor IPC transfer mechanism for multimodal data (#32104)

Signed-off-by: Brandon Pelfrey <bpelfrey@nvidia.com>
Signed-off-by: Brandon Pelfrey <brandonpelfrey@gmail.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
Brandon Pelfrey
2026-03-21 13:10:20 -07:00
committed by GitHub
parent 61e381dcf0
commit 80b70884eb
13 changed files with 1430 additions and 25 deletions

View File

@@ -10,6 +10,7 @@ from dataclasses import dataclass
from enum import Enum, auto
from multiprocessing import Process, connection
from multiprocessing.process import BaseProcess
from multiprocessing.queues import Queue
from typing import TYPE_CHECKING
from unittest.mock import patch
@@ -95,6 +96,7 @@ class CoreEngineProcManager:
executor_class: type[Executor],
log_stats: bool,
client_handshake_address: str | None = None,
tensor_queue: Queue | None = None,
):
context = get_mp_context()
common_kwargs = {
@@ -103,6 +105,7 @@ class CoreEngineProcManager:
"handshake_address": handshake_address,
"executor_class": executor_class,
"log_stats": log_stats,
"tensor_queue": tensor_queue,
}
if client_handshake_address:
@@ -864,6 +867,7 @@ def launch_core_engines(
CoreEngineProcManager | CoreEngineActorManager | None,
DPCoordinator | None,
EngineZmqAddresses,
Queue | None,
]
]:
"""Launch engine and DP coordinator processes as needed."""
@@ -878,6 +882,14 @@ def launch_core_engines(
offline_mode = local_start_index is not None
# Create a single tensor IPC queue for sharing multimodal tensors between
# the API servers and the engine core. Only one queue is created because
# this data flow supports DP=1 only.
tensor_queue: Queue | None = None
multimodal_config = vllm_config.model_config.multimodal_config
if multimodal_config is not None and multimodal_config.mm_tensor_ipc == "torch_shm":
tensor_queue = get_mp_context().Queue()
# Run the DP Coordinator process with rank 0 when in online DP mode.
# The coordinator is needed for:
# 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing
@@ -913,7 +925,7 @@ def launch_core_engines(
log_stats=log_stats,
)
yield engine_actor_manager, coordinator, addresses
yield engine_actor_manager, coordinator, addresses, tensor_queue
return
if offline_mode:
@@ -975,11 +987,12 @@ def launch_core_engines(
local_engine_count=local_engine_count,
start_index=dp_rank,
local_start_index=local_start_index or 0,
tensor_queue=tensor_queue,
)
else:
local_engine_manager = None
yield local_engine_manager, coordinator, addresses
yield local_engine_manager, coordinator, addresses, tensor_queue
# Now wait for engines to start.
wait_for_engine_startup(