Add tensor IPC transfer mechanism for multimodal data (#32104)
Signed-off-by: Brandon Pelfrey <bpelfrey@nvidia.com> Signed-off-by: Brandon Pelfrey <brandonpelfrey@gmail.com> Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -10,6 +10,7 @@ from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from multiprocessing import Process, connection
|
||||
from multiprocessing.process import BaseProcess
|
||||
from multiprocessing.queues import Queue
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import patch
|
||||
|
||||
@@ -95,6 +96,7 @@ class CoreEngineProcManager:
|
||||
executor_class: type[Executor],
|
||||
log_stats: bool,
|
||||
client_handshake_address: str | None = None,
|
||||
tensor_queue: Queue | None = None,
|
||||
):
|
||||
context = get_mp_context()
|
||||
common_kwargs = {
|
||||
@@ -103,6 +105,7 @@ class CoreEngineProcManager:
|
||||
"handshake_address": handshake_address,
|
||||
"executor_class": executor_class,
|
||||
"log_stats": log_stats,
|
||||
"tensor_queue": tensor_queue,
|
||||
}
|
||||
|
||||
if client_handshake_address:
|
||||
@@ -864,6 +867,7 @@ def launch_core_engines(
|
||||
CoreEngineProcManager | CoreEngineActorManager | None,
|
||||
DPCoordinator | None,
|
||||
EngineZmqAddresses,
|
||||
Queue | None,
|
||||
]
|
||||
]:
|
||||
"""Launch engine and DP coordinator processes as needed."""
|
||||
@@ -878,6 +882,14 @@ def launch_core_engines(
|
||||
|
||||
offline_mode = local_start_index is not None
|
||||
|
||||
# Create a single tensor IPC queue for sharing multimodal tensors between
|
||||
# API servers and engine core. Returns a single queue since we only support
|
||||
# DP=1 for this data flow.
|
||||
tensor_queue: Queue | None = None
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
if multimodal_config is not None and multimodal_config.mm_tensor_ipc == "torch_shm":
|
||||
tensor_queue = get_mp_context().Queue()
|
||||
|
||||
# Run the DP Coordinator process with rank 0 when in online DP mode.
|
||||
# The coordinator is needed for:
|
||||
# 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing
|
||||
@@ -913,7 +925,7 @@ def launch_core_engines(
|
||||
log_stats=log_stats,
|
||||
)
|
||||
|
||||
yield engine_actor_manager, coordinator, addresses
|
||||
yield engine_actor_manager, coordinator, addresses, tensor_queue
|
||||
return
|
||||
|
||||
if offline_mode:
|
||||
@@ -975,11 +987,12 @@ def launch_core_engines(
|
||||
local_engine_count=local_engine_count,
|
||||
start_index=dp_rank,
|
||||
local_start_index=local_start_index or 0,
|
||||
tensor_queue=tensor_queue,
|
||||
)
|
||||
else:
|
||||
local_engine_manager = None
|
||||
|
||||
yield local_engine_manager, coordinator, addresses
|
||||
yield local_engine_manager, coordinator, addresses, tensor_queue
|
||||
|
||||
# Now wait for engines to start.
|
||||
wait_for_engine_startup(
|
||||
|
||||
Reference in New Issue
Block a user