[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith
2024-12-10 01:28:14 -05:00
committed by GitHub
parent bc192a2b09
commit 28b3a1c7e5
21 changed files with 732 additions and 145 deletions

View File

@@ -1,12 +1,12 @@
import multiprocessing
import pickle
import queue
import signal
import threading
import time
from contextlib import contextmanager
from multiprocessing.process import BaseProcess
from multiprocessing.sharedctypes import Synchronized
from typing import Any, Iterator, List, Tuple, Type, Union
from typing import List, Tuple, Type, Union
import zmq
import zmq.asyncio
@@ -20,9 +20,10 @@ from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
EngineCoreProfile, EngineCoreRequest,
EngineCoreRequestType)
from vllm.v1.engine.mm_input_mapper import MMInputMapper
from vllm.v1.executor.gpu_executor import GPUExecutor
from vllm.v1.executor.abstract import Executor
from vllm.v1.request import Request, RequestStatus
from vllm.v1.serial_utils import PickleEncoder
from vllm.v1.utils import make_zmq_socket
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
@@ -38,7 +39,7 @@ class EngineCore:
def __init__(
self,
vllm_config: VllmConfig,
executor_class: Type[GPUExecutor],
executor_class: Type[Executor],
usage_context: UsageContext,
):
assert vllm_config.model_config.task != "embedding"
@@ -80,7 +81,7 @@ class EngineCore:
num_gpu_blocks = num_gpu_blocks_override
num_cpu_blocks = 0
self.model_executor.initialize_cache(num_gpu_blocks)
self.model_executor.initialize(num_gpu_blocks)
elapsed = time.time() - start
logger.info(("init engine (profile, create kv cache, "
"warmup model) took %.2f seconds"), elapsed)
@@ -112,8 +113,11 @@ class EngineCore:
scheduler_output, output)
return engine_core_outputs
def shutdown(self):
self.model_executor.shutdown()
def profile(self, is_start=True):
self.model_executor.worker.profile(is_start)
self.model_executor.profile(is_start)
class EngineCoreProc(EngineCore):
@@ -124,7 +128,7 @@ class EngineCoreProc(EngineCore):
def __init__(
self,
vllm_config: VllmConfig,
executor_class: Type[GPUExecutor],
executor_class: Type[Executor],
usage_context: UsageContext,
input_path: str,
output_path: str,
@@ -151,32 +155,9 @@ class EngineCoreProc(EngineCore):
daemon=True).start()
# Send Readiness signal to EngineClient.
with self.make_socket(ready_path, zmq.constants.PUSH) as ready_socket:
with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket:
ready_socket.send_string(EngineCoreProc.READY_STR)
@contextmanager
def make_socket(self, path: str, type: Any) -> Iterator[zmq.Socket]:
"""Context manager for use """
ctx = zmq.Context()
try:
socket = ctx.socket(type)
if type == zmq.constants.PULL:
socket.connect(path)
elif type == zmq.constants.PUSH:
socket.bind(path)
else:
raise ValueError(f"Unknown Socket Type: {type}")
yield socket
except KeyboardInterrupt:
logger.debug("EngineCore had Keyboard Interrupt.")
finally:
ctx.destroy(linger=0)
@staticmethod
def wait_for_startup(
proc: BaseProcess,
@@ -209,7 +190,7 @@ class EngineCoreProc(EngineCore):
@staticmethod
def make_engine_core_process(
vllm_config: VllmConfig,
executor_class: Type[GPUExecutor],
executor_class: Type[Executor],
usage_context: UsageContext,
input_path: str,
output_path: str,
@@ -244,17 +225,38 @@ class EngineCoreProc(EngineCore):
def run_engine_core(*args, **kwargs):
"""Launch EngineCore busy loop in background process."""
# Signal handler used for graceful termination.
# SystemExit exception is only raised once to allow this and worker
# processes to terminate without error
shutdown_requested = False
def signal_handler(signum, frame):
nonlocal shutdown_requested
if not shutdown_requested:
shutdown_requested = True
raise SystemExit()
# Either SIGTERM or SIGINT will terminate the engine_core
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
engine_core = None
try:
engine_core = EngineCoreProc(*args, **kwargs)
engine_core.run_busy_loop()
except KeyboardInterrupt:
except SystemExit:
logger.debug("EngineCore interrupted.")
except BaseException as e:
logger.exception(e)
raise e
finally:
if engine_core is not None:
engine_core.shutdown()
engine_core = None
def run_busy_loop(self):
"""Core busy loop of the EngineCore."""
@@ -272,6 +274,8 @@ class EngineCoreProc(EngineCore):
logger.debug("EngineCore busy loop waiting.")
if self.should_shutdown:
return
except BaseException:
raise
# 2) Handle any new client requests (Abort or Add).
while not self.input_queue.empty():
@@ -321,7 +325,7 @@ class EngineCoreProc(EngineCore):
decoder_add_req = PickleEncoder()
decoder_abort_req = PickleEncoder()
with self.make_socket(input_path, zmq.constants.PULL) as socket:
with make_zmq_socket(input_path, zmq.constants.PULL) as socket:
while True:
# (RequestType, RequestData)
type_frame, data_frame = socket.recv_multipart(copy=False)
@@ -349,7 +353,7 @@ class EngineCoreProc(EngineCore):
# Reuse send buffer.
buffer = bytearray()
with self.make_socket(output_path, zmq.constants.PUSH) as socket:
with make_zmq_socket(output_path, zmq.constants.PUSH) as socket:
while True:
engine_core_outputs = self.output_queue.get()
outputs = EngineCoreOutputs(outputs=engine_core_outputs)