[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith
2024-12-10 01:28:14 -05:00
committed by GitHub
parent bc192a2b09
commit 28b3a1c7e5
21 changed files with 732 additions and 145 deletions

View File

@@ -20,7 +20,7 @@ from vllm.v1.engine.async_stream import AsyncStream
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.detokenizer import Detokenizer
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.gpu_executor import GPUExecutor
from vllm.v1.executor.abstract import Executor
logger = init_logger(__name__)
@@ -30,7 +30,7 @@ class AsyncLLM(EngineClient):
def __init__(
self,
vllm_config: VllmConfig,
executor_class: Type[GPUExecutor],
executor_class: Type[Executor],
log_stats: bool,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
@@ -119,14 +119,24 @@ class AsyncLLM(EngineClient):
def shutdown(self):
"""Shutdown, cleaning up the background proc and IPC."""
self.engine_core.shutdown()
if engine_core := getattr(self, "engine_core", None):
engine_core.shutdown()
if handler := getattr(self, "output_handler", None):
handler.cancel()
@classmethod
def _get_executor_cls(cls, vllm_config: VllmConfig):
return GPUExecutor
distributed_executor_backend = (
vllm_config.parallel_config.distributed_executor_backend)
if distributed_executor_backend == "mp":
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
executor_class = MultiprocExecutor
else:
assert (distributed_executor_backend is None)
from vllm.v1.executor.uniproc_executor import UniprocExecutor
executor_class = UniprocExecutor
return executor_class
async def add_request(
self,