[V1] AsyncLLM Implementation (#9826)
Signed-off-by: Nick Hill <nickhill@us.ibm.com> Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com> Co-authored-by: Nick Hill <nhill@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
@@ -26,7 +26,6 @@ from typing_extensions import assert_never
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||
from vllm.engine.multiprocessing.engine import run_mp_engine
|
||||
from vllm.engine.protocol import EngineClient
|
||||
@@ -61,6 +60,11 @@ from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
from vllm.v1.engine.async_llm import AsyncLLMEngine # type: ignore
|
||||
else:
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore
|
||||
|
||||
TIMEOUT_KEEP_ALIVE = 5 # seconds
|
||||
|
||||
prometheus_multiproc_dir: tempfile.TemporaryDirectory
|
||||
@@ -126,7 +130,8 @@ async def build_async_engine_client_from_engine_args(
|
||||
# Fall back
|
||||
# TODO: fill out feature matrix.
|
||||
if (MQLLMEngineClient.is_unsupported_config(engine_args)
|
||||
or disable_frontend_multiprocessing):
|
||||
or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
|
||||
|
||||
engine_config = engine_args.create_engine_config()
|
||||
uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
|
||||
"uses_ray", False)
|
||||
@@ -143,6 +148,8 @@ async def build_async_engine_client_from_engine_args(
|
||||
None, build_engine)
|
||||
|
||||
yield engine_client
|
||||
if hasattr(engine_client, "shutdown"):
|
||||
engine_client.shutdown()
|
||||
return
|
||||
|
||||
# Otherwise, use the multiprocessing AsyncLLMEngine.
|
||||
|
||||
Reference in New Issue
Block a user