[V0 Deprecation] Remove AsyncLLMEngine (#25025)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -38,7 +38,6 @@ from typing_extensions import assert_never
 import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (load_chat_template,
                                          resolve_hf_chat_template,
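Note: downstream code that still imports AsyncLLMEngine from vllm.engine.async_llm_engine breaks after this hunk. A minimal migration sketch, assuming the V1 entry points shown in this diff; the model name and the zero-argument create_engine_config() call are illustrative, not part of this commit:

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM

# Previously: from vllm.engine.async_llm_engine import AsyncLLMEngine
# Now: build a VllmConfig and hand it to the V1 AsyncLLM instead.
engine_args = AsyncEngineArgs(model="facebook/opt-125m")  # illustrative model
vllm_config = engine_args.create_engine_config()  # assumes the default usage_context
async_llm = AsyncLLM.from_vllm_config(vllm_config=vllm_config)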
@@ -201,50 +200,34 @@ async def build_async_engine_client_from_engine_args(
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
-    # V1 AsyncLLM.
-    if envs.VLLM_USE_V1:
-        if disable_frontend_multiprocessing:
-            logger.warning(
-                "V1 is enabled, but got --disable-frontend-multiprocessing. "
-                "To disable frontend multiprocessing, set VLLM_USE_V1=0.")
-
-        from vllm.v1.engine.async_llm import AsyncLLM
-        async_llm: Optional[AsyncLLM] = None
-        client_count = client_config.pop(
-            "client_count") if client_config else 1
-        client_index = client_config.pop(
-            "client_index") if client_config else 0
-        try:
-            async_llm = AsyncLLM.from_vllm_config(
-                vllm_config=vllm_config,
-                usage_context=usage_context,
-                enable_log_requests=engine_args.enable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats,
-                client_addresses=client_config,
-                client_count=client_count,
-                client_index=client_index)
-
-            # Don't keep the dummy data in memory
-            await async_llm.reset_mm_cache()
-
-            yield async_llm
-        finally:
-            if async_llm:
-                async_llm.shutdown()
-
-    # V0 AsyncLLM.
-    else:
-        engine_client: Optional[EngineClient] = None
-        try:
-            engine_client = AsyncLLMEngine.from_vllm_config(
-                vllm_config=vllm_config,
-                usage_context=usage_context,
-                enable_log_requests=engine_args.enable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats)
-            yield engine_client
-        finally:
-            if engine_client and hasattr(engine_client, "shutdown"):
-                engine_client.shutdown()
+    if disable_frontend_multiprocessing:
+        logger.warning(
+            "V1 is enabled, but got --disable-frontend-multiprocessing. "
+            "To disable frontend multiprocessing, set VLLM_USE_V1=0.")
+    assert envs.VLLM_USE_V1
+
+    from vllm.v1.engine.async_llm import AsyncLLM
+    async_llm: Optional[AsyncLLM] = None
+    client_count = client_config.pop("client_count") if client_config else 1
+    client_index = client_config.pop("client_index") if client_config else 0
+    try:
+        async_llm = AsyncLLM.from_vllm_config(
+            vllm_config=vllm_config,
+            usage_context=usage_context,
+            enable_log_requests=engine_args.enable_log_requests,
+            disable_log_stats=engine_args.disable_log_stats,
+            client_addresses=client_config,
+            client_count=client_count,
+            client_index=client_index)
+
+        # Don't keep the dummy data in memory
+        await async_llm.reset_mm_cache()
+
+        yield async_llm
+    finally:
+        if async_llm:
+            async_llm.shutdown()
 
 
 async def validate_json_request(raw_request: Request):
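For context, a sketch of how the patched context manager is consumed now that V1 is the only path. This assumes the function lives in vllm.entrypoints.openai.api_server and is decorated with @asynccontextmanager, as its yield/finally structure above suggests; the model name is illustrative:

import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)

async def main() -> None:
    engine_args = AsyncEngineArgs(model="facebook/opt-125m")  # illustrative
    # Entry builds a V1 AsyncLLM; exit runs the finally block and shuts it down.
    async with build_async_engine_client_from_engine_args(engine_args) as client:
        print(type(client))  # expected: vllm.v1.engine.async_llm.AsyncLLM

asyncio.run(main())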