[Frontend] Move async logic outside of constructor (#4674)
This commit is contained in:
@@ -1,13 +1,12 @@
|
||||
import asyncio
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from pydantic import Field
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
CompletionRequest, ErrorResponse,
|
||||
@@ -29,13 +28,24 @@ class LoRAModulePath:
|
||||
|
||||
class OpenAIServing:
|
||||
|
||||
def __init__(self,
|
||||
engine: AsyncLLMEngine,
|
||||
def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
|
||||
served_model_names: List[str],
|
||||
lora_modules: Optional[List[LoRAModulePath]],
|
||||
await_post_init: Optional[Awaitable[Any]] = None):
|
||||
lora_modules: Optional[List[LoRAModulePath]]):
|
||||
super().__init__()
|
||||
|
||||
self.engine = engine
|
||||
self.max_model_len = model_config.max_model_len
|
||||
|
||||
# A separate tokenizer to map token IDs to strings.
|
||||
self.tokenizer = get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
tokenizer_mode=model_config.tokenizer_mode,
|
||||
tokenizer_revision=model_config.tokenizer_revision,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
truncation_side="left")
|
||||
|
||||
self.served_model_names = served_model_names
|
||||
|
||||
if lora_modules is None:
|
||||
self.lora_requests = []
|
||||
else:
|
||||
@@ -47,38 +57,6 @@ class OpenAIServing:
|
||||
) for i, lora in enumerate(lora_modules, start=1)
|
||||
]
|
||||
|
||||
self.max_model_len = 0
|
||||
# Lazy initialized
|
||||
self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
|
||||
|
||||
try:
|
||||
event_loop = asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
event_loop = None
|
||||
|
||||
if event_loop is not None and event_loop.is_running():
|
||||
# If this instance is created by Ray Serve,
|
||||
# there is already a running event loop
|
||||
event_loop.create_task(self._post_init(await_post_init))
|
||||
else:
|
||||
# When running a single vLLM instance without engine_use_ray
|
||||
asyncio.run(self._post_init(await_post_init))
|
||||
|
||||
async def _post_init(self, await_post_init):
    """Complete the parts of setup that need to await the engine.

    Scheduled from the constructor, either as a task on the running event
    loop or through ``asyncio.run``. Retrieves the model config from
    ``self.engine``, records its ``max_model_len`` on the instance, builds
    ``self.tokenizer`` from the config, and then awaits ``await_post_init``
    if the caller provided one.

    Args:
        await_post_init: An optional awaitable to await once the tokenizer
            has been created; ``None`` means there is nothing more to do.
    """
    model_cfg = await self.engine.get_model_config()
    self.max_model_len = model_cfg.max_model_len

    # A separate tokenizer to map token IDs to strings.
    tokenizer_name = model_cfg.tokenizer
    tokenizer_options = {
        "tokenizer_mode": model_cfg.tokenizer_mode,
        "tokenizer_revision": model_cfg.tokenizer_revision,
        "trust_remote_code": model_cfg.trust_remote_code,
        "truncation_side": "left",
    }
    self.tokenizer = get_tokenizer(tokenizer_name, **tokenizer_options)

    # Nothing was chained after initialization.
    if await_post_init is None:
        return
    await await_post_init
|
||||
|
||||
async def show_available_models(self) -> ModelList:
|
||||
"""Show available models. Right now we only have one model."""
|
||||
model_cards = [
|
||||
|
||||
Reference in New Issue
Block a user