[Frontend] Refactor prompt processing (#4028)
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@@ -18,6 +18,7 @@ from starlette.routing import Mount
|
||||
import vllm.envs as envs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
@@ -244,24 +245,48 @@ def run_server(args, llm_engine=None):
|
||||
# When using single vLLM without engine_use_ray
|
||||
model_config = asyncio.run(engine.get_model_config())
|
||||
|
||||
if args.disable_log_requests:
|
||||
request_logger = None
|
||||
else:
|
||||
request_logger = RequestLogger(max_log_len=args.max_log_len)
|
||||
|
||||
global openai_serving_chat
|
||||
global openai_serving_completion
|
||||
global openai_serving_embedding
|
||||
global openai_serving_tokenization
|
||||
|
||||
openai_serving_chat = OpenAIServingChat(engine, model_config,
|
||||
served_model_names,
|
||||
args.response_role,
|
||||
args.lora_modules,
|
||||
args.chat_template)
|
||||
openai_serving_chat = OpenAIServingChat(
|
||||
engine,
|
||||
model_config,
|
||||
served_model_names,
|
||||
args.response_role,
|
||||
lora_modules=args.lora_modules,
|
||||
prompt_adapters=args.prompt_adapters,
|
||||
request_logger=request_logger,
|
||||
chat_template=args.chat_template,
|
||||
)
|
||||
openai_serving_completion = OpenAIServingCompletion(
|
||||
engine, model_config, served_model_names, args.lora_modules,
|
||||
args.prompt_adapters)
|
||||
openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
|
||||
served_model_names)
|
||||
engine,
|
||||
model_config,
|
||||
served_model_names,
|
||||
lora_modules=args.lora_modules,
|
||||
prompt_adapters=args.prompt_adapters,
|
||||
request_logger=request_logger,
|
||||
)
|
||||
openai_serving_embedding = OpenAIServingEmbedding(
|
||||
engine,
|
||||
model_config,
|
||||
served_model_names,
|
||||
request_logger=request_logger,
|
||||
)
|
||||
openai_serving_tokenization = OpenAIServingTokenization(
|
||||
engine, model_config, served_model_names, args.lora_modules,
|
||||
args.chat_template)
|
||||
engine,
|
||||
model_config,
|
||||
served_model_names,
|
||||
lora_modules=args.lora_modules,
|
||||
request_logger=request_logger,
|
||||
chat_template=args.chat_template,
|
||||
)
|
||||
app.root_path = args.root_path
|
||||
|
||||
logger.info("Available routes are:")
|
||||
|
||||
Reference in New Issue
Block a user