[Frontend] Factor out chat message parsing (#7055)
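Replaces the hand-rolled per-message parsing loop in OpenAIServingChat with a single call to a parse_chat_messages helper exported from vllm.entrypoints.chat_utils, drops the now-unused Awaitable import, and adds an assert that the rendered chat-template prompt is a plain str.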
@@ -1,6 +1,5 @@
 import time
-from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, List,
-                    Optional)
+from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Union
 
@@ -11,7 +10,7 @@ from vllm.config import ModelConfig
 from vllm.engine.protocol import AsyncEngineClient
 from vllm.entrypoints.chat_utils import (ConversationMessage,
                                          load_chat_template,
-                                         parse_chat_message_content)
+                                         parse_chat_messages)
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionLogProb, ChatCompletionLogProbs,
@@ -92,15 +91,8 @@ class OpenAIServingChat(OpenAIServing):
         tokenizer = await self.async_engine_client.get_tokenizer(
             lora_request)
 
-        conversation: List[ConversationMessage] = []
-        mm_futures: List[Awaitable[MultiModalDataDict]] = []
-
-        for msg in request.messages:
-            chat_parsed_result = parse_chat_message_content(
-                msg, model_config, tokenizer)
-
-            conversation.extend(chat_parsed_result.messages)
-            mm_futures.extend(chat_parsed_result.mm_futures)
+        conversation, mm_futures = parse_chat_messages(
+            request.messages, model_config, tokenizer)
 
         tool_dicts = None if request.tools is None else [
             tool.model_dump() for tool in request.tools
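For reference, a minimal sketch of what the factored-out helper plausibly looks like, assuming it simply wraps the per-message loop deleted above; the actual implementation lives in vllm.entrypoints.chat_utils and is not shown in this diff, and the MultiModalDataDict import path is assumed:

```python
from typing import Awaitable, List, Tuple

from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (ConversationMessage,
                                         parse_chat_message_content)
from vllm.multimodal import MultiModalDataDict  # assumed import path


def parse_chat_messages(
    messages,                   # request.messages: raw OpenAI-style message dicts
    model_config: ModelConfig,
    tokenizer,
) -> Tuple[List[ConversationMessage], List[Awaitable[MultiModalDataDict]]]:
    """Sketch: aggregate per-message parse results into two flat lists."""
    conversation: List[ConversationMessage] = []
    mm_futures: List[Awaitable[MultiModalDataDict]] = []

    for msg in messages:
        # Same per-message parser the old inline loop in serving_chat.py called;
        # each result carries normalized messages plus any pending
        # multi-modal data futures.
        parsed = parse_chat_message_content(msg, model_config, tokenizer)
        conversation.extend(parsed.messages)
        mm_futures.extend(parsed.mm_futures)

    return conversation, mm_futures
```

Keeping the return shape as a (conversation, mm_futures) pair lets the call site above stay a one-line tuple unpacking.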
@@ -115,6 +107,7 @@ class OpenAIServingChat(OpenAIServing):
                 chat_template=request.chat_template or self.chat_template,
                 **(request.chat_template_kwargs or {}),
             )
+            assert isinstance(prompt, str)
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)
             return self.create_error_response(str(e))
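Context for the new assert: Hugging Face tokenizers' apply_chat_template returns the rendered string when tokenize=False but a list of token ids when tokenize=True, and the request-supplied chat_template_kwargs splatted into the call above could override that, so the assert fails fast rather than passing token ids downstream. A small self-contained illustration (the callee is outside this hunk's context lines and is presumed to be apply_chat_template; the model name is an arbitrary choice, not from the diff):

```python
from transformers import AutoTokenizer

# Any chat model with a chat template works here; this one is just an example.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

conversation = [{"role": "user", "content": "Hello!"}]

# With tokenize=False this returns the rendered prompt string; with
# tokenize=True it would return token ids instead, which is exactly the
# case the diff's new isinstance check guards against.
prompt = tokenizer.apply_chat_template(
    conversation=conversation,
    tokenize=False,
    add_generation_prompt=True,
)
assert isinstance(prompt, str)
print(prompt)
```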