[Frontend] Automatic detection of chat content format from AST (#9919)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -13,9 +13,11 @@ from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
                                    TaskOption)
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                         ChatTemplateContentFormatOption,
                                          apply_hf_chat_template,
                                          apply_mistral_chat_template,
-                                         parse_chat_messages)
+                                         parse_chat_messages,
+                                         resolve_chat_template_content_format)
 from vllm.inputs import PromptType, TextPrompt, TokensPrompt
 from vllm.inputs.parse import parse_and_batch_prompt
 from vllm.logger import init_logger
@@ -523,6 +525,7 @@ class LLM:
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
         chat_template: Optional[str] = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
         tools: Optional[List[Dict[str, Any]]] = None,
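
For orientation, here is a minimal sketch of how the new argument is exercised from offline inference; the model name and prompt are illustrative, not part of this diff:

    from vllm import LLM

    # Illustrative model choice; any chat-capable model works the same way.
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")

    # "auto" (the default) resolves the content format from the chat
    # template itself; it can also be forced to "string" or "openai".
    outputs = llm.chat(
        [{"role": "user", "content": "Who are you?"}],
        chat_template_content_format="openai",
    )
    print(outputs[0].outputs[0].text)
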
@@ -539,9 +542,11 @@ class LLM:
             to the OpenAI API.
 
         Args:
-            messages: A list of conversations or a single conversation.
-                - Each conversation is represented as a list of messages.
-                - Each message is a dictionary with 'role' and 'content' keys.
+            messages: A list of conversations or a single conversation.
+
+                - Each conversation is represented as a list of messages.
+                - Each message is a dictionary with 'role' and 'content' keys.
+
             sampling_params: The sampling parameters for text generation.
                 If None, we use the default sampling parameters. When it
                 is a single value, it is applied to every prompt. When it
@@ -551,11 +556,19 @@ class LLM:
             lora_request: LoRA request to use for generation, if any.
             chat_template: The template to use for structuring the chat.
                 If not provided, the model's default chat template will be used.
+            chat_template_content_format: The format to render message content.
+
+                - "string" will render the content as a string.
+                  Example: ``"Who are you?"``
+                - "openai" will render the content as a list of dictionaries,
+                  similar to OpenAI schema.
+                  Example: ``[{"type": "text", "text": "Who are you?"}]``
+
             add_generation_prompt: If True, adds a generation template
                 to each message.
             continue_final_message: If True, continues the final message in
-                the conversation instead of starting a new one. Cannot be `True`
-                if `add_generation_prompt` is also `True`.
+                the conversation instead of starting a new one. Cannot be
+                ``True`` if ``add_generation_prompt`` is also ``True``.
             mm_processor_kwargs: Multimodal processor kwarg overrides for this
                 chat request. Only used for offline requests.
 
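
To make the two options concrete, this is the same user turn as the chat template receives it under each format (a sketch mirroring the docstring examples above):

    # chat_template_content_format="string"
    string_form = {"role": "user", "content": "Who are you?"}

    # chat_template_content_format="openai"
    openai_form = {
        "role": "user",
        "content": [{"type": "text", "text": "Who are you?"}],
    }

Templates that loop over content parts expect the second shape; templates that render content as plain text expect the first.
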
@@ -576,17 +589,26 @@ class LLM:
                 cast(List[ChatCompletionMessageParam], messages)
             ]
 
+        tokenizer = self.get_tokenizer()
+        model_config = self.llm_engine.get_model_config()
+        resolved_content_format = resolve_chat_template_content_format(
+            chat_template,
+            chat_template_content_format,
+            tokenizer,
+        )
+
         prompts: List[Union[TokensPrompt, TextPrompt]] = []
 
         for msgs in list_of_messages:
-            tokenizer = self.get_tokenizer()
-            model_config = self.llm_engine.get_model_config()
-
             # NOTE: _parse_chat_message_content_parts() currently doesn't
             # handle mm_processor_kwargs, since there is no implementation in
             # the chat message parsing for it.
             conversation, mm_data = parse_chat_messages(
-                msgs, model_config, tokenizer)
+                msgs,
+                model_config,
+                tokenizer,
+                content_format=resolved_content_format,
+            )
 
             prompt_data: Union[str, List[int]]
             if isinstance(tokenizer, MistralTokenizer):
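
The resolver hoisted above the loop is where the commit title's "detection from AST" happens: the chat template is parsed as Jinja and its syntax tree is inspected to see whether message content is treated as a list of parts. Below is a simplified sketch of that idea, not vLLM's actual implementation; detect_content_format is a hypothetical helper, and the real logic behind resolve_chat_template_content_format in vllm.entrypoints.chat_utils handles more node shapes:

    import jinja2
    from jinja2 import nodes

    def detect_content_format(chat_template: str) -> str:
        """Guess "openai" vs "string" by walking the template's Jinja AST."""
        ast = jinja2.Environment().parse(chat_template)
        # A template that iterates over a message's 'content' (e.g.
        # {% for part in message['content'] %}) treats content as a list
        # of parts, which matches the OpenAI-style schema.
        for loop in ast.find_all(nodes.For):
            it = loop.iter
            if (isinstance(it, nodes.Getitem)
                    and isinstance(it.arg, nodes.Const)
                    and it.arg.value == "content"):
                return "openai"
            if isinstance(it, nodes.Getattr) and it.attr == "content":
                return "openai"
        return "string"

Hoisting the resolution (together with the tokenizer and model-config lookups) out of the per-conversation loop also means the template is parsed once per chat() call rather than once per conversation.
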
@@ -737,7 +759,7 @@ class LLM:
             generation, if any.
 
         Returns:
-            A list of `EmbeddingRequestOutput` objects containing the
+            A list of ``EmbeddingRequestOutput`` objects containing the
             generated embeddings in the same order as the input prompts.
 
         Note: