[Frontend] Automatic detection of chat content format from AST (#9919)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2024-11-16 13:35:40 +08:00
Committed by: GitHub
Parent: 4f168f69a3
Commit: 32e46e000f
16 changed files with 788 additions and 350 deletions

vllm/entrypoints/llm.py

@@ -13,9 +13,11 @@ from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
                                    TaskOption)
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                         ChatTemplateContentFormatOption,
                                          apply_hf_chat_template,
                                          apply_mistral_chat_template,
-                                         parse_chat_messages)
+                                         parse_chat_messages,
+                                         resolve_chat_template_content_format)
 from vllm.inputs import PromptType, TextPrompt, TokensPrompt
 from vllm.inputs.parse import parse_and_batch_prompt
 from vllm.logger import init_logger
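For readers unfamiliar with the new imports: judging by the docstring added further down, ChatTemplateContentFormatOption is a constrained string option. A minimal sketch of what such an alias looks like (the actual definition lives in vllm.entrypoints.chat_utils):

from typing import Literal

# Sketch of the option type, assuming it mirrors the three formats the
# docstring below documents: "auto" detects the format from the chat
# template, while "string" and "openai" force a specific rendering.
ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]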
@@ -523,6 +525,7 @@ class LLM:
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
         chat_template: Optional[str] = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
         tools: Optional[List[Dict[str, Any]]] = None,
@@ -539,9 +542,11 @@ class LLM:
             to the OpenAI API.

         Args:
-            messages: A list of conversations or a single conversation.
-                - Each conversation is represented as a list of messages.
-                - Each message is a dictionary with 'role' and 'content' keys.
+            messages: A list of conversations or a single conversation.
+
+              - Each conversation is represented as a list of messages.
+              - Each message is a dictionary with 'role' and 'content' keys.
+
             sampling_params: The sampling parameters for text generation.
                 If None, we use the default sampling parameters. When it
                 is a single value, it is applied to every prompt. When it
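To make the accepted messages shapes concrete, a small illustration (the values are made up):

# A single conversation: one list of role/content messages.
single_conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]

# A batch: a list of conversations, each itself a list of messages.
batch_of_conversations = [
    [{"role": "user", "content": "Who are you?"}],
    [{"role": "user", "content": "What can you do?"}],
]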
@@ -551,11 +556,19 @@ class LLM:
             lora_request: LoRA request to use for generation, if any.
             chat_template: The template to use for structuring the chat.
                 If not provided, the model's default chat template will be used.
+            chat_template_content_format: The format to render message content.
+
+              - "string" will render the content as a string.
+                Example: ``"Who are you?"``
+              - "openai" will render the content as a list of dictionaries,
+                similar to OpenAI schema.
+                Example: ``[{"type": "text", "text": "Who are you?"}]``
+
             add_generation_prompt: If True, adds a generation template
                 to each message.
             continue_final_message: If True, continues the final message in
-                the conversation instead of starting a new one. Cannot be `True`
-                if `add_generation_prompt` is also `True`.
+                the conversation instead of starting a new one. Cannot be
+                ``True`` if ``add_generation_prompt`` is also ``True``.
             mm_processor_kwargs: Multimodal processor kwarg overrides for this
                 chat request. Only used for offline requests.
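A sketch of how a caller would use the new parameter once this lands (the model name is illustrative):

from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # illustrative model

# With the default "auto", the content format is detected from the model's
# chat template; forcing "openai" keeps content as a list of typed parts.
outputs = llm.chat(
    [{
        "role": "user",
        "content": [{"type": "text", "text": "Who are you?"}],
    }],
    chat_template_content_format="openai",
)
print(outputs[0].outputs[0].text)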
@@ -576,17 +589,26 @@ class LLM:
             cast(List[ChatCompletionMessageParam], messages)
         ]

+        tokenizer = self.get_tokenizer()
+        model_config = self.llm_engine.get_model_config()
+        resolved_content_format = resolve_chat_template_content_format(
+            chat_template,
+            chat_template_content_format,
+            tokenizer,
+        )
+
         prompts: List[Union[TokensPrompt, TextPrompt]] = []

         for msgs in list_of_messages:
-            tokenizer = self.get_tokenizer()
-            model_config = self.llm_engine.get_model_config()
-
             # NOTE: _parse_chat_message_content_parts() currently doesn't
             # handle mm_processor_kwargs, since there is no implementation in
             # the chat message parsing for it.
             conversation, mm_data = parse_chat_messages(
-                msgs, model_config, tokenizer)
+                msgs,
+                model_config,
+                tokenizer,
+                content_format=resolved_content_format,
+            )

             prompt_data: Union[str, List[int]]
             if isinstance(tokenizer, MistralTokenizer):
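Hoisting the tokenizer/model-config lookups and the format resolution out of the per-conversation loop means the chat template is analyzed once per chat() call instead of once per conversation. The "auto" path is what the PR title refers to: the template's Jinja AST is inspected to see whether it iterates over message content parts. A rough sketch of the idea, not the actual vllm.entrypoints.chat_utils implementation (the helper name is illustrative):

import jinja2
from jinja2 import nodes

def detect_content_format_sketch(chat_template: str) -> str:
    """Illustrative only: guess "string" vs "openai" from a template's AST."""
    try:
        ast = jinja2.Environment().parse(chat_template)
    except jinja2.TemplateSyntaxError:
        return "string"  # unparseable template: fall back to plain strings

    # A template that loops over message['content'] (or message.content)
    # expects OpenAI-style lists of {"type": ..., "text": ...} parts.
    for loop in ast.find_all(nodes.For):
        it = loop.iter
        if (isinstance(it, nodes.Getitem)
                and isinstance(it.arg, nodes.Const)
                and it.arg.value == "content"):
            return "openai"
        if isinstance(it, nodes.Getattr) and it.attr == "content":
            return "openai"

    return "string"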
@@ -737,7 +759,7 @@ class LLM:
             generation, if any.

         Returns:
-            A list of `EmbeddingRequestOutput` objects containing the
+            A list of ``EmbeddingRequestOutput`` objects containing the
             generated embeddings in the same order as the input prompts.

         Note:
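For reference, a minimal sketch of the call this docstring belongs to (the model name is illustrative; encode() requires an embedding model):

from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct")  # illustrative embedding model
outputs = llm.encode(["Hello, my name is", "The capital of France is"])

for output in outputs:
    # Each EmbeddingRequestOutput holds one prompt's embedding vector,
    # returned in the same order as the input prompts.
    print(len(output.outputs.embedding))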