[Frontend] Chat template fallbacks for multimodal models (#17805)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-05-08 14:05:54 +08:00
Committed by: GitHub
Parent: 843b222723
Commit: 96722aa81d

18 changed files with 219 additions and 52 deletions

View File

@@ -38,6 +38,10 @@ from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.utils import MediaConnector
# yapf: disable
from vllm.transformers_utils.chat_templates import (
    get_chat_template_fallback_path)
# yapf: enable
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
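The new import above brings in the fallback lookup helper from vllm.transformers_utils.chat_templates. The body of that module is not shown in this excerpt, so the following is only a rough sketch of how a registry-backed helper like this might work; the directory layout, registry contents, and template file name are hypothetical, not taken from the commit.

# Hypothetical sketch (not from this diff): map an HF model_type to a chat
# template bundled with vLLM, returning None when no fallback is registered.
from pathlib import Path
from typing import Optional

_TEMPLATE_DIR = Path(__file__).parent / "templates"  # assumed package layout

_MODEL_TYPE_TO_TEMPLATE = {
    "example_vlm": "template_example_vlm.jinja",  # illustrative entry only
}


def get_chat_template_fallback_path(
    model_type: str,
    tokenizer_name_or_path: str,
) -> Optional[Path]:
    # tokenizer_name_or_path is accepted because the call site passes it;
    # a real implementation might use it to pick between template variants.
    filename = _MODEL_TYPE_TO_TEMPLATE.get(model_type)
    if filename is None:
        return None
    return _TEMPLATE_DIR / filename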
@@ -325,11 +329,10 @@ def resolve_mistral_chat_template(
    return None


def resolve_hf_chat_template(
    model_config: ModelConfig,
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    *,
    trust_remote_code: bool,
) -> Optional[str]:
    # 1st priority: The given chat template
    if chat_template is not None:
@@ -342,7 +345,7 @@ def resolve_hf_chat_template(
                tokenizer.name_or_path,
                processor_cls=(PreTrainedTokenizer, PreTrainedTokenizerFast,
                               ProcessorMixin),
                trust_remote_code=trust_remote_code,
                trust_remote_code=model_config.trust_remote_code,
            )
            if isinstance(processor, ProcessorMixin) and \
                    processor.chat_template is not None:
@@ -358,22 +361,34 @@ def resolve_hf_chat_template(
        logger.debug("Failed to load AutoTokenizer chat template for %s",
                     tokenizer.name_or_path, exc_info=True)

    return None

    # 4th priority: Predefined fallbacks
    path = get_chat_template_fallback_path(
        model_type=model_config.hf_config.model_type,
        tokenizer_name_or_path=model_config.tokenizer,
    )
    if path is not None:
        logger.info("Loading chat template fallback for %s as there isn't one "
                    "defined on HF Hub.", tokenizer.name_or_path)
        chat_template = load_chat_template(path)
    else:
        logger.debug("There is no chat template fallback for %s",
                     tokenizer.name_or_path)

    return chat_template
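With the block above in place, the resolver consults four sources in order. A condensed sketch of that control flow, assuming only the names shown in this diff (the real function also inspects the AutoProcessor and AutoTokenizer and logs each step; the _sketch suffix marks it as illustrative):

def resolve_hf_chat_template_sketch(model_config, tokenizer, chat_template, tools):
    # 1st priority: template supplied by the caller
    if chat_template is not None:
        return chat_template
    # 2nd priority: template attached to the HF processor (common for
    # multimodal models); omitted here
    # 3rd priority: template attached to the HF tokenizer; omitted here
    # 4th priority (new in this commit): template bundled with vLLM,
    # keyed by the model_type from the HF config
    path = get_chat_template_fallback_path(
        model_type=model_config.hf_config.model_type,
        tokenizer_name_or_path=model_config.tokenizer,
    )
    return load_chat_template(path) if path is not None else None

The practical effect is that a multimodal model whose tokenizer ships without a chat template can still be served in chat mode, provided vLLM bundles a fallback for its model_type.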
def _resolve_chat_template_content_format(
    model_config: ModelConfig,
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    given_format: ChatTemplateContentFormatOption,
    tokenizer: AnyTokenizer,
    *,
    trust_remote_code: bool,
) -> _ChatTemplateContentFormat:
    if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
        hf_chat_template = resolve_hf_chat_template(
            model_config,
            tokenizer,
            chat_template=chat_template,
            trust_remote_code=trust_remote_code,
            tools=tools,
        )
    else:
@@ -413,19 +428,18 @@ def _log_chat_template_content_format(
def resolve_chat_template_content_format(
    model_config: ModelConfig,
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    given_format: ChatTemplateContentFormatOption,
    tokenizer: AnyTokenizer,
    *,
    trust_remote_code: bool = False,
) -> _ChatTemplateContentFormat:
    detected_format = _resolve_chat_template_content_format(
        model_config,
        chat_template,
        tools,
        given_format,
        tokenizer,
        trust_remote_code=trust_remote_code,
    )

    _log_chat_template_content_format(
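For callers, the visible change is that model_config is now passed explicitly and trust_remote_code no longer needs to be forwarded separately, since it can be read from the config. A before/after sketch of the call, based on the caller updates later in this commit:

# Before: trust_remote_code forwarded by every caller
resolved_content_format = resolve_chat_template_content_format(
    chat_template,
    tools,
    chat_template_content_format,
    tokenizer,
    trust_remote_code=model_config.trust_remote_code,
)

# After: model_config leads the argument list and carries trust_remote_code,
# plus the model_type needed for the new fallback lookup
resolved_content_format = resolve_chat_template_content_format(
    model_config,
    chat_template,
    tools,
    chat_template_content_format,
    tokenizer,
)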
@@ -1177,20 +1191,20 @@ def parse_chat_messages_futures(
def apply_hf_chat_template(
    model_config: ModelConfig,
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    conversation: list[ConversationMessage],
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    *,
    trust_remote_code: bool = False,
    tokenize: bool = False,  # Different from HF's default
    **kwargs: Any,
) -> str:
    hf_chat_template = resolve_hf_chat_template(
        model_config,
        tokenizer,
        chat_template=chat_template,
        tools=tools,
        trust_remote_code=trust_remote_code,
    )

    if hf_chat_template is None:
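apply_hf_chat_template follows the same pattern: model_config now comes first, and the resolver it delegates to can end up on a bundled fallback template. A minimal usage sketch against the signature shown above; the conversation content is illustrative, and add_generation_prompt is assumed to be forwarded to the underlying HF template call via **kwargs:

conversation = [
    {"role": "user", "content": "Describe this image."},
]
prompt = apply_hf_chat_template(
    model_config,
    tokenizer,
    conversation=conversation,
    chat_template=None,  # let the resolver walk priorities 2-4
    tools=None,
    add_generation_prompt=True,  # assumed to pass through **kwargs
)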

View File

@@ -726,11 +726,11 @@ class LLM:
        tokenizer = self.get_tokenizer(lora_request)
        model_config = self.llm_engine.get_model_config()
        resolved_content_format = resolve_chat_template_content_format(
            model_config,
            chat_template,
            tools,
            chat_template_content_format,
            tokenizer,
            trust_remote_code=model_config.trust_remote_code,
        )

        _chat_template_kwargs: dict[str, Any] = dict(
@@ -762,8 +762,8 @@ class LLM:
                )
            else:
                prompt_str = apply_hf_chat_template(
                    model_config,
                    tokenizer,
                    trust_remote_code=model_config.trust_remote_code,
                    conversation=conversation,
                    **_chat_template_kwargs,
                )

View File

@@ -937,10 +937,11 @@ async def init_app_state(
                chat_template=resolved_chat_template)
        else:
            hf_chat_template = resolve_hf_chat_template(
                vllm_config.model_config,
                tokenizer,
                chat_template=None,
                tools=None,
                trust_remote_code=model_config.trust_remote_code)
            )

            if hf_chat_template != resolved_chat_template:
                logger.warning(
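This startup check in the API server now also goes through the model_config-aware resolver, so the "official" template it compares a user-supplied template against can itself be a bundled fallback. A condensed sketch of the check, using only names from this hunk (the warning text is paraphrased, not the literal message):

hf_chat_template = resolve_hf_chat_template(
    vllm_config.model_config,
    tokenizer,
    chat_template=None,
    tools=None,
)
if hf_chat_template != resolved_chat_template:
    logger.warning(
        "Supplied chat template differs from the resolved one")  # paraphrased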

View File

@@ -394,11 +394,11 @@ class OpenAIServing:
        model_config = self.model_config
        resolved_content_format = resolve_chat_template_content_format(
            model_config,
            chat_template,
            tool_dicts,
            chat_template_content_format,
            tokenizer,
            trust_remote_code=model_config.trust_remote_code,
        )
        conversation, mm_data_future = parse_chat_messages_futures(
            messages,
@@ -425,8 +425,8 @@ class OpenAIServing:
            )
        else:
            request_prompt = apply_hf_chat_template(
                model_config,
                tokenizer,
                trust_remote_code=model_config.trust_remote_code,
                conversation=conversation,
                **_chat_template_kwargs,
            )