Add hf.py patch to force string content format for GLM models

- Tool response content was being dropped because vLLM incorrectly detected the 'openai' content format for GLM templates
- Added _is_glm_model() detection to force the 'string' content format
- Updated Dockerfile to include the hf.py patch
- Added debug tests for tool visibility
2026-04-09 05:20:47 +00:00
parent 8d5da5750d
commit aa4f667ab8
5 changed files with 1206 additions and 6 deletions
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,10 @@
 ARG BASE_IMAGE=vllm/vllm-openai:glm51-cu130
 FROM ${BASE_IMAGE}

+# Patch tool parser for GLM regex fix
 COPY glm4_moe_tool_parser.py /usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/glm4_moe_tool_parser.py
 COPY utils.py /usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/utils.py
+
+# Patch hf renderer to force string content format for GLM models
+# This fixes the issue where tool response content is dropped
+COPY vllm_patches/hf.py /usr/local/lib/python3.12/dist-packages/vllm/renderers/hf.py
--- a/README.md
+++ b/README.md
@@ -8,7 +8,11 @@ Patches vLLM's GLM-4/GLM-5.1 tool parser to fix multiple issues with tool call h

 **Symptom:** When the model makes a tool call and receives a response, it would act as if the response was empty ("The function returned no output") even though valid content was provided.

-**Root Cause:** The `func_detail_regex` required a newline between the function name and first argument tag, but GLM-5.1's chat template does NOT include that newline. The regex silently failed to match, tool call extraction failed, and somewhere in that failure path the tool response content got lost.
+**Root Cause:** Two bugs working together:
+
+1. **Tool parser regex mismatch** (`glm4_moe_tool_parser.py`): The `func_detail_regex` required a newline between the function name and first argument tag, but GLM-5.1's chat template doesn't include that newline. The regex silently failed to match.
+
+2. **Incorrect content format detection** (`vllm/renderers/hf.py`): vLLM detected the "openai" content format because the GLM template contains `{% for tr in m.content %}` for tool responses. However, the template then checks `m.content is string`, which is False for OpenAI-format arrays, so the tool response content was dropped.

 **Model output format (no newline after name):**
 ```
@@ -25,10 +29,8 @@ r"\[TOOL_CALL_START\]([^\n]*)\n(.*)\[TOOL_CALL_END\]"  # Requires \n after name
 r"\[TOOL_CALL_START\]\s*([\w.\-]+)\s*((?:\[ARG_KEY\].*)?)\s*\[TOOL_CALL_END\]"
 ```

-The fix:
- Uses `\s*` instead of mandatory `\n`
- Makes the arguments group optional for zero-argument calls
- Accepts word chars, dots, and hyphens in function names
+**Content format fix:**
+Added `_is_glm_model()` detection to force "string" content format for GLM models, bypassing the incorrect auto-detection.

 ### Issue 2: Zero-Argument Tool Calls Crash

@@ -44,8 +46,9 @@ Both paths now use the same robust extraction helpers for consistency.

 | File | Description |
 |------|-------------|
-| `glm4_moe_tool_parser.py` | Fixed tool parser |
+| `glm4_moe_tool_parser.py` | Fixed tool parser (regex fix) |
 | `utils.py` | Utility functions for partial JSON/tag handling |
+| `vllm_patches/hf.py` | Patched renderer (content format fix) |
 | `Dockerfile` | Overlays patched files onto base image |
 | `Jenkinsfile` | CI/CD pipeline for building and pushing |
 | `tests/` | Test suite for tool call validation |
--- a/tests/test_tool_debug.py
+++ b/tests/test_tool_debug.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+"""
+Debug test to see what prompt the model actually receives.
+
+Endpoint settings are read from the environment so that no credential is
+committed to the repository:
+
+    GLM_API_BASE  base URL of the OpenAI-compatible API (optional)
+    GLM_API_KEY   bearer token for that API (required)
+    GLM_MODEL     model name to request (optional)
+"""
+
+import json
+import os
+
+import httpx
+
+API_BASE = os.environ.get("GLM_API_BASE", "https://api.vultrinference.com/v1")
+API_KEY = os.environ.get("GLM_API_KEY", "")
+MODEL = os.environ.get("GLM_MODEL", "zai-org/GLM-5.1-FP8")
+
+
+def _chat(messages, tools=None, **extra):
+    """POST one non-streaming chat completion and return the decoded JSON.
+
+    ``tools`` is only put in the payload when given, because the presence of
+    the ``tools`` key is exactly what several tests below toggle. ``extra``
+    is merged into the payload verbatim (e.g. ``logprobs``, ``echo``).
+    """
+    if not API_KEY:
+        raise RuntimeError("GLM_API_KEY is not set; export it before running")
+
+    payload = {
+        "model": MODEL,
+        "messages": messages,
+        "stream": False,
+        "max_tokens": 100,
+        **extra,
+    }
+    if tools is not None:
+        payload["tools"] = tools
+
+    with httpx.Client(timeout=60.0) as client:
+        response = client.post(
+            f"{API_BASE}/chat/completions",
+            headers={
+                "Authorization": f"Bearer {API_KEY}",
+                "Content-Type": "application/json",
+            },
+            json=payload,
+        )
+    return response.json()
+
+
+def _reply(result):
+    """Return the assistant ``content`` of the first choice (may be None)."""
+    return result["choices"][0]["message"]["content"]
+
+
+def _mentions(content, marker):
+    """Return True if *marker* occurs in *content*.
+
+    ``content`` is None when the model replies with tool calls only, so a
+    bare ``marker in content`` would raise TypeError.
+    """
+    return marker in (content or "")
+
+
+def _calc_conversation():
+    """Return a user -> assistant tool call -> tool result exchange."""
+    return [
+        {"role": "user", "content": "What is 2+2?"},
+        {
+            "role": "assistant",
+            "tool_calls": [{
+                "id": "call_123",
+                "type": "function",
+                "function": {"name": "calc", "arguments": "{}"}
+            }],
+            "content": None
+        },
+        {
+            "role": "tool",
+            "tool_call_id": "call_123",
+            "content": "The answer is 42"
+        }
+    ]
+
+
+def test_with_echo():
+    """
+    Test with echo=True to see the prompt tokens.
+    """
+    messages = [
+        {"role": "user", "content": "Call the test function"},
+        {
+            "role": "assistant",
+            "tool_calls": [{
+                "id": "call_123",
+                "type": "function",
+                "function": {"name": "test_func", "arguments": "{}"}
+            }]
+        },
+        {
+            "role": "tool",
+            "tool_call_id": "call_123",
+            "content": "VALUE_42"
+        }
+    ]
+
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "test_func",
+            "description": "A test function",
+            "parameters": {"type": "object", "properties": {}}
+        }
+    }]
+
+    # Try to get prompt logprobs which might show us the prompt
+    result = _chat(
+        messages,
+        tools,
+        logprobs=True,
+        top_logprobs=1,
+        echo=True,  # Return prompt tokens
+    )
+
+    print("Full response:")
+    print(json.dumps(result, indent=2, ensure_ascii=False))
+
+
+def test_tool_only_message():
+    """
+    Test if a tool-only message (no tools param) works.
+    This is what worked in the previous test.
+    """
+    # NO tools param - this worked before
+    result = _chat(_calc_conversation())
+    if "choices" in result:
+        content = _reply(result)
+        print(f"\nNo tools param - Response: {content}")
+        print(f"Contains 42: {_mentions(content, '42')}")
+    else:
+        print(f"\nNo tools param - Error: {result}")
+
+
+def test_with_tools_param():
+    """
+    Test WITH tools param - this is what fails.
+    """
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "calc",
+            "description": "Calculator",
+            "parameters": {"type": "object", "properties": {}}
+        }
+    }]
+
+    result = _chat(_calc_conversation(), tools)  # WITH tools param
+    # Error bodies carry no "choices"; report them like the sibling tests do
+    # rather than dying with a KeyError.
+    if "choices" in result:
+        content = _reply(result)
+        print(f"\nWith tools param - Response: {content}")
+        print(f"Contains 42: {_mentions(content, '42')}")
+    else:
+        print(f"\nWith tools param - Error: {result}")
+
+
+def test_without_assistant_tool_calls():
+    """
+    Test if the issue is the assistant message with tool_calls.
+    What if we just send user -> tool response?
+    """
+    messages = [
+        {"role": "user", "content": "The calculator returned this result"},
+        {
+            "role": "tool",
+            "tool_call_id": "call_123",
+            "content": "VALUE_IS_42"
+        }
+    ]
+
+    result = _chat(messages)
+    if "choices" in result:
+        content = _reply(result)
+        print(f"\nNo assistant tool_calls - Response: {content}")
+        print(f"Contains 42: {_mentions(content, '42')}")
+    else:
+        print(f"\nError: {result}")
+
+
+if __name__ == "__main__":
+    print("=" * 60)
+    print("Debugging tool response visibility")
+    print("=" * 60)
+
+    test_tool_only_message()
+    test_with_tools_param()
+    test_without_assistant_tool_calls()
--- a/tests/test_tool_visibility.py
+++ b/tests/test_tool_visibility.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Minimal test - is the tool response content being passed to the model?
+
+Endpoint settings are read from the environment so that no credential is
+committed to the repository:
+
+    GLM_API_BASE  base URL of the OpenAI-compatible API (optional)
+    GLM_API_KEY   bearer token for that API (required)
+    GLM_MODEL     model name to request (optional)
+"""
+
+import json
+import os
+
+import httpx
+
+API_BASE = os.environ.get("GLM_API_BASE", "https://api.vultrinference.com/v1")
+API_KEY = os.environ.get("GLM_API_KEY", "")
+MODEL = os.environ.get("GLM_MODEL", "zai-org/GLM-5.1-FP8")
+
+# Token the tool "returns"; every test checks whether the reply repeats it.
+MARKER = "UNIQUE_MARKER_42"
+
+# Tool schema advertised on every request in this file.
+TOOLS = [{
+    "type": "function",
+    "function": {
+        "name": "get_value",
+        "description": "Get a value",
+        "parameters": {"type": "object", "properties": {}}
+    }
+}]
+
+
+def _ask(messages):
+    """POST *messages* with TOOLS and return the decoded JSON body."""
+    if not API_KEY:
+        raise RuntimeError("GLM_API_KEY is not set; export it before running")
+
+    with httpx.Client(timeout=60.0) as client:
+        response = client.post(
+            f"{API_BASE}/chat/completions",
+            headers={
+                "Authorization": f"Bearer {API_KEY}",
+                "Content-Type": "application/json",
+            },
+            json={
+                "model": MODEL,
+                "messages": messages,
+                "tools": TOOLS,
+                "stream": False,
+                "max_tokens": 100,
+            },
+        )
+    return response.json()
+
+
+def _report(label, result):
+    """Print the reply under *label* and whether it contains MARKER.
+
+    ``content`` is None when the model answers with tool calls only, so the
+    membership test falls back to an empty string instead of raising.
+    """
+    if "choices" not in result:
+        print(f"Error: {result}")
+        return
+    content = result["choices"][0]["message"]["content"]
+    print(f"{label}Model response: {content}")
+    print(f"Contains {MARKER}: {MARKER in (content or '')}")
+
+
+def _assistant_call(content):
+    """Return an assistant turn saying *content* and calling get_value()."""
+    return {
+        "role": "assistant",
+        "content": content,
+        "tool_calls": [{
+            "id": "call_123",
+            "type": "function",
+            "function": {"name": "get_value", "arguments": "{}"}
+        }]
+    }
+
+
+def test_direct_prompt():
+    """
+    If we could send a direct prompt, what would it look like?
+
+    GLM-5.1 expects tool responses in <observations> tags:
+    <observations>{"result": "42"}</observations>
+
+    Let's test if the model can see content in that format.
+    """
+    # Simulate what the prompt SHOULD look like after chat template
+    messages = [
+        {"role": "user", "content": "What did the function return?"},
+        _assistant_call("I'll call the function."),
+        {
+            "role": "tool",
+            "tool_call_id": "call_123",
+            "content": MARKER
+        }
+    ]
+
+    _report("", _ask(messages))
+
+
+def test_fake_tool_response_in_user_message():
+    """
+    Test: What if we put the tool response in a user message instead?
+    This bypasses the role="tool" handling entirely.
+    """
+    messages = [
+        {"role": "user", "content": "What did the function return?"},
+        _assistant_call("I called the function."),
+        # Instead of role="tool", use user message
+        {"role": "user", "content": f"The function returned: {MARKER}"}
+    ]
+
+    _report("\nUser message hack - ", _ask(messages))
+
+
+def test_tool_response_as_observation_format():
+    """
+    Test: What if we format the tool response in the GLM expected format?
+    GLM expects: <observations>content</observations>
+    """
+    # Try putting the observations tag in the content
+    messages = [
+        {"role": "user", "content": "What did the function return?"},
+        _assistant_call("I called the function."),
+        {
+            "role": "tool",
+            "tool_call_id": "call_123",
+            "content": f"<observations>{MARKER}</observations>"
+        }
+    ]
+
+    _report("\nWith <observations> tags - ", _ask(messages))
+
+
+if __name__ == "__main__":
+    print("Testing tool response visibility")
+    print("=" * 60)
+
+    test_direct_prompt()
+    test_fake_tool_response_in_user_message()
+    test_tool_response_as_observation_format()
--- a/vllm_patches/hf.py
+++ b/vllm_patches/hf.py
@@ -0,0 +1,771 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import inspect
+import itertools
+from collections import defaultdict, deque
+from collections.abc import Set
+from functools import lru_cache
+from typing import Any, Literal, cast, overload
+
+import jinja2
+import jinja2.ext
+import jinja2.meta
+import jinja2.nodes
+import jinja2.parser
+import jinja2.sandbox
+
+from vllm.config import ModelConfig, VllmConfig
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionMessageParam,
+    ChatTemplateContentFormat,
+    ChatTemplateContentFormatOption,
+    ChatTemplateResolutionError,
+    ConversationMessage,
+    load_chat_template,
+    parse_chat_messages,
+    parse_chat_messages_async,
+)
+from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
+from vllm.logger import init_logger
+from vllm.tokenizers.hf import HfTokenizer
+from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
+from vllm.transformers_utils.processor import cached_get_processor
+from vllm.utils.async_utils import make_async
+from vllm.utils.func_utils import supports_kw
+
+from .base import BaseRenderer
+from .inputs import DictPrompt
+from .inputs.preprocess import parse_dec_only_prompt
+from .params import ChatParams
+
+logger = init_logger(__name__)
+
+
+# Maps (tokenizer.name_or_path, trust_remote_code) to the processor's chat
+# template, or to None when no template could be obtained.
+_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], str | None]()
+"""
+Used in `_try_get_processor_chat_template` to avoid calling
+`cached_get_processor` again if the processor fails to be loaded.
+
+This is needed because `lru_cache` does not cache when an exception happens.
+"""
+
+
+def _try_get_processor_chat_template(
+    tokenizer: HfTokenizer,
+    *,
+    trust_remote_code: bool,
+) -> str | None:
+    """Return the chat template of the tokenizer's processor, if it has one.
+
+    The outcome (including ``None`` for "no template" or "loading failed")
+    is memoised in ``_PROCESSOR_CHAT_TEMPLATES`` so the processor is looked
+    up at most once per ``(name_or_path, trust_remote_code)`` pair.
+    """
+    key = (tokenizer.name_or_path, trust_remote_code)
+    try:
+        return _PROCESSOR_CHAT_TEMPLATES[key]
+    except KeyError:
+        pass
+
+    from transformers import (
+        PreTrainedTokenizer,
+        PreTrainedTokenizerFast,
+        ProcessorMixin,
+    )
+
+    template = None
+    try:
+        processor = cached_get_processor(
+            tokenizer.name_or_path,
+            processor_cls=(
+                PreTrainedTokenizer,
+                PreTrainedTokenizerFast,
+                ProcessorMixin,
+            ),
+            trust_remote_code=trust_remote_code,
+        )
+        # Only a real processor (not a bare tokenizer) is a valid source.
+        if isinstance(processor, ProcessorMixin):
+            template = getattr(processor, "chat_template", None)
+    except Exception:
+        logger.debug(
+            "Failed to load AutoProcessor chat template for %s",
+            tokenizer.name_or_path,
+            exc_info=True,
+        )
+
+    _PROCESSOR_CHAT_TEMPLATES[key] = template
+    return template
+
+
+def resolve_chat_template(
+    tokenizer: HfTokenizer,
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
+    *,
+    model_config: "ModelConfig",
+) -> str | None:
+    """Pick the chat template to use, trying sources in priority order.
+
+    Order: the given template, the AutoProcessor template (only when no
+    tools are passed), the AutoTokenizer template, then a predefined
+    fallback file. Returns ``None`` when none of them yields a template.
+    """
+    # 1st priority: The given chat template
+    if chat_template is not None:
+        # Resolve template names (e.g. "tool_use") to actual Jinja content
+        # so that downstream kwargs detection can parse template variables.
+        return tokenizer.get_chat_template(chat_template, tools=tools)
+
+    # 2nd priority: AutoProcessor chat template, unless tool calling is enabled
+    if tools is None:
+        processor_template = _try_get_processor_chat_template(
+            tokenizer,
+            trust_remote_code=model_config.trust_remote_code,
+        )
+        if processor_template is not None:
+            return processor_template
+
+    # 3rd priority: AutoTokenizer chat template
+    try:
+        return tokenizer.get_chat_template(None, tools=tools)
+    except Exception:
+        logger.debug(
+            "Failed to load AutoTokenizer chat template for %s",
+            tokenizer.name_or_path,
+            exc_info=True,
+        )
+
+    # 4th priority: Predefined fallbacks
+    fallback_path = get_chat_template_fallback_path(
+        model_type=model_config.hf_config.model_type,
+        tokenizer_name_or_path=tokenizer.name_or_path,
+    )
+    if fallback_path is None:
+        logger.debug_once(
+            "There is no chat template fallback for %s", tokenizer.name_or_path
+        )
+        return None
+
+    logger.info_once(
+        "Loading chat template fallback for %s as there isn't one "
+        "defined on HF Hub.",
+        tokenizer.name_or_path,
+    )
+    return load_chat_template(fallback_path)
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Return True if *node* is a load of the variable named *varname*."""
+    return (
+        isinstance(node, jinja2.nodes.Name)
+        and node.ctx == "load"
+        and node.name == varname
+    )
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Return True if *node* is ``varname.key`` or ``varname[<const key>]``."""
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    if not isinstance(node, jinja2.nodes.Getitem):
+        return False
+
+    subscript = node.arg
+    return (
+        _is_var_access(node.node, varname)
+        and isinstance(subscript, jinja2.nodes.Const)
+        and subscript.value == key
+    )
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str | None = None,
+) -> bool:
+    """Return True if *node* reads *varname* (or ``varname[key]``).
+
+    Filters, tests and slice subscripts wrapped around the access are
+    peeled off first, so ``messages[1:] | reverse`` still counts as an
+    access to ``messages``.
+    """
+    while True:
+        if isinstance(node, jinja2.nodes.Filter):
+            # A filter without an operand cannot refer to the variable.
+            if node.node is None:
+                return False
+            node = node.node
+        elif isinstance(node, jinja2.nodes.Test):
+            node = node.node
+        elif isinstance(node, jinja2.nodes.Getitem) and isinstance(
+            node.arg, jinja2.nodes.Slice
+        ):
+            node = node.node
+        else:
+            break
+
+    if key:
+        return _is_attr_access(node, varname, key)
+    return _is_var_access(node, varname)
+
+
+def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
+    """Yield ``(node, name)`` for *varname* and every variable assigned from it.
+
+    The first pair is ``(root, varname)`` itself. After that, each
+    ``Assign`` node under *root* whose right-hand side reads a known alias
+    is yielded together with the name it assigns to, and that name is
+    queued so that aliases of aliases are found as well.
+    """
+    # Global variable that is implicitly defined at the root
+    yield root, varname
+
+    # Iterative BFS
+    related_varnames = deque([varname])
+    while related_varnames:
+        related_varname = related_varnames.popleft()
+
+        # The whole tree is rescanned for every alias taken off the queue.
+        for assign_ast in root.find_all(jinja2.nodes.Assign):
+            lhs = assign_ast.target
+            rhs = assign_ast.node
+
+            if _is_var_or_elems_access(rhs, related_varname):
+                assert isinstance(lhs, jinja2.nodes.Name)
+                yield assign_ast, lhs.name
+
+                # Avoid infinite looping for self-assignment
+                if lhs.name != related_varname:
+                    related_varnames.append(lhs.name)
+
+
+# NOTE: The proper way to handle this is to build a CFG so that we can handle
+# the scope in which each variable is defined, but that is too complicated
+def _iter_nodes_assign_messages_item(root: jinja2.nodes.Node):
+    """Yield ``(for_node, target_name)`` for loops over ``messages``.
+
+    A loop qualifies when its iterable reads ``messages`` or any variable
+    assigned from it; each qualifying loop is yielded once.
+    """
+    aliases = [
+        name for _, name in _iter_nodes_assign_var_or_elems(root, "messages")
+    ]
+
+    # Search for {%- for message in messages -%} loops
+    for for_node in root.find_all(jinja2.nodes.For):
+        iterable = for_node.iter
+        target = for_node.target
+
+        if any(_is_var_or_elems_access(iterable, name) for name in aliases):
+            assert isinstance(target, jinja2.nodes.Name)
+            yield for_node, target.name
+
+
+def _iter_nodes_assign_content_item(root: jinja2.nodes.Node):
+    """Yield ``(for_node, target_name)`` for loops over a message's content.
+
+    A loop qualifies when its iterable reads the ``content`` key/attribute
+    of any per-message loop variable; each qualifying loop is yielded once.
+    """
+    message_names = [name for _, name in _iter_nodes_assign_messages_item(root)]
+
+    # Search for {%- for content in message['content'] -%} loops
+    for for_node in root.find_all(jinja2.nodes.For):
+        iterable = for_node.iter
+        target = for_node.target
+
+        if any(
+            _is_var_or_elems_access(iterable, name, "content")
+            for name in message_names
+        ):
+            assert isinstance(target, jinja2.nodes.Name)
+            yield for_node, target.name
+
+
+def _try_extract_ast(chat_template: str) -> jinja2.nodes.Template | None:
+    """Parse *chat_template* into a Jinja AST, or return None on any error."""
+    import transformers.utils.chat_template_utils as hf_chat_utils
+
+    try:
+        compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        ast = compiled.environment.parse(chat_template)
+    except Exception:
+        logger.exception("Error when compiling Jinja template")
+        return None
+
+    return ast
+
+
+@lru_cache(maxsize=32)
+def _detect_content_format(
+    chat_template: str,
+    *,
+    default: ChatTemplateContentFormat,
+) -> ChatTemplateContentFormat:
+    """Guess the content format a chat template expects.
+
+    Returns "openai" when the template contains a loop over a message's
+    ``content``, "string" when it contains none, and *default* when the
+    template cannot be compiled or its AST cannot be analysed.
+    """
+    template_ast = _try_extract_ast(chat_template)
+    if template_ast is None:
+        return default
+
+    # Creating the generator runs nothing; the scan starts on next().
+    content_loops = _iter_nodes_assign_content_item(template_ast)
+    try:
+        next(content_loops)
+    except StopIteration:
+        return "string"
+    except Exception:
+        logger.exception("Error when parsing AST of Jinja template")
+        return default
+
+    return "openai"
+
+
+def _is_glm_model(tokenizer: HfTokenizer, model_config: "ModelConfig") -> bool:
+    """Check if this is a GLM model that requires string content format.
+
+    GLM models (GLM-4, GLM-4.5, GLM-5.x) have a chat template that incorrectly
+    triggers "openai" content format detection because they iterate over
+    m.content for tool responses. However, the template expects string content
+    for tool messages (checking `m.content is string`).
+
+    This detection ensures we force "string" format for GLM models.
+
+    Args:
+        tokenizer: Tokenizer whose ``name_or_path`` is searched for GLM
+            markers (case-insensitively).
+        model_config: Model config whose ``hf_config.model_type`` is checked
+            for the substring "glm" (case-insensitively).
+
+    Returns:
+        True if either source identifies a GLM model, False otherwise. A
+        missing, None or non-string value on either source is treated as
+        "no match" instead of raising.
+    """
+    # Check tokenizer name/path for GLM indicators. The value may be absent
+    # or None, so normalise it before calling str methods on it.
+    name_or_path = str(getattr(tokenizer, "name_or_path", None) or "").lower()
+    glm_indicators = ("glm-4", "glm-5", "glm4", "glm5", "zai-org/glm")
+    if any(ind in name_or_path for ind in glm_indicators):
+        return True
+
+    # Check model type in config; getattr on None simply yields the default.
+    hf_config = getattr(model_config, "hf_config", None)
+    model_type = getattr(hf_config, "model_type", None)
+    return isinstance(model_type, str) and "glm" in model_type.lower()
+
+
+def _resolve_chat_template_content_format(
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
+    tokenizer: HfTokenizer,
+    *,
+    model_config: "ModelConfig",
+) -> ChatTemplateContentFormat:
+    """Detect the content format for the template that would be applied.
+
+    GLM models short-circuit to "string". Otherwise the template is
+    resolved and analysed; "string" is returned when no template text is
+    available.
+    """
+    # GLM models require "string" content format for tool responses to work
+    # The template has `{% for tr in m.content %}` which triggers "openai"
+    # detection, but then checks `m.content is string` which fails for arrays.
+    if _is_glm_model(tokenizer, model_config):
+        logger.debug(
+            "Forcing 'string' content format for GLM model: %s",
+            tokenizer.name_or_path,
+        )
+        return "string"
+
+    resolved = resolve_chat_template(
+        tokenizer,
+        chat_template=chat_template,
+        tools=tools,
+        model_config=model_config,
+    )
+
+    if isinstance(resolved, str):
+        jinja_text = resolved
+    else:
+        jinja_text = load_chat_template(chat_template, is_literal=True)
+
+    if jinja_text is None:
+        return "string"
+    return _detect_content_format(jinja_text, default="string")
+
+
+@lru_cache
+def _log_chat_template_content_format(
+    chat_template: str | None,  # For caching purposes
+    given_format: ChatTemplateContentFormatOption,
+    detected_format: ChatTemplateContentFormatOption,
+):
+    """Log the detected content format once per distinct argument tuple.
+
+    Also warns when an explicitly given format disagrees with the detected
+    one. `lru_cache` suppresses repeats for identical arguments.
+    """
+    logger.info(
+        "Detected the chat template content format to be '%s'. "
+        "You can set `--chat-template-content-format` to override this.",
+        detected_format,
+    )
+
+    if given_format in ("auto", detected_format):
+        return
+
+    logger.warning(
+        "You specified `--chat-template-content-format %s` "
+        "which is different from the detected format '%s'. "
+        "If our automatic detection is incorrect, please consider "
+        "opening a GitHub issue so that we can improve it: "
+        "https://github.com/vllm-project/vllm/issues/new/choose",
+        given_format,
+        detected_format,
+    )
+
+
+def resolve_chat_template_content_format(
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
+    given_format: ChatTemplateContentFormatOption,
+    tokenizer: HfTokenizer,
+    *,
+    model_config: "ModelConfig",
+) -> ChatTemplateContentFormat:
+    """Return the content format to use for chat messages.
+
+    An explicit *given_format* is returned unchanged; "auto" triggers
+    detection from the chat template, and the result is logged.
+    """
+    if given_format == "auto":
+        detected = _resolve_chat_template_content_format(
+            chat_template,
+            tools,
+            tokenizer,
+            model_config=model_config,
+        )
+        _log_chat_template_content_format(
+            chat_template,
+            given_format=given_format,
+            detected_format=detected,
+        )
+        return detected
+
+    return given_format
+
+
+# adapted from https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/utils/chat_template_utils.py#L398-L412
+# only preserve the parse function used to resolve chat template kwargs
+class AssistantTracker(jinja2.ext.Extension):
+    """Jinja extension that lets templates using ``{% generation %}`` parse.
+
+    Registers the ``generation`` tag so that a block closed by
+    ``{% endgeneration %}`` is accepted by the parser.
+    """
+
+    tags = {"generation"}
+
+    def parse(self, parser: jinja2.parser.Parser) -> jinja2.nodes.Node:
+        """Parse one ``generation`` block and return it as a ``CallBlock``."""
+        # Advance the token stream by one token and remember its line number.
+        lineno = next(parser.stream).lineno
+        # Collect the block body up to (and dropping) the `endgeneration` tag.
+        body = parser.parse_statements(("name:endgeneration",), drop_needle=True)
+        # NOTE(review): `_generation_support` is not defined on this class;
+        # presumably fine because the parsed template is only inspected,
+        # never rendered - confirm before rendering with this extension.
+        call = self.call_method("_generation_support")
+        call_block = jinja2.nodes.CallBlock(call, [], [], body)
+        return call_block.set_lineno(lineno)
+
+
+def _resolve_chat_template_kwargs(chat_template: str) -> Set[str]:
+    """Return the names of the variables *chat_template* reads but never sets."""
+    sandbox = jinja2.sandbox.ImmutableSandboxedEnvironment(
+        extensions=[AssistantTracker, jinja2.ext.loopcontrols],
+        lstrip_blocks=True,
+        trim_blocks=True,
+    )
+    return jinja2.meta.find_undeclared_variables(sandbox.parse(chat_template))
+
+
+_cached_resolve_chat_template_kwargs = lru_cache(_resolve_chat_template_kwargs)
+
+
+@lru_cache
+def _get_hf_base_chat_template_params() -> frozenset[str]:
+    """Return the named parameters of HF's base ``apply_chat_template``.
+
+    The names are read from ``PreTrainedTokenizer.apply_chat_template`` so
+    that tokenizers which take these standard parameters through
+    ``**kwargs`` are still recognised as accepting them. The ``*args`` and
+    ``**kwargs`` placeholders themselves are left out.
+    """
+    from transformers import PreTrainedTokenizer
+
+    variadic_kinds = (
+        inspect.Parameter.VAR_KEYWORD,
+        inspect.Parameter.VAR_POSITIONAL,
+    )
+
+    # Read signature from HF's base class - the single source of truth
+    signature = inspect.signature(PreTrainedTokenizer.apply_chat_template)
+    named = [
+        name
+        for name, param in signature.parameters.items()
+        if param.kind not in variadic_kinds
+    ]
+    return frozenset(named)
+
+
+def resolve_chat_template_kwargs(
+    tokenizer: HfTokenizer,
+    chat_template: str,
+    chat_template_kwargs: dict[str, Any],
+    raise_on_unexpected: bool = True,
+) -> dict[str, Any]:
+    """Keep only the request kwargs that the chat template call can accept.
+
+    A kwarg is kept when it is an explicit parameter of
+    ``tokenizer.apply_chat_template``, a variable read by the template, or
+    a standard HF ``apply_chat_template`` parameter. ``chat_template`` and
+    ``tokenize`` are never passed through.
+
+    Raises:
+        ValueError: if *raise_on_unexpected* is true and the request
+            contains ``chat_template`` or ``tokenize``.
+    """
+    # We exclude chat_template from kwargs here, because
+    # chat template has been already resolved at this stage
+    reserved = {"chat_template", "tokenize"}
+    if raise_on_unexpected:
+        offending = reserved & chat_template_kwargs.keys()
+        if offending:
+            raise ValueError(
+                "Found unexpected chat template kwargs from request: "
+                f"{offending}"
+            )
+
+    explicit_params = {
+        name
+        for name in chat_template_kwargs
+        if supports_kw(tokenizer.apply_chat_template, name, allow_var_kwargs=False)
+    }
+    template_vars = _cached_resolve_chat_template_kwargs(chat_template)
+
+    # Allow standard HF parameters even if tokenizer uses **kwargs to receive them
+    hf_base_params = _get_hf_base_chat_template_params()
+
+    accepted = (explicit_params | template_vars | hf_base_params) - reserved
+    return {
+        name: value
+        for name, value in chat_template_kwargs.items()
+        if name in accepted
+    }
+
+
+@overload
+def safe_apply_chat_template(
+    model_config: "ModelConfig",
+    tokenizer: HfTokenizer,
+    conversation: list[ConversationMessage],
+    *,
+    tools: list[dict[str, Any]] | None = ...,
+    chat_template: str | None = ...,
+    tokenize: Literal[True] = ...,
+    **kwargs,
+) -> list[int]: ...
+@overload
+def safe_apply_chat_template(
+    model_config: "ModelConfig",
+    tokenizer: HfTokenizer,
+    conversation: list[ConversationMessage],
+    *,
+    tools: list[dict[str, Any]] | None = ...,
+    chat_template: str | None = ...,
+    tokenize: Literal[False] = ...,
+    **kwargs,
+) -> str: ...
+def safe_apply_chat_template(
+    model_config: "ModelConfig",
+    tokenizer: HfTokenizer,
+    conversation: list[ConversationMessage],
+    *,
+    tools: list[dict[str, Any]] | None = None,
+    chat_template: str | None = None,
+    tokenize: bool = True,
+    **kwargs,
+) -> str | list[int]:
+    chat_template = resolve_chat_template(
+        tokenizer,
+        chat_template=chat_template,
+        tools=tools,
+        model_config=model_config,
+    )
+    if chat_template is None:
+        raise ChatTemplateResolutionError(
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        )
+
+    resolved_kwargs = resolve_chat_template_kwargs(
+        tokenizer=tokenizer,
+        chat_template=chat_template,
+        chat_template_kwargs=kwargs,
+    )
+
+    try:
+        return tokenizer.apply_chat_template(
+            conversation=conversation,  # type: ignore[arg-type]
+            tools=tools,  # type: ignore[arg-type]
+            chat_template=chat_template,
+            tokenize=tokenize,
+            **resolved_kwargs,
+        )
+    # External library exceptions can sometimes occur despite the framework's
+    # internal exception management capabilities.
+    except Exception as e:
+        # Log and report any library-related exceptions for further
+        # investigation.
+        logger.exception(
+            "An error occurred in `transformers` while applying chat template"
+        )
+        raise ValueError(str(e)) from e
+
+
+def rebuild_mm_uuids_from_mm_data(
+    mm_uuids: MultiModalUUIDDict,
+    mm_data: MultiModalDataDict,
+) -> MultiModalUUIDDict:
+    """Rebuild mm_uuids after vision_chunk processing.
+
+    When videos are split into chunks, the original UUIDs need to be updated
+    to reflect the new UUIDs generated for each chunk.
+
+    Args:
+        mm_uuids: Original UUIDs dictionary
+        mm_data: Processed multimodal data with vision_chunk items
+
+    Returns:
+        Updated UUIDs dictionary with chunk UUIDs
+    """
+    vision_chunks = mm_data.get("vision_chunk")
+    if vision_chunks is None:
+        return mm_uuids
+
+    assert all(isinstance(item, dict) for item in vision_chunks), (
+        "Expected all vision_chunk items to be dicts"
+    )
+    vision_chunks = cast(list[dict[str, Any]], vision_chunks)
+    vision_chunk_uuids = [
+        uuid_val for item in vision_chunks if (uuid_val := item.get("uuid")) is not None
+    ]
+
+    if vision_chunk_uuids:
+        mm_uuids = dict(mm_uuids)
+        mm_uuids["vision_chunk"] = vision_chunk_uuids
+
+    return mm_uuids
+
+
+def build_video_prompts_from_mm_data(
+    mm_data: MultiModalDataDict,
+) -> list[str]:
+    """Build video prompts from vision_chunk data.
+
+    Collects prompts from video chunks and groups them by video_idx.
+
+    Args:
+        mm_data: Processed multimodal data with vision_chunk items
+
+    Returns:
+        List of video prompts, one per video.
+    """
+    vision_chunks = mm_data.get("vision_chunk")
+    if vision_chunks is None:
+        return []
+
+    # Group chunks by video_idx
+    video_prompts_dict: dict[int, list[str]] = defaultdict(list)
+
+    for item in vision_chunks:
+        # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
+        assert isinstance(item, dict)
+        if item.get("type") == "video_chunk":
+            video_idx = item.get("video_idx", 0)
+            prompt = item.get("prompt", "")
+            video_prompts_dict[video_idx].append(prompt)
+
+    # Build prompts in video order
+    video_prompts = [
+        "".join(video_prompts_dict[video_idx])
+        for video_idx in sorted(video_prompts_dict.keys())
+    ]
+
+    return video_prompts
+
+
+def replace_vision_chunk_video_placeholder(
+    prompt_raw: str | list[int],
+    mm_data: MultiModalDataDict,
+    video_placeholder: str | None,
+) -> str | list[int]:
+    # get video placeholder, replace it with runtime video-chunk prompts
+    if video_placeholder and isinstance(prompt_raw, str):
+        video_prompts = build_video_prompts_from_mm_data(mm_data)
+
+        # replace in order
+        prompt_raw_parts = prompt_raw.split(video_placeholder)
+        if len(prompt_raw_parts) == len(video_prompts) + 1:
+            prompt_raw = "".join(
+                itertools.chain.from_iterable(zip(prompt_raw_parts, video_prompts))
+            )
+            prompt_raw += prompt_raw_parts[-1]
+        else:
+            logger.warning(
+                "Number of video placeholders (%d) does not match "
+                "number of videos (%d) in the request.",
+                len(prompt_raw_parts) - 1,
+                len(video_prompts),
+            )
+    return prompt_raw
+
+
+class HfRenderer(BaseRenderer[HfTokenizer]):
+    def __init__(
+        self,
+        config: VllmConfig,
+        tokenizer: HfTokenizer | None,
+    ) -> None:
+        super().__init__(config, tokenizer)
+
+        self.use_unified_vision_chunk = getattr(
+            config.model_config.hf_config, "use_unified_vision_chunk", False
+        )
+
+        self._apply_chat_template_async = make_async(
+            safe_apply_chat_template, executor=self._executor
+        )
+
+    def render_messages(
+        self,
+        messages: list[ChatCompletionMessageParam],
+        params: ChatParams,
+    ) -> tuple[list[ConversationMessage], DictPrompt]:
+        model_config = self.model_config
+        tokenizer = self.get_tokenizer()
+
+        conversation, mm_data, mm_uuids = parse_chat_messages(
+            messages,
+            model_config,
+            content_format=resolve_chat_template_content_format(
+                chat_template=params.chat_template,
+                tools=params.chat_template_kwargs.get("tools"),
+                given_format=params.chat_template_content_format,
+                tokenizer=tokenizer,
+                model_config=model_config,
+            ),
+            media_io_kwargs=params.media_io_kwargs,
+            mm_processor_kwargs=params.mm_processor_kwargs,
+        )
+
+        prompt_raw = safe_apply_chat_template(
+            model_config,
+            tokenizer,
+            conversation,
+            **params.get_apply_chat_template_kwargs(),
+        )
+
+        # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
+        # model which uses unified vision chunks for both images and videos.
+        if (
+            self.use_unified_vision_chunk
+            and mm_uuids is not None
+            and mm_data is not None
+        ):
+            mm_uuids = rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)
+
+            # get video placeholder, replace it with runtime video-chunk prompts
+            video_placeholder = getattr(
+                model_config.hf_config, "video_placeholder", None
+            )
+            prompt_raw = cast(
+                list[int],
+                replace_vision_chunk_video_placeholder(
+                    prompt_raw,
+                    mm_data,
+                    video_placeholder,
+                ),
+            )
+
+        prompt = parse_dec_only_prompt(prompt_raw)
+        if mm_data is not None:
+            prompt["multi_modal_data"] = mm_data
+        if mm_uuids is not None:
+            prompt["multi_modal_uuids"] = mm_uuids
+
+        return conversation, prompt
+
+    async def render_messages_async(
+        self,
+        messages: list[ChatCompletionMessageParam],
+        params: ChatParams,
+    ) -> tuple[list[ConversationMessage], DictPrompt]:
+        model_config = self.model_config
+        tokenizer = self.get_tokenizer()
+
+        conversation, mm_data, mm_uuids = await parse_chat_messages_async(
+            messages,
+            model_config,
+            content_format=resolve_chat_template_content_format(
+                chat_template=params.chat_template,
+                tools=params.chat_template_kwargs.get("tools"),
+                given_format=params.chat_template_content_format,
+                tokenizer=tokenizer,
+                model_config=model_config,
+            ),
+            media_io_kwargs=params.media_io_kwargs,
+            mm_processor_kwargs=params.mm_processor_kwargs,
+        )
+
+        prompt_raw = await self._apply_chat_template_async(
+            model_config,
+            tokenizer,
+            conversation,
+            **params.get_apply_chat_template_kwargs(),
+        )
+
+        # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
+        # model which uses unified vision chunks for both images and videos.
+        if (
+            self.use_unified_vision_chunk
+            and mm_uuids is not None
+            and mm_data is not None
+        ):
+            # get video placeholder, replace it with runtime video-chunk prompts
+            video_placeholder = getattr(
+                model_config.hf_config, "video_placeholder", None
+            )
+            prompt_raw = cast(
+                list[int],
+                replace_vision_chunk_video_placeholder(
+                    prompt_raw,
+                    mm_data,
+                    video_placeholder,
+                ),
+            )
+
+        prompt = parse_dec_only_prompt(prompt_raw)
+        if mm_data is not None:
+            prompt["multi_modal_data"] = mm_data
+        if mm_uuids is not None:
+            prompt["multi_modal_uuids"] = mm_uuids
+
+        return conversation, prompt