From 139e617ed0535e259f426077529b350f5513431e Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 9 Apr 2026 06:21:04 +0000 Subject: [PATCH] Clean up README with full bug analysis for ZAI --- README.md | 158 ++++++++++++++---------- tests/test_tool_debug.py | 221 ---------------------------------- tests/test_tool_visibility.py | 200 ------------------------------ 3 files changed, 93 insertions(+), 486 deletions(-) delete mode 100644 tests/test_tool_debug.py delete mode 100644 tests/test_tool_visibility.py diff --git a/README.md b/README.md index 311fba8..f1f0716 100644 --- a/README.md +++ b/README.md @@ -1,46 +1,90 @@ -# vLLM GLM Tool Parser Patch +# vLLM GLM-5.x Tool Calling Patches -Patches vLLM's GLM-4/GLM-5.1 tool parser to fix multiple issues with tool call handling. +Fixes two critical bugs that prevent GLM models from working correctly with OpenAI-compatible tool calling in vLLM. -## Issues Fixed +## Summary -### Issue 1: Tool Response Content Ignored (CRITICAL) +GLM-5.x models would either crash or silently drop tool response content when using the OpenAI chat completions API with tools. Two separate bugs were responsible: -**Symptom:** When the model makes a tool call and receives a response, it would act as if the response was empty ("The function returned no output") even though valid content was provided. +1. **Tool parser regex mismatch** — Parser expected newline between function name and arguments, but GLM's template does not include one +2. **Content format detection failure** — vLLM auto-detected "openai" format incorrectly, causing tool response content to be dropped -**Root Cause:** Two bugs working together: +--- -1. **Tool parser regex mismatch** (`glm4_moe_tool_parser.py`): The `func_detail_regex` required a newline between the function name and first argument tag, but GLM-5.1's chat template doesn't include that newline. The regex silently failed to match. +## Bug #1: Tool Parser Regex Mismatch -2. 
**Content format detection wrong** (`vllm/renderers/hf.py`): vLLM detected "openai" content format because the GLM template has `{% for tr in m.content %}` for tool responses. But the template then checks `m.content is string` which is False for OpenAI format arrays, causing content to be dropped. +### Problem -**Model output format (no newline after name):** -``` -[TOOL_CALL_START]function_name[ARG_KEY]value[ARG_END]...[TOOL_CALL_END] -``` +The `func_detail_regex` in `glm4_moe_tool_parser.py` required a literal newline between the function name and the first argument tag. + +GLM-5.x chat template outputs tool calls without that newline - the function name is immediately followed by the first argument tag. The regex would fail to match, causing tool call extraction to fail silently. + +### Fix + +Changed the regex to use `\s*` (optional whitespace) instead of mandatory `\n`, and made the arguments group optional for zero-argument calls: -**Old regex (broken):** ```python -r"\[TOOL_CALL_START\]([^\n]*)\n(.*)\[TOOL_CALL_END\]" # Requires \n after name +# Before +r"\[TOOL_START\]([^\n]*)\n(.*)\[TOOL_END\]" + +# After +r"\[TOOL_START\]\s*([\w.\-]+)\s*((?:\[ARG_KEY\].*)?)\s*\[TOOL_END\]" ``` -**Fixed regex:** +Also fixed `tc_args_raw` to default to empty string, preventing crashes on zero-argument tool calls. + +**File:** `glm4_moe_tool_parser.py` + +--- + +## Bug #2: Content Format Detection Failure + +### Problem + +vLLM's `_detect_content_format()` function analyzes Jinja templates to determine whether message content should be formatted as strings or OpenAI-style arrays. + +For GLM-5.x, the template contains a loop `{% for tr in m.content %}` for handling tool responses with multiple results. vLLM saw this loop and detected "openai" format, converting tool message content to: + +```json +[{"type": "text", "text": "the actual content"}] +``` + +However, the GLM template's first branch checks `{% if m.content is string %}` before using that loop. 
Since arrays are not strings, the template took the wrong branch and the content was lost. + +The model would respond: *"The function returned no output"* even though valid content was provided. + +### Root Cause + +The template has two branches for tool messages: + +```jinja +{%- if m.content is string %} + {{ '' + m.content + '' }} +{%- else %} + {% for tr in m.content %} + ... +{% endif %} +``` + +vLLM's detection saw the `for` loop and chose "openai" format. But the `is string` check failed for arrays, and the `else` branch expected objects with `.name` properties that `{"type": "text"}` objects don't have. + +### Fix + +Added `_is_glm_model()` detection function to `vllm/renderers/hf.py` that forces "string" content format for GLM models, bypassing the incorrect auto-detection: + ```python -r"\[TOOL_CALL_START\]\s*([\w.\-]+)\s*((?:\[ARG_KEY\].*)?)\s*\[TOOL_CALL_END\]" +def _is_glm_model(tokenizer: HfTokenizer, model_config: "ModelConfig") -> bool: + """Check if this is a GLM model that requires string content format.""" + name_or_path = tokenizer.name_or_path.lower() + glm_indicators = ["glm-4", "glm-5", "glm4", "glm5", "zai-org/glm"] + return any(ind in name_or_path for ind in glm_indicators) ``` -**Content format fix:** -Added `_is_glm_model()` detection to force "string" content format for GLM models, bypassing the incorrect auto-detection. +Called in `_resolve_chat_template_content_format()` before auto-detection. -### Issue 2: Zero-Argument Tool Calls Crash +**File:** `vllm_patches/hf.py` -**Symptom:** `TypeError: 'NoneType' object is not iterable` when tool has no arguments. - -**Fix:** The `tc_args_raw` is now defaulted to empty string: `tc_args_raw = tc_detail.group(2) or ""` - -### Issue 3: Streaming Path vs Non-Streaming Path Inconsistency - -Both paths now use the same robust extraction helpers for consistency. +--- ## Files @@ -49,57 +93,41 @@ Both paths now use the same robust extraction helpers for consistency. 
| `glm4_moe_tool_parser.py` | Fixed tool parser (regex fix) | | `utils.py` | Utility functions for partial JSON/tag handling | | `vllm_patches/hf.py` | Patched renderer (content format fix) | -| `Dockerfile` | Overlays patched files onto base image | -| `Jenkinsfile` | CI/CD pipeline for building and pushing | -| `tests/` | Test suite for tool call validation | +| `Dockerfile` | Overlays patched files onto base vLLM image | -## Testing - -### Requirements - -```bash -pip install httpx regex -``` - -### Running Tests - -```bash -export VLLM_API_BASE="https://api.vultrinference.com/v1" -export VLLM_API_KEY="your-api-key" -export VLLM_MODEL="zai-org/GLM-5.1-FP8" - -python tests/test_tool_diagnosis.py -``` - -### Test Cases - -| Test | Description | -|------|-------------| -| `test_simple_tool_response` | Verifies model can see tool response content | -| `test_without_tools_param` | Tests behavior without tools param in follow-up | -| `test_different_content_formats` | String vs array content formats | +--- ## Deployment -### Jenkins Pipeline +### Docker Build ```bash -curl -X POST "https://jenkins.sweetapi.com/job/vllm-glm-build/buildWithParameters" \ - -u "admin:TOKEN" \ - -d "IMAGE_TAG=latest" +docker build -t your-registry/vllm-glm51-patched:latest . +docker push your-registry/vllm-glm51-patched:latest ``` -### Manual Build +### Kubernetes -```bash -docker build -t atl.vultrcr.com/vllm/vllm-glm51-patched:latest . -docker push atl.vultrcr.com/vllm/vllm-glm51-patched:latest +Update your deployment to use the patched image and ensure these vLLM args: + +```yaml +extraArgs: + - "--tool-call-parser=glm47" + - "--enable-auto-tool-choice" ``` -### Images +--- -- Base: `vllm/vllm-openai:glm51-cu130` -- Output: `atl.vultrcr.com/vllm/vllm-glm51-patched:` +## Verification + +Tool response content is now properly passed to the model: + +``` +Model response: The test function was called successfully! It returned the value **42**. 
+PASS: Model referenced the tool result (42) +``` + +--- ## Related diff --git a/tests/test_tool_debug.py b/tests/test_tool_debug.py deleted file mode 100644 index 1ae018c..0000000 --- a/tests/test_tool_debug.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python3 -""" -Debug test to see what prompt the model actually receives. -""" - -import httpx -import json - -API_BASE = "https://api.vultrinference.com/v1" -API_KEY = "26DN7PNUB3YRBEPCDNMXKKD6ZODMETRSMOZQ" -MODEL = "zai-org/GLM-5.1-FP8" - - -def test_with_echo(): - """ - Test with echo=True to see the prompt tokens. - """ - - messages = [ - {"role": "user", "content": "Call the test function"}, - { - "role": "assistant", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "test_func", "arguments": "{}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "VALUE_42" - } - ] - - tools = [{ - "type": "function", - "function": { - "name": "test_func", - "description": "A test function", - "parameters": {"type": "object", "properties": {}} - } - }] - - with httpx.Client(timeout=60.0) as client: - # Try to get prompt logprobs which might show us the prompt - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 100, - "logprobs": True, - "top_logprobs": 1, - "echo": True # Return prompt tokens - } - ) - - result = response.json() - - print("Full response:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - -def test_tool_only_message(): - """ - Test if a tool-only message (no tools param) works. - This is what worked in the previous test. 
- """ - - messages = [ - {"role": "user", "content": "What is 2+2?"}, - { - "role": "assistant", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "calc", "arguments": "{}"} - }], - "content": None - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "The answer is 42" - } - ] - - # NO tools param - this worked before - with httpx.Client(timeout=60.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - # NO tools param - "stream": False, - "max_tokens": 100 - } - ) - - result = response.json() - if "choices" in result: - content = result["choices"][0]["message"]["content"] - print(f"\nNo tools param - Response: {content}") - print(f"Contains 42: {'42' in content}") - else: - print(f"\nNo tools param - Error: {result}") - - -def test_with_tools_param(): - """ - Test WITH tools param - this is what fails. 
- """ - - messages = [ - {"role": "user", "content": "What is 2+2?"}, - { - "role": "assistant", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "calc", "arguments": "{}"} - }], - "content": None - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "The answer is 42" - } - ] - - tools = [{ - "type": "function", - "function": { - "name": "calc", - "description": "Calculator", - "parameters": {"type": "object", "properties": {}} - } - }] - - with httpx.Client(timeout=60.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, # WITH tools param - "stream": False, - "max_tokens": 100 - } - ) - - result = response.json() - content = result["choices"][0]["message"]["content"] - print(f"\nWith tools param - Response: {content}") - print(f"Contains 42: {'42' in content}") - - -def test_without_assistant_tool_calls(): - """ - Test if the issue is the assistant message with tool_calls. - What if we just send user -> tool response? 
- """ - - messages = [ - {"role": "user", "content": "The calculator returned this result"}, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "VALUE_IS_42" - } - ] - - with httpx.Client(timeout=60.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "stream": False, - "max_tokens": 100 - } - ) - - result = response.json() - if "choices" in result: - content = result["choices"][0]["message"]["content"] - print(f"\nNo assistant tool_calls - Response: {content}") - print(f"Contains 42: {'42' in content}") - else: - print(f"\nError: {result}") - - -if __name__ == "__main__": - print("=" * 60) - print("Debugging tool response visibility") - print("=" * 60) - - test_tool_only_message() - test_with_tools_param() - test_without_assistant_tool_calls() diff --git a/tests/test_tool_visibility.py b/tests/test_tool_visibility.py deleted file mode 100644 index 540455b..0000000 --- a/tests/test_tool_visibility.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 -""" -Minimal test - is the tool response content being passed to the model? -""" - -import httpx -import json - -API_BASE = "https://api.vultrinference.com/v1" -API_KEY = "26DN7PNUB3YRBEPCDNMXKKD6ZODMETRSMOZQ" -MODEL = "zai-org/GLM-5.1-FP8" - - -def test_direct_prompt(): - """ - If we could send a direct prompt, what would it look like? - - GLM-5.1 expects tool responses in tags: - {"result": "42"} - - Let's test if the model can see content in that format. 
- """ - - # Simulate what the prompt SHOULD look like after chat template - messages = [ - {"role": "user", "content": "What did the function return?"}, - { - "role": "assistant", - "content": "I'll call the function.", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "get_value", "arguments": "{}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "UNIQUE_MARKER_42" - } - ] - - tools = [{ - "type": "function", - "function": { - "name": "get_value", - "description": "Get a value", - "parameters": {"type": "object", "properties": {}} - } - }] - - with httpx.Client(timeout=60.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 100 - } - ) - - result = response.json() - - if "choices" in result: - content = result["choices"][0]["message"]["content"] - print(f"Model response: {content}") - print(f"Contains UNIQUE_MARKER_42: {'UNIQUE_MARKER_42' in content}") - else: - print(f"Error: {result}") - - -def test_fake_tool_response_in_user_message(): - """ - Test: What if we put the tool response in a user message instead? - This bypasses the role="tool" handling entirely. 
- """ - - messages = [ - {"role": "user", "content": "What did the function return?"}, - { - "role": "assistant", - "content": "I called the function.", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "get_value", "arguments": "{}"} - }] - }, - # Instead of role="tool", use user message - {"role": "user", "content": "The function returned: UNIQUE_MARKER_42"} - ] - - tools = [{ - "type": "function", - "function": { - "name": "get_value", - "description": "Get a value", - "parameters": {"type": "object", "properties": {}} - } - }] - - with httpx.Client(timeout=60.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 100 - } - ) - - result = response.json() - - if "choices" in result: - content = result["choices"][0]["message"]["content"] - print(f"\nUser message hack - Model response: {content}") - print(f"Contains UNIQUE_MARKER_42: {'UNIQUE_MARKER_42' in content}") - else: - print(f"Error: {result}") - - -def test_tool_response_as_observation_format(): - """ - Test: What if we format the tool response in the GLM expected format? 
- GLM expects: content - """ - - # Try putting the observations tag in the content - messages = [ - {"role": "user", "content": "What did the function return?"}, - { - "role": "assistant", - "content": "I called the function.", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "get_value", "arguments": "{}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "UNIQUE_MARKER_42" - } - ] - - tools = [{ - "type": "function", - "function": { - "name": "get_value", - "description": "Get a value", - "parameters": {"type": "object", "properties": {}} - } - }] - - with httpx.Client(timeout=60.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 100 - } - ) - - result = response.json() - - if "choices" in result: - content = result["choices"][0]["message"]["content"] - print(f"\nWith tags - Model response: {content}") - print(f"Contains UNIQUE_MARKER_42: {'UNIQUE_MARKER_42' in content}") - else: - print(f"Error: {result}") - - -if __name__ == "__main__": - print("Testing tool response visibility") - print("=" * 60) - - test_direct_prompt() - test_fake_tool_response_in_user_message() - test_tool_response_as_observation_format()