Add hf.py patch to force string content format for GLM models

- Tool response content was being dropped because vLLM detected
  'openai' content format incorrectly for GLM templates
- Added _is_glm_model() detection to force 'string' format
- Updated Dockerfile to include hf.py patch
- Added debug tests for tool visibility
This commit is contained in:
2026-04-09 05:20:47 +00:00
parent 8d5da5750d
commit aa4f667ab8
5 changed files with 1206 additions and 6 deletions

221
tests/test_tool_debug.py Normal file
View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Debug test to see what prompt the model actually receives.
"""
import httpx
import json
# Base URL of the OpenAI-compatible inference endpoint under test.
API_BASE = "https://api.vultrinference.com/v1"
# NOTE(review): a live-looking API key is hardcoded and committed to source --
# rotate it and load it from an environment variable instead.
API_KEY = "26DN7PNUB3YRBEPCDNMXKKD6ZODMETRSMOZQ"
# GLM model being investigated for dropped tool-response content.
MODEL = "zai-org/GLM-5.1-FP8"
def test_with_echo():
    """Request echo + logprobs so the response may reveal the rendered prompt.

    Sends a user -> assistant(tool_calls) -> tool conversation with a tool
    definition, asking the server to echo prompt tokens back, then dumps the
    full JSON response for inspection.
    """
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "user", "content": "Call the test function"},
            {
                "role": "assistant",
                "tool_calls": [{
                    "id": "call_123",
                    "type": "function",
                    "function": {"name": "test_func", "arguments": "{}"},
                }],
            },
            {
                "role": "tool",
                "tool_call_id": "call_123",
                "content": "VALUE_42",
            },
        ],
        "tools": [{
            "type": "function",
            "function": {
                "name": "test_func",
                "description": "A test function",
                "parameters": {"type": "object", "properties": {}},
            },
        }],
        "stream": False,
        "max_tokens": 100,
        "logprobs": True,
        "top_logprobs": 1,
        "echo": True,  # ask the server to return the prompt tokens too
    }
    auth_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    with httpx.Client(timeout=60.0) as client:
        resp = client.post(
            f"{API_BASE}/chat/completions",
            headers=auth_headers,
            json=payload,
        )
        result = resp.json()
    print("Full response:")
    print(json.dumps(result, indent=2, ensure_ascii=False))
def test_tool_only_message():
    """Replay the variant that worked before: tool message, no `tools` field.

    Sends the tool-call round trip without a top-level "tools" parameter and
    reports whether the model can see the tool response ("42").
    """
    conversation = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"},
            }],
            "content": None,
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 42",
        },
    ]
    # Deliberately omit the "tools" parameter -- this variant succeeded
    # in the previous test run.
    request_body = {
        "model": MODEL,
        "messages": conversation,
        "stream": False,
        "max_tokens": 100,
    }
    with httpx.Client(timeout=60.0) as client:
        resp = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            json=request_body,
        )
    data = resp.json()
    if "choices" not in data:
        print(f"\nNo tools param - Error: {data}")
        return
    content = data["choices"][0]["message"]["content"]
    print(f"\nNo tools param - Response: {content}")
    print(f"Contains 42: {'42' in content}")
def test_with_tools_param():
    """
    Test WITH tools param - this is what fails.

    Same conversation as test_tool_only_message(), but the request also
    carries a "tools" definition, which is the code path under suspicion.
    """
    messages = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }],
            "content": None
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 42"
        }
    ]
    tools = [{
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculator",
            "parameters": {"type": "object", "properties": {}}
        }
    }]
    with httpx.Client(timeout=60.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,  # WITH tools param
                "stream": False,
                "max_tokens": 100
            }
        )
    result = response.json()
    # Guard against error payloads (no "choices" key) instead of raising a
    # KeyError -- this matches the handling in the sibling test functions.
    if "choices" in result:
        content = result["choices"][0]["message"]["content"]
        print(f"\nWith tools param - Response: {content}")
        print(f"Contains 42: {'42' in content}")
    else:
        print(f"\nWith tools param - Error: {result}")
def test_without_assistant_tool_calls():
    """Probe whether the assistant tool_calls message is the culprit.

    Skips the assistant turn entirely and sends user -> tool directly, then
    reports whether the tool content ("42") is visible to the model.
    """
    history = [
        {"role": "user", "content": "The calculator returned this result"},
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "VALUE_IS_42",
        },
    ]
    body = {
        "model": MODEL,
        "messages": history,
        "stream": False,
        "max_tokens": 100,
    }
    with httpx.Client(timeout=60.0) as client:
        reply = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            json=body,
        ).json()
    if "choices" not in reply:
        print(f"\nError: {reply}")
        return
    content = reply["choices"][0]["message"]["content"]
    print(f"\nNo assistant tool_calls - Response: {content}")
    print(f"Contains 42: {'42' in content}")
if __name__ == "__main__":
    # Run each probe in sequence; each prints its own labelled result.
    separator = "=" * 60
    print(separator)
    print("Debugging tool response visibility")
    print(separator)
    for probe in (
        test_tool_only_message,
        test_with_tools_param,
        test_without_assistant_tool_calls,
    ):
        probe()

View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""
Minimal test - is the tool response content being passed to the model?
"""
import httpx
import json
# Base URL of the OpenAI-compatible inference endpoint under test.
API_BASE = "https://api.vultrinference.com/v1"
# NOTE(review): a live-looking API key is hardcoded and committed to source --
# rotate it and load it from an environment variable instead.
API_KEY = "26DN7PNUB3YRBEPCDNMXKKD6ZODMETRSMOZQ"
# GLM model being investigated for dropped tool-response content.
MODEL = "zai-org/GLM-5.1-FP8"
def test_direct_prompt():
    """Baseline probe: standard tool-call round trip with a unique marker.

    Puts UNIQUE_MARKER_42 in the tool response and checks whether the model
    can quote it back, i.e. whether the tool content reached the prompt at
    all. (The GLM chat template is expected to wrap tool output in
    <observations> tags -- presumably; verify against the template.)
    """
    dialog = [
        {"role": "user", "content": "What did the function return?"},
        {
            "role": "assistant",
            "content": "I'll call the function.",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "get_value", "arguments": "{}"},
            }],
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "UNIQUE_MARKER_42",
        },
    ]
    tool_defs = [{
        "type": "function",
        "function": {
            "name": "get_value",
            "description": "Get a value",
            "parameters": {"type": "object", "properties": {}},
        },
    }]
    with httpx.Client(timeout=60.0) as client:
        data = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": MODEL,
                "messages": dialog,
                "tools": tool_defs,
                "stream": False,
                "max_tokens": 100,
            },
        ).json()
    if "choices" not in data:
        print(f"Error: {data}")
        return
    content = data["choices"][0]["message"]["content"]
    print(f"Model response: {content}")
    print(f"Contains UNIQUE_MARKER_42: {'UNIQUE_MARKER_42' in content}")
def test_fake_tool_response_in_user_message():
    """Workaround probe: deliver the tool result as a plain user message.

    Replaces the role="tool" turn with a user turn carrying the same marker,
    bypassing the chat template's tool-role handling entirely.
    """
    dialog = [
        {"role": "user", "content": "What did the function return?"},
        {
            "role": "assistant",
            "content": "I called the function.",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "get_value", "arguments": "{}"},
            }],
        },
        # Instead of role="tool", use user message
        {"role": "user", "content": "The function returned: UNIQUE_MARKER_42"},
    ]
    tool_defs = [{
        "type": "function",
        "function": {
            "name": "get_value",
            "description": "Get a value",
            "parameters": {"type": "object", "properties": {}},
        },
    }]
    with httpx.Client(timeout=60.0) as client:
        data = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": MODEL,
                "messages": dialog,
                "tools": tool_defs,
                "stream": False,
                "max_tokens": 100,
            },
        ).json()
    if "choices" not in data:
        print(f"Error: {data}")
        return
    content = data["choices"][0]["message"]["content"]
    print(f"\nUser message hack - Model response: {content}")
    print(f"Contains UNIQUE_MARKER_42: {'UNIQUE_MARKER_42' in content}")
def test_tool_response_as_observation_format():
    """Workaround probe: pre-wrap the tool content in <observations> tags.

    If the chat template fails to add the GLM-expected
    <observations>...</observations> wrapper itself, supplying it in the tool
    message content might make the marker visible to the model.
    """
    dialog = [
        {"role": "user", "content": "What did the function return?"},
        {
            "role": "assistant",
            "content": "I called the function.",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "get_value", "arguments": "{}"},
            }],
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "<observations>UNIQUE_MARKER_42</observations>",
        },
    ]
    tool_defs = [{
        "type": "function",
        "function": {
            "name": "get_value",
            "description": "Get a value",
            "parameters": {"type": "object", "properties": {}},
        },
    }]
    with httpx.Client(timeout=60.0) as client:
        data = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": MODEL,
                "messages": dialog,
                "tools": tool_defs,
                "stream": False,
                "max_tokens": 100,
            },
        ).json()
    if "choices" not in data:
        print(f"Error: {data}")
        return
    content = data["choices"][0]["message"]["content"]
    print(f"\nWith <observations> tags - Model response: {content}")
    print(f"Contains UNIQUE_MARKER_42: {'UNIQUE_MARKER_42' in content}")
if __name__ == "__main__":
    # Run the three visibility probes in order; each prints its own result.
    print("Testing tool response visibility")
    print("=" * 60)
    for probe in (
        test_direct_prompt,
        test_fake_tool_response_in_user_message,
        test_tool_response_as_observation_format,
    ):
        probe()