SmolLM3-3B tool call fix: template bugs found and patched
This commit is contained in:
445
test_tool_response.py
Normal file
445
test_tool_response.py
Normal file
@@ -0,0 +1,445 @@
|
||||
#!/usr/bin/env python3
"""
Test for tool call response handling in SmolLM3-3B.

Tests the multi-turn flow:
1. Send a prompt that triggers a tool call
2. Send back the tool result
3. Verify the model can see and use the tool response

This reproduces the issue where tool responses appear blank to the model.
"""

import os
import json
import httpx
from datetime import datetime


# Target server and model are overridable via environment so the same
# script can be pointed at any OpenAI-compatible endpoint.
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
API_KEY = os.environ.get("VLLM_API_KEY", "none")
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
|
||||
|
||||
|
||||
def timestamp():
    """Return the current wall-clock time formatted as HH:MM:SS.mmm."""
    now = datetime.now()
    # strftime's %f gives microseconds; drop the last three digits for ms.
    return now.strftime("%H:%M:%S.%f")[:-3]
|
||||
|
||||
|
||||
def test_tool_call_response_flow(streaming: bool = True):
    """
    Test the full tool call -> response -> follow-up flow.

    This simulates:
    1. User asks for weather
    2. Model calls get_weather tool
    3. We send back the weather data
    4. Model should see and use that data

    Args:
        streaming: When True, both requests use SSE streaming; otherwise
            plain JSON request/response bodies are used.

    Returns:
        dict with "success" (bool) plus "issues" and "final_response" on a
        completed run, or "reason" when the model produced no tool call.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g. 'New York, NY'"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    # Initial request that should trigger a tool call
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Tokyo right now?"
        }
    ]

    mode = "STREAMING" if streaming else "NON-STREAMING"
    print(f"\n{'='*60}")
    print(f"TEST: Tool call response flow ({mode})")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")

    with httpx.Client(timeout=120.0) as client:
        # Step 1: Send initial request, expect tool call
        print(f"[{timestamp()}] Step 1: Sending initial request...")

        if streaming:
            tool_calls = []
            tool_call_id = None
            tool_call_name = None
            accumulated_args = ""

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 512
                }
            ) as response:
                print(f"[{timestamp()}] Response status: {response.status_code}")

                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue

                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})

                                if delta.get("tool_calls"):
                                    # NOTE(review): only a single tool call is
                                    # accumulated here; deltas for multiple
                                    # tool-call indices would be merged together.
                                    for tc in delta["tool_calls"]:
                                        if tc.get("id"):
                                            tool_call_id = tc["id"]

                                        if tc.get("function", {}).get("name"):
                                            tool_call_name = tc["function"]["name"]
                                            print(f"[{timestamp()}] Tool call: {tool_call_name}")

                                        if tc.get("function", {}).get("arguments"):
                                            accumulated_args += tc["function"]["arguments"]

                                if delta.get("content"):
                                    print(f"[{timestamp()}] Content: {delta['content'][:100]}")

                        except json.JSONDecodeError as e:
                            print(f"[{timestamp()}] JSON error: {e}")

            if tool_call_name:
                tool_calls.append({
                    "id": tool_call_id or "call_0",
                    "type": "function",
                    "function": {
                        "name": tool_call_name,
                        "arguments": accumulated_args
                    }
                })
        else:
            # Non-streaming
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 512
                }
            )

            result = response.json()
            print(f"[{timestamp()}] Response status: {response.status_code}")

            tool_calls = []
            if result.get("choices"):
                message = result["choices"][0].get("message", {})
                if message.get("tool_calls"):
                    tool_calls = message["tool_calls"]
                    for tc in tool_calls:
                        print(f"[{timestamp()}] Tool call: {tc['function']['name']}")
                        print(f"[{timestamp()}] Args: {tc['function']['arguments']}")

        # Check if we got a tool call
        if not tool_calls:
            print(f"\n[{timestamp()}] No tool call received - model didn't call the tool")
            return {"success": False, "reason": "no_tool_call"}

        # Step 2: Parse tool call and prepare response
        tc = tool_calls[0]
        tc_id = tc.get("id", "call_0")
        tc_name = tc["function"]["name"]
        # Some servers emit an empty arguments string; fall back to an empty
        # JSON object so json.loads doesn't raise on "".
        tc_args = json.loads(tc["function"]["arguments"] or "{}")

        print(f"\n[{timestamp()}] Step 2: Tool call received")
        print(f"  Name: {tc_name}")
        print(f"  Args: {tc_args}")

        # Simulate tool execution
        tool_result = {
            "location": tc_args.get("location", "Unknown"),
            "temperature": "22°C",
            "condition": "Partly cloudy",
            "humidity": "65%",
            "wind": "15 km/h NE"
        }

        # Step 3: Send the tool response back
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps(tool_result)
        })

        print(f"\n[{timestamp()}] Step 3: Sending tool response...")
        print(f"  Tool call ID: {tc_id}")
        print(f"  Tool result: {json.dumps(tool_result, indent=2)}")

        # Step 4: Get the model's follow-up response
        if streaming:
            final_response = ""
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (streaming)...")

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": True,
                    "max_tokens": 512
                }
            ) as response:
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue

                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})
                                if delta.get("content"):
                                    content = delta["content"]
                                    final_response += content
                                    print(f"[{timestamp()}] Content: {content}", end="", flush=True)
                        except json.JSONDecodeError:
                            pass

            print()  # newline after streaming output
        else:
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (non-streaming)...")

            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 512
                }
            )

            result = response.json()
            final_response = ""
            if result.get("choices"):
                # content can be JSON null (e.g. another tool call); coerce to
                # "" so the substring checks below never see None.
                final_response = result["choices"][0].get("message", {}).get("content") or ""

    print(f"\n[{timestamp()}] Final response:\n{final_response}")

    # Check if the model used the tool data
    success = True
    issues = []

    # The response should mention the weather data; "22" also matches "22°C".
    if "22" not in final_response:
        issues.append("Temperature (22°C) not mentioned in response")
        success = False

    # "cloudy" also matches "partly cloudy".
    if "cloudy" not in final_response.lower():
        issues.append("Condition (Partly cloudy) not mentioned in response")
        success = False

    # Check for signs the model didn't see the data
    blank_indicators = [
        "i don't have",
        "i cannot access",
        "i'm unable to",
        "i am unable to",
        "don't have access",
        "don't have real-time",
        "cannot provide real-time"
    ]

    for indicator in blank_indicators:
        if indicator in final_response.lower():
            issues.append(f"Model seems unaware of tool result (found: '{indicator}')")
            success = False
            break

    print(f"\n{'='*60}")
    if success:
        print("✓ PASS: Model correctly used tool response data")
    else:
        print("✗ FAIL: Model did not use tool response correctly")
        for issue in issues:
            print(f"  - {issue}")
    print(f"{'='*60}\n")

    return {
        "success": success,
        "issues": issues,
        "final_response": final_response
    }
|
||||
|
||||
|
||||
def test_tool_response_with_debug_info():
    """
    Test with detailed logging to capture exactly what the model sees.

    Uses a zero-argument get_time tool, feeds back a fixed time string, and
    checks whether the model's follow-up echoes that time. Prints the full
    messages array and raw response JSON for debugging.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_time",
                "description": "Get the current time",
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": []
                }
            }
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Tool response with debug info (non-streaming)")
    print(f"{'='*60}\n")

    messages = [
        {"role": "user", "content": "What time is it?"}
    ]

    with httpx.Client(timeout=120.0) as client:
        # Get tool call
        print(f"[{timestamp()}] Sending initial request...")
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256
            }
        )

        result = response.json()

        if not result.get("choices") or not result["choices"][0].get("message", {}).get("tool_calls"):
            print("No tool call - skipping test")
            return

        tool_call = result["choices"][0]["message"]["tool_calls"][0]
        tc_id = tool_call["id"]

        print(f"[{timestamp()}] Tool call: {tool_call['function']['name']}")
        print(f"[{timestamp()}] Tool call ID: {tc_id}")

        # Add tool response
        messages.append({
            "role": "assistant",
            "tool_calls": [tool_call]
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": "The current time is 3:45 PM on Thursday, April 9, 2026."
        })

        # Debug: print the full messages array we're about to send
        print(f"\n[{timestamp()}] Sending follow-up with these messages:")
        print(json.dumps(messages, indent=2))

        # Get follow-up
        response2 = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256
            }
        )

        result2 = response2.json()
        print(f"\n[{timestamp()}] Full response:")
        print(json.dumps(result2, indent=2))

        if result2.get("choices"):
            # content can be JSON null (e.g. if the model emitted another
            # tool call); coerce to "" so the substring check never sees None.
            content = result2["choices"][0].get("message", {}).get("content") or ""

            print(f"\n[{timestamp()}] Model response content: {content}")

            # Check if time is mentioned; "3:45" also matches "3:45 PM".
            if "3:45" in content:
                print("\n✓ Model used the tool response (time mentioned)")
            else:
                print("\n✗ Model may not have seen the tool response (time not mentioned)")
|
||||
|
||||
|
||||
def main():
    """Run the non-streaming, streaming, and debug tool-response tests."""
    print("\n" + "="*60)
    # Derive the banner from the configured model; the previous hard-coded
    # "GLM-5.1" header did not match the SmolLM3-3B model under test.
    print(f"{MODEL} Tool Call Response Tests")
    print("="*60)

    # Test non-streaming first (simpler to debug)
    print("\n--- Test 1: Non-streaming tool response flow ---")
    test_tool_call_response_flow(streaming=False)

    # Test streaming
    print("\n--- Test 2: Streaming tool response flow ---")
    test_tool_call_response_flow(streaming=True)

    # Debug test
    print("\n--- Test 3: Debug info test ---")
    test_tool_response_with_debug_info()

    print("\nAll tests complete.")
|
||||
|
||||
|
||||
# Script entry point: run the full tool-call response test suite.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user