vllm-glm/tests/test_tool_diagnosis.py

#!/usr/bin/env python3
"""
Focused test to diagnose GLM-5.1 tool response issue.

The issue: Model sees tool response as blank.
"""

import json
import os

import httpx

# OpenAI-compatible endpoint that every request in this script is posted to.
API_BASE = "https://api.vultrinference.com/v1"
# Secrets must not live in source control: the bearer token is read from the
# VULTR_API_KEY environment variable. It falls back to an empty string when
# the variable is unset, so the module still imports without a key.
API_KEY = os.environ.get("VULTR_API_KEY", "")
# Model identifier sent in the "model" field of every request.
MODEL = "zai-org/GLM-5.1-FP8"


def test_simple_tool_response():
    """
    Minimal test: send a tool response and see if the model can use it.

    Builds a three-message conversation (user request, assistant tool call,
    tool result containing the value 42), posts it non-streaming to the
    chat-completions endpoint together with the tool definition, and prints
    the request, the full JSON response, and a PASS/FAIL verdict based on
    whether "42" appears in the reply text.

    Returns None; all findings are reported on stdout.
    """

    # Simulate a conversation where a tool was called
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "test_func",
            "description": "A test function",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("=" * 60)
    print("Request messages:")
    print(json.dumps(messages, indent=2))
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        # Non-streaming to get full response
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256
            }
        )

    try:
        result = response.json()
    except ValueError:
        # The body is not guaranteed to be JSON (e.g. an HTTP error page);
        # report it instead of crashing with a decode traceback.
        print(f"\n✗ Non-JSON response (HTTP {response.status_code}):")
        print(response.text[:500])
        return

    print("\nFull response:")
    print(json.dumps(result, indent=2))

    if not result.get("choices"):
        # Error payloads carry no choices; make the failure explicit.
        print(f"\n✗ No choices in response (HTTP {response.status_code})")
        return

    # The "content" key may be present with a null value (for instance when
    # the reply is another tool call), in which case dict.get's default is
    # not used. Coerce to "" so the substring checks below cannot raise.
    content = result["choices"][0].get("message", {}).get("content") or ""
    print("\n" + "=" * 60)
    print("Model response content:")
    print(content)
    print("=" * 60)

    # Check if the tool result is referenced
    if "42" in content:
        print("\n✓ PASS: Model referenced the tool result (42)")
    else:
        print("\n✗ FAIL: Model did NOT reference the tool result (42)")

    # Check for signs the model didn't see the result
    lowered = content.lower()
    if "don't have" in lowered or "cannot access" in lowered:
        print("✗ Model indicates it cannot see tool result")


def test_without_tools_param():
    """
    Test what happens if we don't pass tools in the follow-up request.
    Some APIs need tools to be passed on every request.

    Sends the same user / assistant-tool-call / tool-result conversation as
    the simple test, but omits the "tools" field from the request body, then
    prints the first 200 characters of the reply and whether it mentions 42.

    Returns None; all findings are reported on stdout.
    """

    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    print("\n" + "=" * 60)
    print("Test WITHOUT tools param in follow-up")
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                # No tools param
                "stream": False,
                "max_tokens": 256
            }
        )

    try:
        result = response.json()
    except ValueError:
        # The body is not guaranteed to be JSON (e.g. an HTTP error page).
        print(f"✗ Non-JSON response (HTTP {response.status_code}):")
        print(response.text[:500])
        return

    if not result.get("choices"):
        # Without this branch a rejected request would pass silently, which
        # hides exactly the case this diagnostic is probing for.
        print(f"✗ No choices in response (HTTP {response.status_code}):")
        print(json.dumps(result, indent=2))
        return

    # "content" may be present but null; fall back to "" so slicing and the
    # substring check below cannot raise TypeError.
    content = result["choices"][0].get("message", {}).get("content") or ""
    print("Model response:", content[:200])

    if "42" in content:
        print("✓ Model referenced the tool result")
    else:
        print("✗ Model did NOT reference the tool result")


def test_different_content_formats():
    """
    Test if the issue is with how content is formatted.

    Posts the same conversation twice: once with the tool message content as
    a plain string, once as a list of {"type": "text", ...} parts. For each
    variant it prints the first 150 characters of the reply and whether the
    reply contains "4".

    Returns None; all findings are reported on stdout.
    """

    # Test 1: String content (standard)
    messages_string = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 4"
        }
    ]

    # Test 2: Content as array (OpenAI format)
    messages_array = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": [{"type": "text", "text": "The answer is 4"}]
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculator",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("\n" + "=" * 60)
    print("Test: String content vs Array content")
    print("=" * 60)

    # Identical for both variants, so build once outside the loop.
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    variants = [("String content", messages_string), ("Array content", messages_array)]

    with httpx.Client(timeout=60.0) as client:
        for name, msgs in variants:
            print(f"\n--- {name} ---")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={
                    "model": MODEL,
                    "messages": msgs,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 128
                }
            )

            try:
                result = response.json()
            except ValueError:
                # The body is not guaranteed to be JSON; report and move on
                # so the other variant is still exercised.
                print(f"✗ Non-JSON response (HTTP {response.status_code}):")
                print(response.text[:500])
                continue

            if not result.get("choices"):
                # A server that rejects one content format answers with an
                # error payload; surface it rather than printing nothing.
                print(f"✗ No choices in response (HTTP {response.status_code}):")
                print(json.dumps(result, indent=2))
                continue

            # "content" may be present but null; fall back to "" so slicing
            # and the substring check below cannot raise TypeError.
            content = result["choices"][0].get("message", {}).get("content") or ""
            print(f"Response: {content[:150]}")
            if "4" in content:
                print("✓ Referenced tool result")
            else:
                print("✗ Did NOT reference tool result")


if __name__ == "__main__":
    # Script entry point: print the banner, then run every diagnostic in order.
    banner = "GLM-5.1 Tool Response Diagnosis"
    print(banner)
    print("=" * 60)

    diagnostics = (
        test_simple_tool_response,
        test_without_tools_param,
        test_different_content_formats,
    )
    for run_diagnostic in diagnostics:
        run_diagnostic()