# Files
# model-tool-tests/test_devstral.py
#
# 480 lines
# 17 KiB
# Python
#
#!/usr/bin/env python3
"""
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
These tests send EXACTLY what OpenClaw would send to vLLM — including
chat_template_kwargs, logprobs, weird tool schemas, the works.
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
Architecture: this test → middleware (strips bad params) → SGLang
"""
import os
import time
import json
import httpx
from datetime import datetime
# Point at the middleware, NOT SGLang directly
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")
RESULTS = []
def ts():
    """Return the current wall-clock time as HH:MM:SS.mmm (millisecond precision)."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
def record(name, ok, detail=""):
    """Print a PASS/FAIL line for one test and append its outcome to RESULTS."""
    label = "✓ PASS" if ok else "✗ FAIL"
    print(f"\n{label}: {name}")
    if detail:
        print(f" {detail}")
    RESULTS.append({"name": name, "pass": ok, "detail": detail})
def make_client():
    """Build an httpx client with bearer auth and a long timeout for slow generations."""
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=headers)
# ── 1. Basic non-streaming chat ──────────────────────────────
def test_basic_nonstream():
    """POST a plain non-streaming chat request and record whether a reply came back."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "stream": False,
        "max_tokens": 32,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        content = body["choices"][0]["message"]["content"]
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
# ── 2. Basic streaming chat ──────────────────────────────────
def test_basic_stream():
    """POST a streaming chat request and accumulate delta content from SSE lines."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Count from 1 to 5."}],
        "stream": True,
        "max_tokens": 64,
    }
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json=payload) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            full = ""
            for line in r.iter_lines():
                # Skip keep-alives, the terminator, and anything not an SSE data line.
                if not line or line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                choices = chunk.get("choices")
                if not choices:
                    continue
                piece = choices[0].get("delta", {}).get("content")
                if piece:
                    full += piece
            print(f"[{ts()}] Reply: {full[:100]}")
            record("basic stream", True, f"Got: {full[:80]}")
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
def test_toolcall_nonstream():
    """Send a vLLM-style tool schema and check the model emits a tool call."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"},
                },
                "required": ["location"],
            },
        },
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": [weather_tool],
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        calls = msg.get("tool_calls")
        if calls:
            fn = calls[0]["function"]
            print(f"[{ts()}] Tool: {fn['name']}, args: {fn['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {fn['name']}")
        else:
            content = msg.get("content", "")
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
# ── 4. Tool call — streaming ────────────────────────────────
def test_toolcall_stream():
    """Stream a tool-call request, accumulating the tool name/arguments from deltas."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"},
                },
                "required": ["location"],
            },
        },
    }
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": [weather_tool],
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            tool_name = None
            accumulated_args = ""
            content_parts = ""
            for line in r.iter_lines():
                # Skip blanks, the terminator, and non-data SSE lines.
                if not line or line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                choices = chunk.get("choices")
                if not choices:
                    continue
                delta = choices[0].get("delta", {})
                # The name typically arrives once; arguments arrive as fragments.
                for tc in delta.get("tool_calls") or []:
                    fn = tc.get("function", {})
                    if fn.get("name"):
                        tool_name = fn["name"]
                    if fn.get("arguments"):
                        accumulated_args += fn["arguments"]
                if delta.get("content"):
                    content_parts += delta["content"]
            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
                record("tool call stream", False, "Model did not call the tool")
# ── 5. Full tool response flow (non-streaming) ──────────────
def test_tool_response_flow():
    """Two-step flow: elicit a tool call, feed back a fabricated result, check the final reply."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"},
                },
                "required": ["location"],
            },
        },
    }
    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
    with make_client() as c:
        # Step 1: ask a question that should trigger the weather tool.
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": [weather_tool],
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return
        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")
        # Step 2: echo the assistant turn plus a canned tool result back.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })
        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": [weather_tool],
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return
        final = body2["choices"][0]["message"].get("content", "")
        print(f"[{ts()}] Final: {final[:200]}")
        # The canned result said 22°C — the model should surface that number.
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'}{final[:100]}")
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
def test_param_sweep():
    """
    Send every optional parameter OpenClaw/vLLM might include, one at a time.

    Each request is the baseline payload plus a single extra param; the
    middleware must strip or translate the ones SGLang rejects, so any
    non-200 response is recorded as a failure for that parameter.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")
    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs", True),
        ("top_logprobs", 5),
    ]
    with make_client() as c:
        # Baseline request with no extras, for comparison in the log.
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")
        for name, val in extra_params:
            req = {**base_req, name: val}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            # BUG FIX: both branches were empty strings (unicode markers lost
            # in transit); restore ✓/✗ to match record()'s "✓ PASS"/"✗ FAIL".
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    # Response wasn't JSON; fall back to the raw body text.
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}")
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
def test_openclaw_tool_schema():
    """
    Reproduce the exact OpenClaw tool schema where parameters.properties
    is [] instead of {} — the middleware must rewrite it before SGLang.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")
    # Exact shape OpenClaw sends for tools with no params; the empty LIST
    # (instead of an empty object) is what triggered the original 400.
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {"type": "object", "properties": []},
        },
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Search for cats"}],
            "tools": [bad_tool],
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 128,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed the bad schema")
        record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
# ── 8. OpenClaw full payload (chat_template_kwargs + tools) ─
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + a tool with the bad
    properties=[] schema — exactly what OpenClaw sends through the pipe.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": [],  # Bad — middleware must fix
            },
        },
    }
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Search for the weather in NYC"},
        ],
        "tools": [bad_tool],
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 256,
        "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
        "logprobs": True,  # Bad — middleware must strip
        "top_logprobs": 5,  # Bad — middleware must strip
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            print(f"[{ts()}] No tool call, content: {msg.get('content', '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
# ── Main ─────────────────────────────────────────────────────
def main():
    """Run every test in order, then print a pass/fail summary from RESULTS."""
    print(f"\n{'='*60}")
    print(f"Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")
    test_basic_nonstream()
    test_basic_stream()
    test_toolcall_nonstream()
    test_toolcall_stream()
    test_tool_response_flow()
    test_param_sweep()
    test_openclaw_tool_schema()
    test_openclaw_full_payload()
    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        # BUG FIX: both branches were empty strings (unicode markers lost
        # in transit); restore ✓/✗ to match record()'s "✓ PASS"/"✗ FAIL".
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()