add test_devstral.py, restore chat_template_kwargs+logprobs to all tests (vLLM-compat spec)
This commit is contained in:
479
test_devstral.py
Normal file
479
test_devstral.py
Normal file
@@ -0,0 +1,479 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
|
||||
|
||||
These tests send EXACTLY what OpenClaw would send to vLLM — including
|
||||
chat_template_kwargs, logprobs, weird tool schemas, the works.
|
||||
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
|
||||
|
||||
Architecture: this test → middleware (strips bad params) → SGLang
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import httpx
|
||||
from datetime import datetime
|
||||
|
||||
# Point at the middleware, NOT SGLang directly
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
# Sent as a Bearer token by make_client(); value is not validated here
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
# Served model name passed in every request payload
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")

# Outcome dicts appended by record(); printed as the summary in main()
RESULTS = []
|
||||
|
||||
|
||||
def ts():
    """Return the current wall-clock time as ``HH:MM:SS.mmm`` for log prefixes."""
    now = datetime.now()
    # strftime %f gives microseconds; drop the last three digits for millis
    return now.strftime("%H:%M:%S.%f")[:-3]
|
||||
|
||||
|
||||
def record(name, ok, detail=""):
    """Print a PASS/FAIL line for *name* and append the outcome to RESULTS."""
    label = "✓ PASS" if ok else "✗ FAIL"
    print(f"\n{label}: {name}")
    if detail:
        print(f" {detail}")
    RESULTS.append({"name": name, "pass": ok, "detail": detail})
|
||||
|
||||
|
||||
def make_client():
    """Build an httpx client preconfigured with auth + JSON headers.

    Uses a generous 120 s timeout since large-model completions can be slow.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=headers)
|
||||
|
||||
|
||||
# ── 1. Basic non-streaming chat ──────────────────────────────
|
||||
|
||||
def test_basic_nonstream():
    """Smoke test: one user message, no streaming, no tools."""
    sep = "=" * 60
    print(f"\n{sep}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{sep}")

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "stream": False,
        "max_tokens": 32,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        content = body["choices"][0]["message"]["content"]
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
|
||||
|
||||
|
||||
# ── 2. Basic streaming chat ──────────────────────────────────
|
||||
|
||||
def test_basic_stream():
    """Stream a short completion over SSE and reassemble the content deltas."""
    sep = "=" * 60
    print(f"\n{sep}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{sep}")

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Count from 1 to 5."}],
        "stream": True,
        "max_tokens": 64,
    }
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json=payload) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            pieces = []
            for raw in r.iter_lines():
                # Skip keep-alives and the terminal sentinel
                if not raw or raw == "data: [DONE]":
                    continue
                if not raw.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(raw[6:])
                except json.JSONDecodeError:
                    continue
                choices = chunk.get("choices")
                if not choices:
                    continue
                text = choices[0].get("delta", {}).get("content")
                if text:
                    pieces.append(text)
            full = "".join(pieces)
            print(f"[{ts()}] Reply: {full[:100]}")
            record("basic stream", True, f"Got: {full[:80]}")
|
||||
|
||||
|
||||
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
|
||||
|
||||
def test_toolcall_nonstream():
    """Ask a weather question with a standard vLLM-style tool schema and
    verify the model answers with a tool_calls entry (non-streaming)."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            # OpenAI-style responses may carry "content": null; .get("content", "")
            # would return None (the key exists) and crash on the slice below.
            content = msg.get("content") or ""
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
|
||||
|
||||
|
||||
# ── 4. Tool call — streaming ────────────────────────────────
|
||||
|
||||
def test_toolcall_stream():
    """Same tool request as the non-streaming case, but over SSE; the
    tool-call name and arguments are reassembled from the deltas."""
    sep = "=" * 60
    print(f"\n{sep}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{sep}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": True,
        "max_tokens": 256,
    }
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json=payload) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return

            tool_name = None
            accumulated_args = ""
            content_parts = ""
            for raw in r.iter_lines():
                if not raw or raw == "data: [DONE]":
                    continue
                if not raw.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(raw[6:])
                except json.JSONDecodeError:
                    continue
                if not chunk.get("choices"):
                    continue
                delta = chunk["choices"][0].get("delta", {})
                # Tool-call deltas arrive split: name first, args in fragments
                for tc in delta.get("tool_calls") or []:
                    fn = tc.get("function", {})
                    if fn.get("name"):
                        tool_name = fn["name"]
                    if fn.get("arguments"):
                        accumulated_args += fn["arguments"]
                if delta.get("content"):
                    content_parts += delta["content"]

        if tool_name:
            print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
            record("tool call stream", True, f"Got tool call: {tool_name}")
        else:
            print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
            record("tool call stream", False, "Model did not call the tool")
|
||||
|
||||
|
||||
# ── 5. Full tool response flow (non-streaming) ──────────────
|
||||
|
||||
def test_tool_response_flow():
    """Two-round tool flow: obtain a tool call, feed back a fabricated tool
    result, and check the final answer uses it (the "22" temperature)."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

    with make_client() as c:
        # Round 1: model should request the get_weather tool.
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return

        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")

        # Echo the assistant turn back verbatim, then answer it with a
        # role=tool message carrying a fabricated result.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })

        # Round 2: model should incorporate the tool result into its answer.
        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return

        # "content" may be null (e.g. the model issues another tool call);
        # coerce to "" so the slice and "in" checks below cannot raise.
        final = body2["choices"][0]["message"].get("content") or ""
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}")
|
||||
|
||||
|
||||
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
|
||||
|
||||
def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.
    """
    sep = "=" * 60
    print(f"\n{sep}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{sep}")

    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }

    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs", True),
        ("top_logprobs", 5),
    ]

    with make_client() as c:
        # baseline request without any of the extras
        baseline = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {baseline.status_code}")

        for name, val in extra_params:
            resp = c.post(f"{API_BASE}/chat/completions", json={**base_req, name: val})
            ok = resp.status_code == 200
            marker = "✓" if ok else "✗"
            detail = ""
            if not ok:
                try:
                    detail = resp.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    detail = resp.text[:100]
            print(f"[{ts()}] {marker} {name}={val!r} → HTTP {resp.status_code} {detail}")
            # Only failures are recorded; successes just log to stdout
            if not ok:
                record(f"param sweep: {name}", False, f"HTTP {resp.status_code} with {name}={val!r}: {detail}")
|
||||
|
||||
|
||||
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
|
||||
|
||||
def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.
    """
    sep = "=" * 60
    print(f"\n{sep}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{sep}")

    # This is the exact shape OpenClaw sends for tools with no params
    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            }
        }
    }]

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Search for cats"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed the bad schema")
        record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
|
||||
|
||||
|
||||
# ── 8. OpenClaw full payload (chat_template_kwargs + tools) ─
|
||||
|
||||
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            # "content" may be present-but-null; coerce so the slice can't raise
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run every test in order, then print a pass/fail summary table."""
    sep = "=" * 60
    print(f"\n{sep}")
    print("Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{sep}")

    suite = [
        test_basic_nonstream,
        test_basic_stream,
        test_toolcall_nonstream,
        test_toolcall_stream,
        test_tool_response_flow,
        test_param_sweep,
        test_openclaw_tool_schema,
        test_openclaw_full_payload,
    ]
    for test in suite:
        test()

    print(f"\n\n{sep}")
    print("FINAL RESULTS")
    print(f"{sep}")
    for entry in RESULTS:
        mark = "✓" if entry["pass"] else "✗"
        print(f" {mark} {entry['name']}: {entry['detail']}")
    passed = sum(1 for entry in RESULTS if entry["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{sep}")


if __name__ == "__main__":
    main()
|
||||
@@ -91,7 +91,10 @@ def test_streaming_tool_call_with_code():
|
||||
"tools": tools,
|
||||
"tool_choice": "auto",
|
||||
"stream": True,
|
||||
"max_tokens": 4096
|
||||
"max_tokens": 4096,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
) as response:
|
||||
print(f"[{timestamp()}] Response status: {response.status_code}")
|
||||
@@ -242,7 +245,10 @@ def test_streaming_tool_call_with_json():
|
||||
"tools": tools,
|
||||
"tool_choice": "auto",
|
||||
"stream": True,
|
||||
"max_tokens": 2048
|
||||
"max_tokens": 2048,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
) as response:
|
||||
for line in response.iter_lines():
|
||||
@@ -328,7 +334,10 @@ def test_non_streaming_tool_call():
|
||||
"tools": tools,
|
||||
"tool_choice": "auto",
|
||||
"stream": False,
|
||||
"max_tokens": 1024
|
||||
"max_tokens": 1024,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -63,7 +63,10 @@ def test_simple_tool_response():
|
||||
"messages": messages,
|
||||
"tools": tools,
|
||||
"stream": False,
|
||||
"max_tokens": 256
|
||||
"max_tokens": 256,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
@@ -129,7 +132,10 @@ def test_without_tools_param():
|
||||
"messages": messages,
|
||||
# No tools param
|
||||
"stream": False,
|
||||
"max_tokens": 256
|
||||
"max_tokens": 256,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
@@ -211,7 +217,10 @@ def test_different_content_formats():
|
||||
"messages": msgs,
|
||||
"tools": tools,
|
||||
"stream": False,
|
||||
"max_tokens": 128
|
||||
"max_tokens": 128,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -94,7 +94,10 @@ def test_tool_call_response_flow(streaming: bool = True):
|
||||
"tools": tools,
|
||||
"tool_choice": "auto",
|
||||
"stream": True,
|
||||
"max_tokens": 512
|
||||
"max_tokens": 512,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
) as response:
|
||||
print(f"[{timestamp()}] Response status: {response.status_code}")
|
||||
@@ -152,7 +155,10 @@ def test_tool_call_response_flow(streaming: bool = True):
|
||||
"tools": tools,
|
||||
"tool_choice": "auto",
|
||||
"stream": False,
|
||||
"max_tokens": 512
|
||||
"max_tokens": 512,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
@@ -224,7 +230,10 @@ def test_tool_call_response_flow(streaming: bool = True):
|
||||
"messages": messages,
|
||||
"tools": tools,
|
||||
"stream": True,
|
||||
"max_tokens": 512
|
||||
"max_tokens": 512,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
) as response:
|
||||
for line in response.iter_lines():
|
||||
@@ -258,7 +267,10 @@ def test_tool_call_response_flow(streaming: bool = True):
|
||||
"messages": messages,
|
||||
"tools": tools,
|
||||
"stream": False,
|
||||
"max_tokens": 512
|
||||
"max_tokens": 512,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
@@ -358,7 +370,10 @@ def test_tool_response_with_debug_info():
|
||||
"tools": tools,
|
||||
"tool_choice": "auto",
|
||||
"stream": False,
|
||||
"max_tokens": 256
|
||||
"max_tokens": 256,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
@@ -401,7 +416,10 @@ def test_tool_response_with_debug_info():
|
||||
"messages": messages,
|
||||
"tools": tools,
|
||||
"stream": False,
|
||||
"max_tokens": 256
|
||||
"max_tokens": 256,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user