#!/usr/bin/env python3
"""
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.

These tests send EXACTLY what OpenClaw would send to vLLM — including
chat_template_kwargs, logprobs, weird tool schemas, the works.
The middleware's job is to strip/fix all of it so SGLang doesn't choke.

Architecture: this test → middleware (strips bad params) → SGLang

Configuration is read from the environment (optionally seeded from a ``.env``
file next to this script): DEVSTRAL_API_BASE, DEVSTRAL_API_KEY, DEVSTRAL_MODEL.

When run as a script, the process exits with status 1 if any recorded result
failed and 0 otherwise.
"""

import os
import time
import json
import httpx
from datetime import datetime
from pathlib import Path

# Load .env if present (don't hardcode keys).
# Existing environment variables win: setdefault never overwrites them.
_env_file = Path(__file__).parent / ".env"
if _env_file.exists():
    for line in _env_file.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        os.environ.setdefault(k.strip(), v.strip())

API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")

# Accumulates one dict per recorded outcome: {"name", "pass", "detail"}.
RESULTS = []

# Well-formed tool schema shared by the tool-call tests (never mutated).
WEATHER_TOOLS = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
            },
            "required": ["location"]
        }
    }
}]


def ts():
    """Return the current local time as HH:MM:SS.mmm for log prefixes."""
    return datetime.now().strftime("%H:%M:%S.%f")[:-3]


def record(name, ok, detail=""):
    """Print a PASS/FAIL line for *name* and append the outcome to RESULTS."""
    status = "✓ PASS" if ok else "✗ FAIL"
    print(f"\n{status}: {name}")
    if detail:
        print(f" {detail}")
    RESULTS.append({"name": name, "pass": ok, "detail": detail})


def make_client():
    """Build an httpx.Client with a 120 s timeout and bearer-auth JSON headers."""
    return httpx.Client(
        timeout=120.0,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
    )


def _banner(title):
    """Print the three-line header that introduces a test in the log."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: {title}")
    print(f"{'='*60}")


def _json_or_text(response):
    """Decode a response body as JSON, falling back to ``{"raw": text}``.

    Error responses are not guaranteed to be JSON, and an undecodable body
    must be reported as a failed test rather than crash the run.
    """
    try:
        return response.json()
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {"raw": response.text}


def _iter_sse_chunks(response):
    """Yield each decoded ``data: {...}`` event that has a non-empty ``choices``.

    Blank lines, the ``data: [DONE]`` sentinel, non-data lines and payloads that
    are not valid JSON are skipped.
    """
    for line in response.iter_lines():
        if not line or line == "data: [DONE]":
            continue
        if not line.startswith("data: "):
            continue
        try:
            chunk = json.loads(line[6:])
        except json.JSONDecodeError:
            continue
        if chunk.get("choices"):
            yield chunk


def _sweep_fields(name, val):
    """Translate a sweep entry into the request fields it represents.

    A label of the form ``"a+b"`` paired with a dict means "send these fields
    together"; every other entry is a single ``name: val`` field.
    """
    if "+" in name and isinstance(val, dict):
        return dict(val)
    return {name: val}


# ── 1. Basic non-streaming chat ──────────────────────────────

def test_basic_nonstream():
    """Send a minimal non-streaming chat request and record whether it returns 200."""
    _banner("Basic non-streaming chat")

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
            "max_tokens": 32,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_text(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # content may be JSON null; normalise so slicing cannot raise.
        content = body["choices"][0]["message"]["content"] or ""
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")


# ── 2. Basic streaming chat ──────────────────────────────────

def test_basic_stream():
    """Send a streaming chat request and concatenate the content deltas."""
    _banner("Basic streaming chat")

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Count from 1 to 5."}],
            "stream": True,
            "max_tokens": 64,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            pieces = []
            for chunk in _iter_sse_chunks(r):
                delta = chunk["choices"][0].get("delta") or {}
                if delta.get("content"):
                    pieces.append(delta["content"])
            full = "".join(pieces)
            print(f"[{ts()}] Reply: {full[:100]}")
            record("basic stream", True, f"Got: {full[:80]}")


# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───

def test_toolcall_nonstream():
    """Offer the weather tool non-streaming; pass only if a tool call comes back."""
    _banner("Tool call non-streaming (vLLM-style)")

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": WEATHER_TOOLS,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_text(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            content = msg.get("content") or ""
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")


# ── 4. Tool call — streaming ────────────────────────────────

def test_toolcall_stream():
    """Offer the weather tool while streaming; pass if any delta names a tool."""
    _banner("Tool call streaming")

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": WEATHER_TOOLS,
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return

            tool_name = None
            arg_pieces = []
            content_pieces = []
            for chunk in _iter_sse_chunks(r):
                delta = chunk["choices"][0].get("delta") or {}
                # Name and argument text arrive as fragments spread over deltas.
                for tc in delta.get("tool_calls") or []:
                    fn = tc.get("function") or {}
                    if fn.get("name"):
                        tool_name = fn["name"]
                    if fn.get("arguments"):
                        arg_pieces.append(fn["arguments"])
                if delta.get("content"):
                    content_pieces.append(delta["content"])

            accumulated_args = "".join(arg_pieces)
            content_parts = "".join(content_pieces)
            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
                record("tool call stream", False, "Model did not call the tool")


# ── 5. Full tool response flow (non-streaming) ──────────────

def test_tool_response_flow():
    """Run the two-step tool loop: get a tool call, feed back a result, check the reply.

    Passes only when the final assistant message contains "22", i.e. the
    temperature from the fabricated tool result.
    """
    _banner("Full tool response flow (non-streaming)")

    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": WEATHER_TOOLS,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = _json_or_text(r)
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return

        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")

        # Echo the assistant turn back verbatim, then answer its tool call.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })

        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": WEATHER_TOOLS,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = _json_or_text(r2)
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return

        # content may be JSON null; normalise before the substring test.
        final = body2["choices"][0]["message"].get("content") or ""
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}")


# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────

def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.

    Each entry is added to a minimal base request on its own. A failure is
    recorded per rejected entry (and for a rejected baseline); if nothing
    fails, a single passing "param sweep" result is recorded so the sweep
    is counted in the final tally.
    """
    _banner("Parameter sweep (vLLM-compat, middleware must fix)")

    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }

    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        # "a+b" label: the dict is merged into the request as separate fields.
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
        ("top_logprobs", 5),
    ]

    failures = 0
    with make_client() as c:
        # baseline
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")
        if r.status_code != 200:
            failures += 1
            record("param sweep: baseline", False, f"HTTP {r.status_code} for the base request")

        for name, val in extra_params:
            req = {**base_req, **_sweep_fields(name, val)}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except (ValueError, AttributeError, TypeError):
                    # Body is not JSON, or not shaped like {"error": {"message": str}}.
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                failures += 1
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}")

    if not failures:
        record("param sweep", True, f"All {len(extra_params)} extra params accepted")


# ── 7. OpenClaw-style tool schema (the one that caused 400) ─

def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.
    """
    _banner("OpenClaw-style tool schema (bad properties)")

    # This is the exact shape OpenClaw sends for tools with no params
    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Search for cats"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 128,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_text(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed the bad schema")
        record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")


# ── 8. Nested properties=[] in tool schema (Tool 21 bug) ────

def test_nested_bad_properties():
    """
    Reproduce the exact Tool 21 400 error:
    schema['properties']['fields']['items']['properties'] = []

    This happens when a tool has an array-of-objects parameter where the
    items' properties field is [] instead of {}. The middleware must
    recurse into the schema to fix ALL properties fields.
    """
    _banner("Nested properties=[] in tool schema (Tool 21 bug)")

    # This is the exact shape that causes: "Tool 21 function has invalid 'parameters' schema:
    # [] is not of type 'object' ... On schema['properties']['fields']['items']['properties']"
    tools = [{
        "type": "function",
        "function": {
            "name": "message",
            "description": "Send a message",
            "parameters": {
                "type": "object",
                "properties": {
                    "fields": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": []  # <-- THIS causes the 400
                        }
                    }
                }
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Send a message to Bob"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 128,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_text(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:500]}")
            record("nested bad properties", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed nested properties=[] to {{}}")
        record("nested bad properties", True, "Middleware fixed nested properties.properties=[] to {}")


# ── 9. OpenClaw full payload (chat_template_kwargs + tools) ─

def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.
    """
    _banner("OpenClaw full payload (kitchen sink)")

    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_text(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")


# ── Main ─────────────────────────────────────────────────────

def main():
    """Run every test in order, print the summary, and return an exit status.

    An exception escaping a test (connection error, unexpected response
    shape, ...) is recorded as a failure of that test so the remaining tests
    and the summary still run.

    Returns:
        0 when every recorded result passed, 1 otherwise.
    """
    print(f"\n{'='*60}")
    print("Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")

    tests = (
        test_basic_nonstream,
        test_basic_stream,
        test_toolcall_nonstream,
        test_toolcall_stream,
        test_tool_response_flow,
        test_param_sweep,
        test_openclaw_tool_schema,
        test_nested_bad_properties,
        test_openclaw_full_payload,
    )
    for test in tests:
        try:
            test()
        except Exception as exc:  # suite boundary: isolate one test's crash
            record(test.__name__, False, f"Unhandled {type(exc).__name__}: {exc}")

    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")
    return 0 if passed == len(RESULTS) else 1


if __name__ == "__main__":
    raise SystemExit(main())