#!/usr/bin/env python3
"""
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.

These tests send EXACTLY what OpenClaw would send to vLLM — including
chat_template_kwargs, logprobs, weird tool schemas, the works.
The middleware's job is to strip/fix all of it so SGLang doesn't choke.

Architecture: this test → middleware (strips bad params) → SGLang
"""

import os
import time
import json
import httpx
from datetime import datetime

# Point at the middleware, NOT SGLang directly
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")

# One dict per recorded outcome: {"name": ..., "pass": ..., "detail": ...}
RESULTS = []


def ts():
    """Return the current local time as ``HH:MM:SS.mmm`` for log prefixes."""
    stamp = datetime.now().strftime("%H:%M:%S.%f")
    # %f renders six microsecond digits; trimming three leaves milliseconds.
    return stamp[:-3]


def record(name, ok, detail=""):
    """Print a PASS/FAIL banner for one check and append the outcome to RESULTS.

    Args:
        name: Label shown in the banner and in the final summary.
        ok: Truthy for a pass, falsy for a failure.
        detail: Optional extra line printed under the banner.
    """
    if ok:
        banner = "✓ PASS"
    else:
        banner = "✗ FAIL"
    print(f"\n{banner}: {name}")
    if detail:
        print(f" {detail}")
    outcome = {"name": name, "pass": ok, "detail": detail}
    RESULTS.append(outcome)


def make_client():
    """Build an ``httpx.Client`` with a 120 s timeout and bearer/JSON headers."""
    default_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=default_headers)
# ── 1. Basic non-streaming chat ──────────────────────────────

def test_basic_nonstream():
    """Send one plain non-streaming chat request and record the outcome.

    A failure is recorded (rather than an exception raised) when the HTTP
    status is not 200, when the body is not JSON, or when the body lacks
    ``choices[0].message.content``. A null ``content`` is treated as "".
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
            "max_tokens": 32,
        })
        print(f"[{ts()}] Status: {r.status_code}")

        # Error responses from a proxy/middleware are not guaranteed to be
        # JSON, so decode defensively instead of letting the suite crash.
        try:
            body = r.json()
        except ValueError:
            body = None

        if r.status_code != 200:
            pretty = r.text if body is None else json.dumps(body, indent=2)
            brief = r.text if body is None else json.dumps(body)
            print(f"[{ts()}] Error: {pretty}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {brief[:200]}")
            return

        try:
            # `content` may legitimately be null; normalise so slicing is safe.
            content = body["choices"][0]["message"]["content"] or ""
        except (KeyError, IndexError, TypeError):
            record("basic non-stream", False, f"Malformed response body: {r.text[:200]}")
            return

        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")


# ── 2. Basic streaming chat ──────────────────────────────────

def test_basic_stream():
    """Stream one chat completion and record whether any content arrived.

    Server-sent-event lines are parsed best-effort: lines that are not
    ``data: ...`` payloads, the ``[DONE]`` sentinel, and payloads that are
    not valid JSON are skipped. The test passes only if at least one
    content delta was received, so an empty stream is reported as a failure.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Count from 1 to 5."}],
            "stream": True,
            "max_tokens": 64,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return

            pieces = []
            for line in r.iter_lines():
                if line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    # Best-effort: ignore a malformed chunk, keep reading.
                    continue
                if not isinstance(chunk, dict) or not chunk.get("choices"):
                    continue
                # `delta` can be absent or null on bookkeeping chunks.
                delta = chunk["choices"][0].get("delta") or {}
                if delta.get("content"):
                    pieces.append(delta["content"])

            full = "".join(pieces)
            print(f"[{ts()}] Reply: {full[:100]}")
            if full:
                record("basic stream", True, f"Got: {full[:80]}")
            else:
                record("basic stream", False, "Stream finished without any content")
# ── Shared helpers for the tool-call tests ───────────────────

def _json_or_none(response):
    """Decode *response* as JSON; return None when the body is not valid JSON."""
    try:
        return response.json()
    except ValueError:
        return None


def _error_text(response, body, indent=None):
    """Render an error body for logs: JSON dump if decoded, raw text otherwise."""
    if body is None:
        return response.text
    return json.dumps(body, indent=indent)


def _first_message(body):
    """Return ``body["choices"][0]["message"]`` as a dict, or None if absent."""
    try:
        msg = body["choices"][0]["message"]
    except (KeyError, IndexError, TypeError):
        return None
    return msg if isinstance(msg, dict) else None


def _weather_tools():
    """Return a fresh ``get_weather`` tool list in the OpenAI function schema."""
    return [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]


# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───

def test_toolcall_nonstream():
    """Ask a weather question with tools enabled and expect a tool call back.

    Passes when the first choice carries a non-empty ``tool_calls`` list.
    Non-200 statuses, non-JSON bodies and bodies without
    ``choices[0].message`` are recorded as failures instead of raising.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": _weather_tools(),
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_none(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {_error_text(r, body, indent=2)}")
            record("tool call non-stream", False,
                   f"HTTP {r.status_code}: {_error_text(r, body)[:200]}")
            return
        msg = _first_message(body)
        if msg is None:
            record("tool call non-stream", False, f"Malformed response body: {r.text[:200]}")
            return
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            # `content` may be present but null; normalise before slicing.
            content = msg.get("content") or ""
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")


# ── 4. Tool call — streaming ────────────────────────────────

def test_toolcall_stream():
    """Stream a tool-enabled request and expect a tool-call name in the deltas.

    Argument fragments from every tool-call delta are concatenated in arrival
    order; the last non-empty function name seen is reported. Malformed SSE
    payloads are skipped best-effort.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": _weather_tools(),
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return

            tool_name = None
            arg_parts = []
            content_parts = []
            for line in r.iter_lines():
                if line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                if not isinstance(chunk, dict) or not chunk.get("choices"):
                    continue
                # `delta`, `tool_calls` and `function` may each be null/absent.
                delta = chunk["choices"][0].get("delta") or {}
                for tc in delta.get("tool_calls") or []:
                    fn = tc.get("function") or {}
                    if fn.get("name"):
                        tool_name = fn["name"]
                    if fn.get("arguments"):
                        arg_parts.append(fn["arguments"])
                if delta.get("content"):
                    content_parts.append(delta["content"])

            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {''.join(arg_parts)}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {''.join(content_parts)[:200]}")
                record("tool call stream", False, "Model did not call the tool")


# ── 5. Full tool response flow (non-streaming) ──────────────

def test_tool_response_flow():
    """Run a two-step tool round trip and check the model uses the tool result.

    Step 1 requests a tool call. Step 2 replays the assistant message plus a
    synthetic ``role: tool`` result and passes when the final answer contains
    the string "22" (the temperature in the fake result).
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")

    tools = _weather_tools()
    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = _first_message(_json_or_none(r))
        if msg is None or not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return

        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")

        # Replay the assistant turn verbatim, then answer its tool call.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })

        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = _json_or_none(r2)
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {_error_text(r2, body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return
        msg2 = _first_message(body2)
        if msg2 is None:
            record("tool response flow", False, f"Step 2 malformed body: {r2.text[:200]}")
            return

        # A null `content` must not break the substring check below.
        final = msg2.get("content") or ""
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}")


# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────

def test_param_sweep():
    """Send a baseline request, then one request per extra parameter.

    Every request (baseline included) is recorded as a pass or failure based
    on whether it returns HTTP 200, so a clean sweep is visible in the final
    summary rather than only the failures.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")

    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }

    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs", True),
        ("top_logprobs", 5),
    ]

    with make_client() as c:
        # baseline
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")
        record("param sweep: baseline", r.status_code == 200, f"HTTP {r.status_code}")

        for name, val in extra_params:
            req = {**base_req, name: val}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            ok = r.status_code == 200
            status = "✓" if ok else "✗"
            detail = ""
            if not ok:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except (ValueError, AttributeError, TypeError):
                    # Not JSON, or not the {"error": {"message": str}} shape.
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}")
            if ok:
                record(f"param sweep: {name}", True, f"HTTP {r.status_code} with {name}={val!r}")
            else:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}")


# ── 7. OpenClaw-style tool schema (the one that caused 400) ─

def test_openclaw_tool_schema():
    """Send a tool whose ``parameters.properties`` is ``[]`` instead of ``{}``.

    Passes when the request returns HTTP 200, i.e. the malformed schema did
    not cause a rejection.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")

    # This is the exact shape OpenClaw sends for tools with no params
    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Search for cats"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 128,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_none(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {_error_text(r, body, indent=2)[:300]}")
            record("openclaw tool schema", False,
                   f"HTTP {r.status_code}: {_error_text(r, body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed the bad schema")
        record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")


# ── 8. OpenClaw full payload (chat_template_kwargs + tools) ─

def test_openclaw_full_payload():
    """Send the combined payload: bad tool schema plus vLLM-only parameters.

    Passes when the request returns HTTP 200 with a well-formed first
    message; whether the model chose to call the tool is only logged.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = _json_or_none(r)
        if r.status_code != 200:
            print(f"[{ts()}] Error: {_error_text(r, body, indent=2)[:300]}")
            record("openclaw full payload", False,
                   f"HTTP {r.status_code}: {_error_text(r, body)[:200]}")
            return
        msg = _first_message(body)
        if msg is None:
            record("openclaw full payload", False, f"Malformed response body: {r.text[:200]}")
            return
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            # `content` may be null when present; normalise before slicing.
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")


# ── Main ─────────────────────────────────────────────────────

def main():
    """Run every test in order, print a summary, and return an exit code.

    A test that raises is recorded as a failure so the remaining tests and
    the summary still run.

    Returns:
        0 if at least one result was recorded and all of them passed,
        1 otherwise.
    """
    print(f"\n{'='*60}")
    print("Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")

    tests = (
        test_basic_nonstream,
        test_basic_stream,
        test_toolcall_nonstream,
        test_toolcall_stream,
        test_tool_response_flow,
        test_param_sweep,
        test_openclaw_tool_schema,
        test_openclaw_full_payload,
    )
    for test in tests:
        try:
            test()
        except Exception as exc:
            # Top-level boundary: report the crash and keep the suite going.
            record(f"{test.__name__} (crashed)", False, f"{type(exc).__name__}: {exc}")

    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")

    return 0 if RESULTS and passed == len(RESULTS) else 1


if __name__ == "__main__":
    # Propagate the outcome to the shell so CI can detect failures.
    raise SystemExit(main())
"top_logprobs": 5 } ) @@ -211,7 +217,10 @@ def test_different_content_formats(): "messages": msgs, "tools": tools, "stream": False, - "max_tokens": 128 + "max_tokens": 128, + "chat_template_kwargs": {"enable_thinking": False}, + "logprobs": True, + "top_logprobs": 5 } ) diff --git a/test_tool_response.py b/test_tool_response.py index b3cd048..b3853ab 100644 --- a/test_tool_response.py +++ b/test_tool_response.py @@ -94,7 +94,10 @@ def test_tool_call_response_flow(streaming: bool = True): "tools": tools, "tool_choice": "auto", "stream": True, - "max_tokens": 512 + "max_tokens": 512, + "chat_template_kwargs": {"enable_thinking": False}, + "logprobs": True, + "top_logprobs": 5 } ) as response: print(f"[{timestamp()}] Response status: {response.status_code}") @@ -152,7 +155,10 @@ def test_tool_call_response_flow(streaming: bool = True): "tools": tools, "tool_choice": "auto", "stream": False, - "max_tokens": 512 + "max_tokens": 512, + "chat_template_kwargs": {"enable_thinking": False}, + "logprobs": True, + "top_logprobs": 5 } ) @@ -224,7 +230,10 @@ def test_tool_call_response_flow(streaming: bool = True): "messages": messages, "tools": tools, "stream": True, - "max_tokens": 512 + "max_tokens": 512, + "chat_template_kwargs": {"enable_thinking": False}, + "logprobs": True, + "top_logprobs": 5 } ) as response: for line in response.iter_lines(): @@ -258,7 +267,10 @@ def test_tool_call_response_flow(streaming: bool = True): "messages": messages, "tools": tools, "stream": False, - "max_tokens": 512 + "max_tokens": 512, + "chat_template_kwargs": {"enable_thinking": False}, + "logprobs": True, + "top_logprobs": 5 } ) @@ -358,7 +370,10 @@ def test_tool_response_with_debug_info(): "tools": tools, "tool_choice": "auto", "stream": False, - "max_tokens": 256 + "max_tokens": 256, + "chat_template_kwargs": {"enable_thinking": False}, + "logprobs": True, + "top_logprobs": 5 } ) @@ -401,7 +416,10 @@ def test_tool_response_with_debug_info(): "messages": messages, "tools": tools, 
"stream": False, - "max_tokens": 256 + "max_tokens": 256, + "chat_template_kwargs": {"enable_thinking": False}, + "logprobs": True, + "top_logprobs": 5 } )