# Files
# model-tool-tests/test_devstral.py
#
# 480 lines
# 17 KiB
# Python
#
#!/usr/bin/env python3
"""
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
These tests send EXACTLY what OpenClaw would send to vLLM — including
chat_template_kwargs, logprobs, weird tool schemas, the works.
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
Architecture: this test → middleware (strips bad params) → SGLang
"""
import os
import time
import json
import httpx
from datetime import datetime
# Point at the middleware, NOT SGLang directly
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")
RESULTS = []
def ts():
    """Return the current wall-clock time as HH:MM:SS.mmm (millisecond precision)."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
def record(name, ok, detail=""):
    """Print a PASS/FAIL line for one test and append its outcome to RESULTS."""
    label = "✓ PASS" if ok else "✗ FAIL"
    print(f"\n{label}: {name}")
    if detail:
        print(f" {detail}")
    RESULTS.append({"name": name, "pass": ok, "detail": detail})
def make_client():
    """Build an httpx client with bearer auth and a long timeout for slow generations."""
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=headers)
# ── 1. Basic non-streaming chat ──────────────────────────────
def test_basic_nonstream():
    """POST a plain non-streaming chat request and record whether a reply came back."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "stream": False,
        "max_tokens": 32,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        content = body["choices"][0]["message"]["content"]
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
# ── 2. Basic streaming chat ──────────────────────────────────
def test_basic_stream():
    """POST a streaming chat request and accumulate delta content from SSE lines."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Count from 1 to 5."}],
        "stream": True,
        "max_tokens": 64,
    }
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json=payload) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            full = ""
            for line in r.iter_lines():
                # Skip keep-alives, the terminator, and anything not an SSE data line.
                if not line or line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                choices = chunk.get("choices")
                if not choices:
                    continue
                piece = choices[0].get("delta", {}).get("content")
                if piece:
                    full += piece
            print(f"[{ts()}] Reply: {full[:100]}")
            record("basic stream", True, f"Got: {full[:80]}")
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
def test_toolcall_nonstream():
    """Send a vLLM-style tool schema and check the model emits a tool call."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"},
                },
                "required": ["location"],
            },
        },
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": [weather_tool],
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        calls = msg.get("tool_calls")
        if calls:
            fn = calls[0]["function"]
            print(f"[{ts()}] Tool: {fn['name']}, args: {fn['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {fn['name']}")
        else:
            content = msg.get("content", "")
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
# ── 4. Tool call — streaming ────────────────────────────────
def test_toolcall_stream():
    """Stream a tool-call request, accumulating the tool name/arguments from deltas."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"},
                },
                "required": ["location"],
            },
        },
    }
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": [weather_tool],
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            tool_name = None
            accumulated_args = ""
            content_parts = ""
            for line in r.iter_lines():
                # Skip blanks, the terminator, and non-data SSE lines.
                if not line or line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                choices = chunk.get("choices")
                if not choices:
                    continue
                delta = choices[0].get("delta", {})
                # The name typically arrives once; arguments arrive as fragments.
                for tc in delta.get("tool_calls") or []:
                    fn = tc.get("function", {})
                    if fn.get("name"):
                        tool_name = fn["name"]
                    if fn.get("arguments"):
                        accumulated_args += fn["arguments"]
                if delta.get("content"):
                    content_parts += delta["content"]
            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
                record("tool call stream", False, "Model did not call the tool")
# ── 5. Full tool response flow (non-streaming) ──────────────
def test_tool_response_flow():
    """Two-step flow: elicit a tool call, feed back a fabricated result, check the final reply."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"},
                },
                "required": ["location"],
            },
        },
    }
    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
    with make_client() as c:
        # Step 1: ask a question that should trigger the weather tool.
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": [weather_tool],
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return
        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")
        # Step 2: echo the assistant turn plus a canned tool result back.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })
        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": [weather_tool],
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return
        final = body2["choices"][0]["message"].get("content", "")
        print(f"[{ts()}] Final: {final[:200]}")
        # The canned result said 22°C — the model should surface that number.
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'}{final[:100]}")
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
def test_param_sweep():
    """
    Send every optional parameter OpenClaw/vLLM might include, one at a time.

    Each request is the baseline payload plus a single extra param; the
    middleware must strip or translate the ones SGLang rejects, so any
    non-200 response is recorded as a failure for that parameter.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")
    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs", True),
        ("top_logprobs", 5),
    ]
    with make_client() as c:
        # Baseline request with no extras, for comparison in the log.
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")
        for name, val in extra_params:
            req = {**base_req, name: val}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            # BUG FIX: both branches were empty strings (unicode markers lost
            # in transit); restore ✓/✗ to match record()'s "✓ PASS"/"✗ FAIL".
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    # Response wasn't JSON; fall back to the raw body text.
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}")
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
def test_openclaw_tool_schema():
    """
    Reproduce the exact OpenClaw tool schema where parameters.properties
    is [] instead of {} — the middleware must rewrite it before SGLang.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")
    # Exact shape OpenClaw sends for tools with no params; the empty LIST
    # (instead of an empty object) is what triggered the original 400.
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {"type": "object", "properties": []},
        },
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Search for cats"}],
            "tools": [bad_tool],
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 128,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed the bad schema")
        record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
# ── 8. OpenClaw full payload (chat_template_kwargs + tools) ─
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + a tool with the bad
    properties=[] schema — exactly what OpenClaw sends through the pipe.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": [],  # Bad — middleware must fix
            },
        },
    }
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Search for the weather in NYC"},
        ],
        "tools": [bad_tool],
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 256,
        "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
        "logprobs": True,  # Bad — middleware must strip
        "top_logprobs": 5,  # Bad — middleware must strip
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            print(f"[{ts()}] No tool call, content: {msg.get('content', '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
# ── Main ─────────────────────────────────────────────────────
def main():
    """Run every test in order, then print a pass/fail summary from RESULTS."""
    print(f"\n{'='*60}")
    print(f"Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")
    test_basic_nonstream()
    test_basic_stream()
    test_toolcall_nonstream()
    test_toolcall_stream()
    test_tool_response_flow()
    test_param_sweep()
    test_openclaw_tool_schema()
    test_openclaw_full_payload()
    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        # BUG FIX: both branches were empty strings (unicode markers lost
        # in transit); restore ✓/✗ to match record()'s "✓ PASS"/"✗ FAIL".
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()