480 lines
17 KiB
Python
480 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
|
|
|
|
These tests send EXACTLY what OpenClaw would send to vLLM — including
|
|
chat_template_kwargs, logprobs, weird tool schemas, the works.
|
|
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
|
|
|
|
Architecture: this test → middleware (strips bad params) → SGLang
|
|
"""
|
|
|
|
import os
|
|
import time
|
|
import json
|
|
import httpx
|
|
from datetime import datetime
|
|
|
|
# All requests target the middleware endpoint, never SGLang itself.
# Each setting can be overridden through its DEVSTRAL_* environment variable.
API_BASE = os.getenv("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
API_KEY = os.getenv("DEVSTRAL_API_KEY", "whatever")
MODEL = os.getenv("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")

# One {"name", "pass", "detail"} dict is appended here per recorded check.
RESULTS = []
|
|
|
|
|
|
def ts():
    """Return the current local time formatted as ``HH:MM:SS.mmm``."""
    now = datetime.now()
    # Integer-dividing microseconds by 1000 truncates to milliseconds,
    # the same digits a six-digit %f field keeps after dropping its last three.
    return f"{now:%H:%M:%S}.{now.microsecond // 1000:03d}"
|
|
|
|
|
|
def record(name, ok, detail=""):
    """Print a PASS/FAIL line for one check and store it in ``RESULTS``.

    Args:
        name: Label of the check.
        ok: Truthy when the check passed.
        detail: Optional extra text, printed on its own line when non-empty.
    """
    entry = {"name": name, "pass": ok, "detail": detail}
    if ok:
        banner = "✓ PASS"
    else:
        banner = "✗ FAIL"
    print(f"\n{banner}: {name}")
    if detail:
        print(f" {detail}")
    RESULTS.append(entry)
|
|
|
|
|
|
def make_client():
    """Build an ``httpx.Client`` with a 120 s timeout, bearer auth and a JSON content type."""
    default_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=default_headers)
|
|
|
|
|
|
# ── 1. Basic non-streaming chat ──────────────────────────────
|
|
|
|
def test_basic_nonstream():
    """Send one minimal non-streaming chat request and record the outcome.

    Records PASS when the endpoint answers HTTP 200 with a non-empty assistant
    message. Transport errors, non-200 statuses, malformed bodies and empty
    replies are recorded as FAIL instead of raising, so the rest of the suite
    keeps running.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "stream": False,
        "max_tokens": 32,
    }
    try:
        with make_client() as c:
            r = c.post(f"{API_BASE}/chat/completions", json=payload)
    except httpx.HTTPError as exc:
        record("basic non-stream", False, f"Request failed: {exc!r}")
        return

    print(f"[{ts()}] Status: {r.status_code}")
    if r.status_code != 200:
        # Error bodies are not guaranteed to be JSON (for example a proxy
        # error page), so report the raw text instead of parsing it first.
        print(f"[{ts()}] Error: {r.text[:300]}")
        record("basic non-stream", False, f"HTTP {r.status_code}: {r.text[:200]}")
        return

    try:
        message = r.json()["choices"][0]["message"]
        # "content" may be present but null; normalise to an empty string.
        content = message.get("content") or ""
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        record("basic non-stream", False, f"Malformed response: {exc!r}")
        return

    print(f"[{ts()}] Reply: {content[:100]}")
    if content:
        record("basic non-stream", True, f"Got: {content[:80]}")
    else:
        record("basic non-stream", False, "HTTP 200 but the reply was empty")
|
|
|
|
|
|
# ── 2. Basic streaming chat ──────────────────────────────────
|
|
|
|
def test_basic_stream():
    """Send one streaming chat request and record whether any text arrived.

    The response is read as server-sent events. Every ``data:`` line that
    holds JSON contributes its ``choices[0].delta.content`` to the reply.
    Records PASS only when at least one content fragment was received;
    transport errors and non-200 statuses are recorded as FAIL.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Count from 1 to 5."}],
        "stream": True,
        "max_tokens": 64,
    }
    pieces = []
    try:
        with make_client() as c, c.stream(
            "POST", f"{API_BASE}/chat/completions", json=payload
        ) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            for line in r.iter_lines():
                # Accept both "data: {...}" and "data:{...}".
                if not line.startswith("data:"):
                    continue
                data = line[5:].strip()
                if data == "[DONE]":
                    break
                try:
                    chunk = json.loads(data)
                except json.JSONDecodeError:
                    # Skip lines that are not JSON payloads.
                    continue
                choices = chunk.get("choices") if isinstance(chunk, dict) else None
                if not choices:
                    continue
                # "delta" can be missing or null; treat both as empty.
                text = (choices[0].get("delta") or {}).get("content")
                if text:
                    pieces.append(text)
    except httpx.HTTPError as exc:
        record("basic stream", False, f"Request failed: {exc!r}")
        return

    full = "".join(pieces)
    print(f"[{ts()}] Reply: {full[:100]}")
    if full:
        record("basic stream", True, f"Got: {full[:80]}")
    else:
        # A 200 response that streamed no text proves nothing about streaming.
        record("basic stream", False, "HTTP 200 but no content was streamed")
|
|
|
|
|
|
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
|
|
|
|
def test_toolcall_nonstream():
    """Ask a weather question with a ``get_weather`` tool and expect a tool call.

    Records PASS when the non-streaming response contains at least one entry
    in ``message.tool_calls``. Transport errors, non-200 statuses, malformed
    bodies and plain-text answers are recorded as FAIL.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 256,
    }

    try:
        with make_client() as c:
            r = c.post(f"{API_BASE}/chat/completions", json=payload)
    except httpx.HTTPError as exc:
        record("tool call non-stream", False, f"Request failed: {exc!r}")
        return

    print(f"[{ts()}] Status: {r.status_code}")
    if r.status_code != 200:
        # Check the status before parsing: error bodies may not be JSON.
        print(f"[{ts()}] Error: {r.text[:300]}")
        record("tool call non-stream", False, f"HTTP {r.status_code}: {r.text[:200]}")
        return

    try:
        msg = r.json()["choices"][0]["message"]
        tool_calls = msg.get("tool_calls")
        if tool_calls:
            fn = tool_calls[0]["function"]
            tool_name, tool_args = fn["name"], fn.get("arguments")
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        record("tool call non-stream", False, f"Malformed response: {exc!r}")
        return

    if tool_calls:
        print(f"[{ts()}] Tool: {tool_name}, args: {tool_args}")
        record("tool call non-stream", True, f"Got tool call: {tool_name}")
    else:
        # "content" may be null, so normalise before slicing.
        content = msg.get("content") or ""
        print(f"[{ts()}] No tool call. Content: {content[:200]}")
        record("tool call non-stream", False, "Model did not call the tool")
|
|
|
|
|
|
# ── 4. Tool call — streaming ────────────────────────────────
|
|
|
|
def test_toolcall_stream():
    """Stream a weather question with a ``get_weather`` tool and expect a tool call.

    Tool-call deltas are gathered per ``index`` so that argument fragments of
    different calls are never spliced together. Records PASS when at least one
    call with a function name was streamed. Transport errors and non-200
    statuses are recorded as FAIL.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": True,
        "max_tokens": 256,
    }

    calls = {}  # tool-call index -> {"name": str | None, "args": [fragments]}
    content_pieces = []
    try:
        with make_client() as c, c.stream(
            "POST", f"{API_BASE}/chat/completions", json=payload
        ) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            for line in r.iter_lines():
                # Accept both "data: {...}" and "data:{...}".
                if not line.startswith("data:"):
                    continue
                data = line[5:].strip()
                if data == "[DONE]":
                    break
                try:
                    chunk = json.loads(data)
                except json.JSONDecodeError:
                    # Skip lines that are not JSON payloads.
                    continue
                choices = chunk.get("choices") if isinstance(chunk, dict) else None
                if not choices:
                    continue
                # "delta", "tool_calls" and "function" can each be null.
                delta = choices[0].get("delta") or {}
                for tc in delta.get("tool_calls") or []:
                    fn = tc.get("function") or {}
                    slot = calls.setdefault(
                        tc.get("index") or 0, {"name": None, "args": []}
                    )
                    if fn.get("name"):
                        slot["name"] = fn["name"]
                    if fn.get("arguments"):
                        slot["args"].append(fn["arguments"])
                if delta.get("content"):
                    content_pieces.append(delta["content"])
    except httpx.HTTPError as exc:
        record("tool call stream", False, f"Request failed: {exc!r}")
        return

    named = [
        (slot["name"], "".join(slot["args"]))
        for _, slot in sorted(calls.items())
        if slot["name"]
    ]
    if named:
        tool_name, accumulated_args = named[0]
        print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
        record("tool call stream", True, f"Got tool call: {tool_name}")
    else:
        content_parts = "".join(content_pieces)
        print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
        record("tool call stream", False, "Model did not call the tool")
|
|
|
|
|
|
# ── 5. Full tool response flow (non-streaming) ──────────────
|
|
|
|
def test_tool_response_flow():
    """Run a two-step tool round trip and check the model uses the tool result.

    Step 1 asks a weather question and expects a ``get_weather`` tool call.
    Step 2 sends back the assistant message plus a ``tool`` message carrying
    a canned result (22°C) and expects the final answer to mention "22".
    Every failure mode (transport error, non-200 status, malformed body,
    missing tool call or id, null final content) is recorded as FAIL rather
    than raised.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
    parse_errors = (ValueError, KeyError, IndexError, TypeError, AttributeError)

    try:
        with make_client() as c:
            r = c.post(f"{API_BASE}/chat/completions", json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
            })
            # Check the status before parsing: error bodies may not be JSON.
            if r.status_code != 200:
                record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
                return
            try:
                msg = r.json()["choices"][0]["message"]
                tool_calls = msg.get("tool_calls")
            except parse_errors as exc:
                record("tool response flow", False, f"Step 1 malformed response: {exc!r}")
                return
            if not tool_calls:
                record("tool response flow", False, "No tool call in step 1")
                return

            tc = tool_calls[0]
            tc_id = tc.get("id") if isinstance(tc, dict) else None
            if not tc_id:
                # Without an id the tool result cannot be tied to the call.
                record("tool response flow", False, "Tool call in step 1 has no id")
                return
            tc_name = (tc.get("function") or {}).get("name")
            print(f"[{ts()}] Tool call: {tc_name} (id={tc_id})")

            # Echo the assistant turn back, then answer it with the tool result.
            messages.append(msg)
            messages.append({
                "role": "tool",
                "tool_call_id": tc_id,
                "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
            })

            r2 = c.post(f"{API_BASE}/chat/completions", json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
            })
    except httpx.HTTPError as exc:
        record("tool response flow", False, f"Request failed: {exc!r}")
        return

    if r2.status_code != 200:
        print(f"[{ts()}] Step 2 error: {r2.text[:300]}")
        record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
        return

    try:
        # "content" may be null; normalise so the substring test cannot raise.
        final = r2.json()["choices"][0]["message"].get("content") or ""
    except parse_errors as exc:
        record("tool response flow", False, f"Step 2 malformed response: {exc!r}")
        return

    print(f"[{ts()}] Final: {final[:200]}")
    ok = "22" in final
    record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}")
|
|
|
|
|
|
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
|
|
|
|
def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.

    A baseline request is sent first, then one request per extra parameter
    added on top of it. Every request, the baseline included, is recorded
    as its own PASS/FAIL entry so the final tally reflects the whole sweep.
    A failing request never stops the remaining ones.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")

    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }

    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs", True),
        ("top_logprobs", 5),
    ]

    with make_client() as c:
        # The baseline shows whether the endpoint works without extra params.
        ok, summary = _sweep_probe(c, base_req)
        print(f"[{ts()}] Baseline: {summary}")
        record("param sweep: baseline", ok, summary)

        for name, val in extra_params:
            ok, summary = _sweep_probe(c, {**base_req, name: val})
            mark = "✓" if ok else "✗"
            print(f"[{ts()}] {mark} {name}={val!r} → {summary}")
            record(f"param sweep: {name}", ok, f"{summary} with {name}={val!r}")


def _sweep_probe(client, req):
    """POST *req* to the chat endpoint and return ``(ok, summary)``.

    ``ok`` is True only for HTTP 200. ``summary`` is a short description of
    the status or the error. Transport errors are returned as failures
    instead of being raised.
    """
    try:
        r = client.post(f"{API_BASE}/chat/completions", json=req)
    except httpx.HTTPError as exc:
        return False, f"request failed: {exc!r}"
    if r.status_code == 200:
        return True, "HTTP 200"
    try:
        err = r.json().get("error", {})
        # "error" is usually an object with a "message", but may be a plain string.
        message = err.get("message", "") if isinstance(err, dict) else err
    except (ValueError, AttributeError):
        # The body is not JSON, or not a JSON object: use the raw text.
        message = r.text
    return False, f"HTTP {r.status_code} {str(message or '')[:100]}"
|
|
|
|
|
|
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
|
|
|
|
def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.

    Records PASS when the request is answered with HTTP 200 and a JSON body
    that contains ``choices``. Transport errors, non-200 statuses and
    malformed bodies are recorded as FAIL.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")

    # This is the exact shape OpenClaw sends for tools with no params
    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            }
        }
    }]
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Search for cats"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }

    try:
        with make_client() as c:
            r = c.post(f"{API_BASE}/chat/completions", json=payload)
    except httpx.HTTPError as exc:
        record("openclaw tool schema", False, f"Request failed: {exc!r}")
        return

    print(f"[{ts()}] Status: {r.status_code}")
    if r.status_code != 200:
        # Check the status before parsing: error bodies may not be JSON.
        print(f"[{ts()}] Error: {r.text[:300]}")
        record("openclaw tool schema", False, f"HTTP {r.status_code}: {r.text[:200]}")
        return

    try:
        has_choices = bool(r.json().get("choices"))
    except (ValueError, AttributeError) as exc:
        record("openclaw tool schema", False, f"Malformed response: {exc!r}")
        return
    if not has_choices:
        record("openclaw tool schema", False, "HTTP 200 but the body has no choices")
        return

    print(f"[{ts()}] Success — middleware fixed the bad schema")
    record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
|
|
|
|
|
|
# ── 8. OpenClaw full payload (chat_template_kwargs + tools) ─
|
|
|
|
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.

    Records PASS when the combined request is answered with HTTP 200 and a
    well-formed chat message, whether or not the model calls the tool.
    Transport errors, non-200 statuses and malformed bodies are recorded as
    FAIL.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Search for the weather in NYC"},
        ],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 256,
        "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
        "logprobs": True,  # Bad — middleware must strip
        "top_logprobs": 5,  # Bad — middleware must strip
    }

    try:
        with make_client() as c:
            r = c.post(f"{API_BASE}/chat/completions", json=payload)
    except httpx.HTTPError as exc:
        record("openclaw full payload", False, f"Request failed: {exc!r}")
        return

    print(f"[{ts()}] Status: {r.status_code}")
    if r.status_code != 200:
        # Check the status before parsing: error bodies may not be JSON.
        print(f"[{ts()}] Error: {r.text[:300]}")
        record("openclaw full payload", False, f"HTTP {r.status_code}: {r.text[:200]}")
        return

    try:
        msg = r.json()["choices"][0]["message"]
        tool_calls = msg.get("tool_calls")
        tool_name = tool_calls[0]["function"]["name"] if tool_calls else None
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        record("openclaw full payload", False, f"Malformed response: {exc!r}")
        return

    print(f"[{ts()}] Success — middleware cleaned everything")
    if tool_calls:
        print(f"[{ts()}] Tool call: {tool_name}")
    else:
        # "content" may be null, so normalise before slicing.
        print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
    record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
|
|
|
|
|
|
# ── Main ─────────────────────────────────────────────────────
|
|
|
|
def main():
    """Run every test in order, print a summary and return an exit code.

    Each test runs in isolation: if one raises, the exception is recorded as
    a FAIL for that test and the remaining tests still run.

    Returns:
        0 when every recorded check passed, 1 otherwise.
    """
    print(f"\n{'='*60}")
    print("Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")

    tests = (
        test_basic_nonstream,
        test_basic_stream,
        test_toolcall_nonstream,
        test_toolcall_stream,
        test_tool_response_flow,
        test_param_sweep,
        test_openclaw_tool_schema,
        test_openclaw_full_payload,
    )
    for test in tests:
        try:
            test()
        except Exception as exc:
            # Top-level boundary: a crash in one test (connection refused,
            # unexpected response shape, ...) must not hide later results.
            record(test.__name__, False, f"Unhandled {type(exc).__name__}: {exc}")

    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")

    return 0 if passed == len(RESULTS) else 1


if __name__ == "__main__":
    # Propagate the result so shells and CI can detect a failed run.
    raise SystemExit(main())
|