consolidate to run_suite.py: single pluggable test suite, all models 84/84

This commit is contained in:
Jinx
2026-04-12 21:59:03 +00:00
parent 2fa811b2e2
commit 1beaa23c58
7 changed files with 826 additions and 1661 deletions

2
.gitignore vendored
View File

@@ -1 +1,3 @@
.env
models.env
__pycache__/

815
run_suite.py Normal file
View File

@@ -0,0 +1,815 @@
#!/usr/bin/env python3
"""
Universal model tool-call test suite.
Tests any OpenAI-compatible endpoint for:
1. Basic chat (non-streaming + streaming)
2. Tool calls (non-streaming + streaming)
3. Multi-turn tool response flow (non-streaming + streaming)
4. Nested/bad tool schema handling (SGLang compatibility)
5. Streaming tool call chunking (are args actually streamed?)
6. Param sweep (what vLLM params does the endpoint accept?)
Handles reasoning models (content in 'reasoning' field, null 'content'),
different finish_reason values, and empty/tool_calls arrays gracefully.
Usage:
TOOLTEST_API_BASE=... TOOLTEST_API_KEY=... TOOLTEST_MODEL=... python3 run_suite.py
python3 run_suite.py --all
python3 run_suite.py --model 1
python3 run_suite.py --filter Devstral
"""
import os
import sys
import json
import time
import httpx
import argparse
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field
# ── Helpers ──────────────────────────────────────────────────
def ts():
    """Return the current local time as HH:MM:SS.mmm (millisecond precision)."""
    stamp = datetime.now()
    return stamp.strftime("%H:%M:%S.%f")[:-3]
def safe_choice(body: dict, index: int = 0) -> dict:
    """Return ``body["choices"][index]``, or ``{}`` when absent/out of range."""
    available = body.get("choices") or []
    return available[index] if index < len(available) else {}
def safe_message(body: dict) -> dict:
    """Return the ``message`` dict of the first choice, or ``{}`` when missing."""
    # Inlined first-choice lookup: tolerate absent/null "choices" and "message".
    first_choices = body.get("choices") or []
    if not first_choices:
        return {}
    return first_choices[0].get("message") or {}
def safe_delta(chunk: dict) -> dict:
    """Return the ``delta`` of a streaming chunk's first choice, or ``{}``."""
    for first in chunk.get("choices") or []:
        # Only the first choice matters; bail out immediately.
        return first.get("delta") or {}
    return {}
def extract_content(msg: dict) -> tuple[str, str]:
    """Extract ``(content, reasoning)`` from a message, coercing nulls to ''."""
    return (msg.get("content") or "", msg.get("reasoning") or "")
# ── Config ───────────────────────────────────────────────────
@dataclass
class ModelConfig:
    """Connection details for one OpenAI-compatible endpoint."""
    api_base: str  # e.g. "http://host/v1"
    api_key: str   # bearer token
    model: str     # full model id, possibly "org/name"

    @property
    def label(self) -> str:
        """Short display name: the final path segment of the model id."""
        return self.model.rsplit("/", 1)[-1]
def load_models_env(path: Path) -> list[ModelConfig]:
    """Load models from the models.env file (pipe-delimited).

    Each non-blank, non-comment line is ``api_base | api_key | model``.
    Lines with fewer than three fields are skipped silently.
    """
    loaded: list[ModelConfig] = []
    for raw in path.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        fields = [piece.strip() for piece in entry.split("|")]
        if len(fields) < 3:
            continue
        loaded.append(ModelConfig(api_base=fields[0], api_key=fields[1], model=fields[2]))
    return loaded
def config_from_env() -> ModelConfig | None:
    """Build a single config from TOOLTEST_* env vars, or None if any is unset."""
    base = os.environ.get("TOOLTEST_API_BASE")
    key = os.environ.get("TOOLTEST_API_KEY")
    model = os.environ.get("TOOLTEST_MODEL")
    if not (base and key and model):
        return None
    return ModelConfig(api_base=base, api_key=key, model=model)
# ── Test result types ────────────────────────────────────────
@dataclass
class TestResult:
    """Outcome of one individual test against one endpoint."""
    name: str                # short test label, e.g. "basic non-stream"
    passed: bool             # True when the test succeeded
    detail: str = ""         # human-readable explanation / response excerpt
    duration_s: float = 0.0  # wall-clock duration of the test in seconds
@dataclass
class SuiteResult:
    """Aggregated test results for one model."""
    model: str  # full model id the suite ran against
    results: list[TestResult] = field(default_factory=list)

    @property
    def passed(self) -> int:
        """Count of passing tests."""
        return len([outcome for outcome in self.results if outcome.passed])

    @property
    def total(self) -> int:
        """Total number of recorded tests."""
        return len(self.results)
def make_client(cfg: ModelConfig) -> httpx.Client:
    """Build an httpx client with bearer auth and a generous 120 s timeout."""
    auth_headers = {
        "Authorization": f"Bearer {cfg.api_key}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=auth_headers)
# ── Shared tool definitions ──────────────────────────────────
# Well-formed single-parameter tool used by the basic tool-call tests.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
            },
            "required": ["location"]
        }
    }
}
# Two-parameter tool with a long string argument — used to check whether
# streaming endpoints actually chunk tool-call arguments.
WRITE_FILE_TOOL = {
    "type": "function",
    "function": {
        "name": "write_file",
        "description": "Write content to a file.",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {"type": "string", "description": "Name of the file"},
                "content": {"type": "string", "description": "The content to write"}
            },
            "required": ["filename", "content"]
        }
    }
}
# Deliberately malformed schema (properties is a list, not an object) —
# exercises endpoint/middleware schema validation.
BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Search the web",
        "parameters": {
            "type": "object",
            "properties": []  # Invalid — should be {}
        }
    }
}
# Same malformation, but nested inside an array item schema.
NESTED_BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "message",
        "description": "Send a message",
        "parameters": {
            "type": "object",
            "properties": {
                "fields": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": []  # Invalid — should be {}
                    }
                }
            }
        }
    }
}
# ── Test functions ───────────────────────────────────────────
def test_basic_nonstream(cfg: ModelConfig) -> TestResult:
    """1. Basic non-streaming chat."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Say hello in one word."}],
                "stream": False,
                "max_tokens": 64,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code != 200:
                return TestResult("basic non-stream", False,
                                  f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", elapsed)
            content, reasoning = extract_content(safe_message(body))
            fr = safe_choice(body).get("finish_reason", "?")
            # Prefer normal content; accept reasoning-only replies; fail on empty.
            if content:
                return TestResult("basic non-stream", True, f"Got: {content[:80]}", elapsed)
            if reasoning:
                return TestResult("basic non-stream", True,
                                  f"Reasoning-only (finish: {fr}): {reasoning[:80]}", elapsed)
            return TestResult("basic non-stream", False, f"Empty response (finish: {fr})", elapsed)
        except Exception as e:
            return TestResult("basic non-stream", False, f"Exception: {e}", time.time() - start)
def test_basic_stream(cfg: ModelConfig) -> TestResult:
    """2. Basic streaming chat.

    Consumes the SSE stream and accumulates both 'content' and 'reasoning'
    deltas; passes when either is non-empty.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Count from 1 to 5."}],
                "stream": True,
                "max_tokens": 64,
            }) as r:
                if r.status_code != 200:
                    # Drain the error body for the failure detail.
                    body = "".join(r.iter_lines())
                    dur = time.time() - start
                    return TestResult("basic stream", False, f"HTTP {r.status_code}: {body[:200]}", dur)
                full_content = ""
                full_reasoning = ""
                for line in r.iter_lines():
                    # Skip keep-alives and the SSE terminator.
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            if delta.get("content"):
                                full_content += delta["content"]
                            if delta.get("reasoning"):
                                full_reasoning += delta["reasoning"]
                        except json.JSONDecodeError:
                            pass  # ignore malformed SSE payloads
                dur = time.time() - start
                if full_content:
                    return TestResult("basic stream", True, f"Got: {full_content[:80]}", dur)
                elif full_reasoning:
                    return TestResult("basic stream", True, f"Reasoning-only: {full_reasoning[:80]}", dur)
                else:
                    return TestResult("basic stream", False, "No content or reasoning received", dur)
        except Exception as e:
            return TestResult("basic stream", False, f"Exception: {e}", time.time() - start)
def test_toolcall_nonstream(cfg: ModelConfig) -> TestResult:
    """3. Tool call — non-streaming."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code != 200:
                return TestResult("tool call non-stream", False,
                                  f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", elapsed)
            msg = safe_message(body)
            calls = msg.get("tool_calls") or []
            if not calls:
                # Model answered in prose (or reasoning) instead of calling the tool.
                content, reasoning = extract_content(msg)
                out = content or reasoning or "(empty)"
                return TestResult("tool call non-stream", False,
                                  f"No tool call. Response: {out[:100]}", elapsed)
            fn = calls[0].get("function", {})
            return TestResult("tool call non-stream", True,
                              f"Tool: {fn.get('name','?')}, args: {fn.get('arguments','')[:60]}", elapsed)
        except Exception as e:
            return TestResult("tool call non-stream", False, f"Exception: {e}", time.time() - start)
def test_toolcall_stream(cfg: ModelConfig) -> TestResult:
    """4. Tool call — streaming.

    Reassembles the tool call from SSE deltas: the name arrives once,
    arguments arrive as string fragments that must be concatenated.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 256,
            }) as r:
                if r.status_code != 200:
                    body = "".join(r.iter_lines())  # drained but not reported below
                    dur = time.time() - start
                    return TestResult("tool call stream", False, f"HTTP {r.status_code}", dur)
                tool_name = None
                accumulated_args = ""
                content_parts = ""
                reasoning_parts = ""
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            tc_list = delta.get("tool_calls") or []
                            for tc in tc_list:
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_parts += delta["content"]
                            if delta.get("reasoning"):
                                reasoning_parts += delta["reasoning"]
                        except json.JSONDecodeError:
                            pass  # ignore malformed SSE payloads
                dur = time.time() - start
                if tool_name:
                    return TestResult("tool call stream", True,
                                      f"Tool: {tool_name}, args: {accumulated_args[:60]}", dur)
                else:
                    out = content_parts or reasoning_parts or "(empty)"
                    return TestResult("tool call stream", False, f"No tool call. Response: {out[:100]}", dur)
        except Exception as e:
            return TestResult("tool call stream", False, f"Exception: {e}", time.time() - start)
def test_tool_response_flow(cfg: ModelConfig, streaming: bool = False) -> TestResult:
    """5/6. Full tool call → response → follow-up flow.

    Step 1 requests a tool call (streamed or not, per *streaming*);
    step 2 appends a fabricated tool result; step 3 asks for the follow-up
    (always non-streaming) and checks the model actually used the result.
    """
    label = "tool response flow (stream)" if streaming else "tool response flow"
    with make_client(cfg) as c:
        start = time.time()
        try:
            messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
            # Step 1: Get tool call
            if not streaming:
                r = c.post(f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 256,
                })
                body = r.json()
                if r.status_code != 200:
                    return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                msg = safe_message(body)
            else:
                tool_name = None
                tool_id = None
                accumulated_args = ""
                with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 256,
                }) as r:
                    if r.status_code != 200:
                        return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                    for line in r.iter_lines():
                        if not line or line == "data: [DONE]":
                            continue
                        if line.startswith("data: "):
                            try:
                                chunk = json.loads(line[6:])
                                delta = safe_delta(chunk)
                                for tc in (delta.get("tool_calls") or []):
                                    if tc.get("id"):
                                        tool_id = tc["id"]
                                    fn = tc.get("function") or {}
                                    if fn.get("name"):
                                        tool_name = fn["name"]
                                    if fn.get("arguments"):
                                        accumulated_args += fn["arguments"]
                            except json.JSONDecodeError:
                                pass  # ignore malformed SSE payloads
                if not tool_name:
                    return TestResult(label, False, "No tool call in step 1", time.time() - start)
                # Rebuild an assistant message from the streamed fragments so
                # step 3 can replay it as conversation history.
                msg = {
                    "role": "assistant",
                    "tool_calls": [{
                        "id": tool_id or "call_0",
                        "type": "function",
                        "function": {"name": tool_name, "arguments": accumulated_args}
                    }]
                }
            tool_calls = msg.get("tool_calls") or []
            if not tool_calls:
                return TestResult(label, False, "No tool call in step 1", time.time() - start)
            tc = tool_calls[0]
            tc_id = tc.get("id", "call_0")
            # Step 2: Send tool response (fabricated weather data)
            messages.append(msg)
            messages.append({
                "role": "tool",
                "tool_call_id": tc_id,
                "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
            })
            # Step 3: Get follow-up (non-streaming even in streaming mode)
            r2 = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": messages,
                "tools": [WEATHER_TOOL],
                "stream": False,
                "max_tokens": 256,
            })
            body2 = r2.json()
            dur = time.time() - start
            if r2.status_code != 200:
                return TestResult(label, False, f"Step 3 HTTP {r2.status_code}", dur)
            final_msg = safe_message(body2)
            final_content, final_reasoning = extract_content(final_msg)
            final = final_content or final_reasoning or ""
            # Check the model actually used the tool data: "22" must appear
            # and no "can't access real-time data" style disclaimer.
            ok = "22" in final
            indicators = ["i don't have", "i cannot access", "don't have access", "cannot provide real-time"]
            for ind in indicators:
                if ind in final.lower():
                    ok = False
                    break
            if not final_content and final_reasoning:
                return TestResult(label, ok, f"Reasoning-only (used data: {'yes' if ok else 'no'}) — {final[:100]}", dur)
            return TestResult(label, ok, f"{'Used' if ok else 'Did NOT use'} tool result — {final[:100]}", dur)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
def test_bad_tool_schema(cfg: ModelConfig) -> TestResult:
    """7. OpenClaw-style tool with properties=[] (tests schema validation/middleware)."""
    label = "bad tool schema (properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Search for cats"}],
                "tools": [BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code == 200:
                # Passing means the endpoint (or its middleware) tolerated
                # the malformed schema instead of rejecting the request.
                return TestResult(label, True, "Endpoint accepted/fixed bad schema", elapsed)
            err = ""
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(label, False, f"HTTP {resp.status_code}: {err}", elapsed)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
def test_nested_bad_schema(cfg: ModelConfig) -> TestResult:
    """8. Nested properties=[] inside items (the Tool 21 bug)."""
    label = "nested bad schema (items.properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Send a message to Bob"}],
                "tools": [NESTED_BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code == 200:
                return TestResult(label, True, "Endpoint accepted/fixed nested bad schema", elapsed)
            err = ""
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(label, False, f"HTTP {resp.status_code}: {err}", elapsed)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
def test_streaming_tool_chunks(cfg: ModelConfig) -> TestResult:
    """9. Streaming tool call chunking — are args actually streamed in multiple chunks?

    Uses WRITE_FILE_TOOL (long string argument) so a genuinely streaming
    endpoint has to emit the arguments across several deltas; counting the
    argument fragments distinguishes streaming from server-side buffering.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{
                    "role": "user",
                    "content": "Write a Python hello world and save it using the write_file tool."
                }],
                "tools": [WRITE_FILE_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 1024,
            }) as r:
                if r.status_code != 200:
                    dur = time.time() - start
                    return TestResult("streaming tool chunking", False, f"HTTP {r.status_code}", dur)
                tool_name = None
                arg_chunks = 0          # how many deltas carried argument text
                accumulated_args = ""
                content_chunks = 0
                reasoning_chunks = 0
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            for tc in (delta.get("tool_calls") or []):
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    arg_chunks += 1
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_chunks += 1
                            if delta.get("reasoning"):
                                reasoning_chunks += 1
                        except json.JSONDecodeError:
                            pass  # ignore malformed SSE payloads
                dur = time.time() - start
                if not tool_name:
                    if content_chunks > 0 or reasoning_chunks > 0:
                        return TestResult("streaming tool chunking", False,
                                          f"No tool call — model produced {content_chunks} content + {reasoning_chunks} reasoning chunks", dur)
                    return TestResult("streaming tool chunking", False, "No tool call and no content", dur)
                # Evaluate chunking quality: >1 chunk = streamed; 1 big chunk =
                # buffered; 1 small chunk = inconclusive (args may be too short).
                if arg_chunks > 1:
                    return TestResult("streaming tool chunking", True,
                                      f"Args streamed in {arg_chunks} chunks ({len(accumulated_args)} chars)", dur)
                elif arg_chunks == 1 and len(accumulated_args) > 500:
                    return TestResult("streaming tool chunking", False,
                                      f"Args in 1 chunk but {len(accumulated_args)} chars — buffered, not streamed", dur)
                elif arg_chunks == 1:
                    return TestResult("streaming tool chunking", True,
                                      f"Args in 1 chunk ({len(accumulated_args)} chars — may be too short to stream)", dur)
                else:
                    return TestResult("streaming tool chunking", False, "Tool name only, no arg chunks", dur)
        except Exception as e:
            return TestResult("streaming tool chunking", False, f"Exception: {e}", time.time() - start)
def test_param_sweep(cfg: ModelConfig) -> list[TestResult]:
    """10. Parameter sweep — which vLLM params does the endpoint accept?

    Sends the base request plus one extra parameter at a time and records
    whether the endpoint returns HTTP 200.  Entries whose name contains '+'
    bundle several top-level params (their dict value is spread into the
    request); every other entry is sent under its own key.
    """
    results = []
    base_req = {
        "model": cfg.model,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
    ]
    with make_client(cfg) as c:
        for name, val in extra_params:
            start = time.time()
            try:
                # BUGFIX: previously ANY dict value was spread into the request,
                # so chat_template_kwargs and response_format were sent as bare
                # top-level keys (e.g. {"enable_thinking": false}) and the named
                # parameter itself was never probed.  Only combined '+' entries
                # are meant to be spread.
                if "+" in name and isinstance(val, dict):
                    req = {**base_req, **val}
                else:
                    req = {**base_req, name: val}
                r = c.post(f"{cfg.api_base}/chat/completions", json=req)
                dur = time.time() - start
                ok = r.status_code == 200
                detail = f"HTTP {r.status_code}"
                if not ok:
                    try:
                        detail += f": {r.json().get('error', {}).get('message', '')[:80]}"
                    except Exception:
                        pass  # body not JSON; keep the bare status code
                results.append(TestResult(f"param: {name}", ok, detail, dur))
            except Exception as e:
                results.append(TestResult(f"param: {name}", False, f"Exception: {e}", time.time() - start))
    return results
# ── Suite runner ─────────────────────────────────────────────
def _tool_response_flow_nonstream(cfg: ModelConfig) -> TestResult:
    """5. Full tool call → response → follow-up flow."""
    return test_tool_response_flow(cfg, streaming=False)

def _tool_response_flow_stream(cfg: ModelConfig) -> TestResult:
    """6. Full tool call → response → follow-up flow (stream)."""
    return test_tool_response_flow(cfg, streaming=True)

# Ordered list of single-result tests run by run_suite().  Named wrappers
# replace the previous lambdas: lambdas carry no docstring, so run_suite's
# label lookup printed "<lambda>" for both flow variants.
ALL_TESTS = [
    test_basic_nonstream,
    test_basic_stream,
    test_toolcall_nonstream,
    test_toolcall_stream,
    _tool_response_flow_nonstream,
    _tool_response_flow_stream,
    test_bad_tool_schema,
    test_nested_bad_schema,
    test_streaming_tool_chunks,
]
def run_suite(cfg: ModelConfig, verbose: bool = True) -> SuiteResult:
    """Run the full test suite against one model config.

    Prints progress when *verbose*; returns the aggregated SuiteResult.
    NOTE: the per-result pass/fail marker literals were empty strings
    (glyphs lost to an encoding bug); restored to the suite's ✓/✗ style.
    """
    result = SuiteResult(model=cfg.model)
    print(f"\n{'='*60}")
    print(f"Testing: {cfg.model}")
    print(f"API: {cfg.api_base}")
    print(f"{'='*60}")
    for test_fn in ALL_TESTS:
        # Label the test by the first docstring line, falling back to its name.
        name = (test_fn.__doc__ or "").strip().split("\n")[0] or test_fn.__name__
        if verbose:
            print(f"\n[{ts()}] Running: {name}...")
        tr = test_fn(cfg)
        # Defensive: a test may return one TestResult or a list of them.
        batch = tr if isinstance(tr, list) else [tr]
        result.results.extend(batch)
        if verbose:
            for r in batch:
                mark = "✓" if r.passed else "✗"
                print(f" {mark} {r.name}: {r.detail} ({r.duration_s:.1f}s)")
    # Param sweep runs separately — it returns one result per probed parameter.
    if verbose:
        print(f"\n[{ts()}] Running: parameter sweep...")
    sweep_results = test_param_sweep(cfg)
    result.results.extend(sweep_results)
    if verbose:
        for r in sweep_results:
            mark = "✓" if r.passed else "✗"
            print(f" {mark} {r.name}: {r.detail} ({r.duration_s:.1f}s)")
    return result
def print_summary(results: list[SuiteResult]):
    """Print a final summary across all models.

    Per-model pass counts, failing-test details, then a cross-model table
    for the key (non-sweep) tests.  The status glyphs and rule lines were
    empty strings (characters lost to an encoding bug); restored here with
    ✓/✗ markers and '-' rules.
    """
    print(f"\n\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    for sr in results:
        passed = sr.passed
        total = sr.total
        pct = (passed / total * 100) if total else 0
        label = sr.model.split("/")[-1]
        print(f"\n {label}: {passed}/{total} passed ({pct:.0f}%)")
        for r in sr.results:
            if not r.passed:
                # One line per failing test with a short detail excerpt.
                print(f"   ✗ {r.name}: {r.detail[:80]}")
    # Cross-model comparison for key tests
    print(f"\n{'-'*60}")
    print("CROSS-MODEL COMPARISON")
    print(f"{'-'*60}")
    key_tests = [
        "basic non-stream",
        "basic stream",
        "tool call non-stream",
        "tool call stream",
        "tool response flow",
        "tool response flow (stream)",
        "streaming tool chunking",
        "bad tool schema (properties=[])",
        "nested bad schema (items.properties=[])",
    ]
    # Calculate column width from the (truncated) model labels.
    labels = [sr.model.split("/")[-1][:18] for sr in results]
    col_w = max(len(l) for l in labels) if labels else 16
    col_w = max(col_w, 16)
    header = f"{'Test':<40}"
    for l in labels:
        header += f" {l:>{col_w}}"
    print(header)
    print("-" * len(header))
    for test_name in key_tests:
        row = f"{test_name:<40}"
        for sr in results:
            match = [r for r in sr.results if r.name == test_name]
            if match:
                status = "✓" if match[0].passed else "✗"
            else:
                status = "-"  # test not recorded for this model
            row += f" {status:>{col_w}}"
        print(row)
    print(f"\n{'='*60}")
# ── CLI ──────────────────────────────────────────────────────
def main():
    """CLI entry point: select model configs, run suites, exit 1 on any failure."""
    parser = argparse.ArgumentParser(description="Universal model tool-call test suite")
    parser.add_argument("--all", action="store_true", help="Test all models from models.env")
    parser.add_argument("--model", type=int, help="Test model by 1-based index from models.env")
    parser.add_argument("--filter", type=str, help="Test models matching substring")
    parser.add_argument("--quiet", action="store_true", help="Less output per test")
    args = parser.parse_args()
    models_path = Path(__file__).parent / "models.env"

    def require_models_env() -> list[ModelConfig]:
        # Shared guard: every file-based selection mode needs models.env.
        if not models_path.exists():
            print("ERROR: models.env not found")
            sys.exit(1)
        return load_models_env(models_path)

    configs: list[ModelConfig] = []
    if args.all:
        configs = require_models_env()
    elif args.model is not None:
        # BUGFIX: this was 'elif args.model:', which treated '--model 0' as
        # "not given" and silently fell through to the env-var path instead
        # of reporting the index as out of range.
        all_configs = require_models_env()
        if args.model < 1 or args.model > len(all_configs):
            print(f"ERROR: --model index {args.model} out of range (1-{len(all_configs)})")
            sys.exit(1)
        configs = [all_configs[args.model - 1]]
    elif args.filter:
        all_configs = require_models_env()
        configs = [c for c in all_configs if args.filter.lower() in c.model.lower()]
        if not configs:
            print(f"No models matching '{args.filter}'")
            sys.exit(1)
    else:
        cfg = config_from_env()
        if cfg:
            configs = [cfg]
        else:
            print("No model specified. Use --all, --model N, --filter NAME, or set TOOLTEST_* env vars.")
            if models_path.exists():
                print("\nAvailable models from models.env:")
                for i, c in enumerate(load_models_env(models_path), 1):
                    print(f" {i}. {c.model} @ {c.api_base}")
            sys.exit(1)
    all_results: list[SuiteResult] = []
    for cfg in configs:
        sr = run_suite(cfg, verbose=not args.quiet)
        all_results.append(sr)
    print_summary(all_results)
    # Non-zero exit when any model had failures (CI-friendly).
    if any(sr.passed < sr.total for sr in all_results):
        sys.exit(1)

if __name__ == "__main__":
    main()

View File

@@ -1,19 +1,14 @@
#!/usr/bin/env bash
# Thin wrapper around run_suite.py — forwards all CLI arguments.
# (The diff view had merged the removed script — old shebang, stale VLLM_*
# exports, and a call to the deleted test_streaming_tool_calls.py — with the
# new one; this is the coherent post-change script.)
#
# Usage:
#   ./run_tests.sh                    # Test all models from models.env
#   ./run_tests.sh --model 1          # Test model #1
#   ./run_tests.sh --filter Devstral  # Test matching models
#   ./run_tests.sh --all              # Same as no args
#   ./run_tests.sh --quiet            # Less output
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
python3 -u run_suite.py "$@"

View File

@@ -1,546 +0,0 @@
#!/usr/bin/env python3
"""
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
These tests send EXACTLY what OpenClaw would send to vLLM — including
chat_template_kwargs, logprobs, weird tool schemas, the works.
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
Architecture: this test → middleware (strips bad params) → SGLang
"""
import os
import time
import json
import httpx
from datetime import datetime
from pathlib import Path
# Load .env if present (don't hardcode keys)
_env_file = Path(__file__).parent / ".env"
if _env_file.exists():
    for line in _env_file.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        # setdefault: real environment variables win over .env values.
        os.environ.setdefault(k.strip(), v.strip())
# Endpoint configuration, overridable via DEVSTRAL_* environment variables.
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")
# Accumulated {"name", "pass", "detail"} dicts, one per record() call.
RESULTS = []
def ts():
    """Current wall-clock time as HH:MM:SS.mmm for log prefixes."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
def record(name, ok, detail=""):
status = "✓ PASS" if ok else "✗ FAIL"
print(f"\n{status}: {name}")
if detail:
print(f" {detail}")
RESULTS.append({"name": name, "pass": ok, "detail": detail})
def make_client():
    """Shared httpx client: bearer auth + JSON content type, 120 s timeout."""
    auth_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=auth_headers)
# ── 1. Basic non-streaming chat ──────────────────────────────
def test_basic_nonstream():
    """Basic non-streaming chat: one request, record pass/fail via record()."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
            "max_tokens": 32,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # NOTE(review): assumes choices[0].message.content exists — raises on
        # reasoning-only replies; the replacement suite handles that case.
        content = body["choices"][0]["message"]["content"]
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
# ── 2. Basic streaming chat ──────────────────────────────────
def test_basic_stream():
    """Basic streaming chat: accumulate SSE content deltas and record result."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Count from 1 to 5."}],
            "stream": True,
            "max_tokens": 64,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            full = ""
            for line in r.iter_lines():
                # Skip keep-alives and the SSE terminator.
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("content"):
                            full += delta["content"]
                    except json.JSONDecodeError:
                        pass  # ignore malformed SSE payloads
            print(f"[{ts()}] Reply: {full[:100]}")
            # NOTE(review): records PASS even when no content arrived.
            record("basic stream", True, f"Got: {full[:80]}")
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
def test_toolcall_nonstream():
    """Non-streaming tool call with a well-formed vLLM-style tool schema."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            # Model answered in prose instead of calling the tool.
            content = msg.get("content", "")
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
# ── 4. Tool call — streaming ────────────────────────────────
def test_toolcall_stream():
    """Streaming tool call: reassemble name + argument fragments from deltas."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            tool_name = None
            accumulated_args = ""
            content_parts = ""
            for line in r.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("tool_calls"):
                            # Name arrives once; arguments arrive as fragments.
                            for tc in delta["tool_calls"]:
                                if tc.get("function", {}).get("name"):
                                    tool_name = tc["function"]["name"]
                                if tc.get("function", {}).get("arguments"):
                                    accumulated_args += tc["function"]["arguments"]
                        if delta.get("content"):
                            content_parts += delta["content"]
                    except json.JSONDecodeError:
                        pass  # ignore malformed SSE payloads
            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
                record("tool call stream", False, "Model did not call the tool")
# ── 5. Full tool response flow (non-streaming) ──────────────
def test_tool_response_flow():
    """Two-step flow: tool call → fabricated tool result → final answer.

    Passes only when the final answer contains "22" (the fabricated
    temperature), i.e. the model actually used the tool result.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return
        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")
        # Replay the assistant message, then append the fabricated tool result.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })
        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return
        final = body2["choices"][0]["message"].get("content", "")
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        # NOTE(review): the separator between the yes/no verdict and the
        # excerpt appears to have been lost in extraction.
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'}{final[:100]}")
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")
    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    # Params that OpenClaw/vLLM might send — some SGLang rejects.
    # Each entry is (label, {param: value, ...}) so combined cases like
    # logprobs+top_logprobs are sent as real request fields instead of
    # one bogus "logprobs+top_logprobs" key.
    extra_params = [
        ("chat_template_kwargs", {"chat_template_kwargs": {"enable_thinking": False}}),
        ("guided_json", {"guided_json": None}),
        ("guided_regex", {"guided_regex": None}),
        ("response_format", {"response_format": {"type": "json_object"}}),
        ("n", {"n": 1}),
        ("presence_penalty", {"presence_penalty": 0.0}),
        ("frequency_penalty", {"frequency_penalty": 0.0}),
        ("top_p", {"top_p": 1.0}),
        ("temperature", {"temperature": 0.7}),
        ("seed", {"seed": 42}),
        ("stop", {"stop": ["\n"]}),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
        ("top_logprobs", {"top_logprobs": 5}),
    ]
    with make_client() as c:
        # baseline request with no extras — should always succeed
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")
        for name, params in extra_params:
            req = {**base_req, **params}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={params!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={params!r}: {detail}")
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")
    # Exact shape OpenClaw emits for a tool with no parameters:
    # 'properties' is a list, which the backend rejects with a 400
    # unless the middleware rewrites it to an object.
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            },
        },
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Search for cats"}],
        "tools": [bad_tool],
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code == 200:
            print(f"[{ts()}] Success — middleware fixed the bad schema")
            record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
        else:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
# ── 8. Nested properties=[] in tool schema (Tool 21 bug) ────
def test_nested_bad_properties():
    """
    Reproduce the exact Tool 21 400 error:
        schema['properties']['fields']['items']['properties'] = []
    This happens when a tool has an array-of-objects parameter where
    the items' properties field is [] instead of {}. The middleware
    must recurse into the schema to fix ALL properties fields.
    """
    header = "=" * 60
    print(f"\n{header}")
    print(f"[{ts()}] TEST: Nested properties=[] in tool schema (Tool 21 bug)")
    print(header)
    # The bad [] sits two levels down (inside array items), so a shallow
    # fix of only the top-level parameters.properties would miss it.
    nested_schema = {
        "type": "object",
        "properties": {
            "fields": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": []  # <-- THIS causes the 400
                }
            }
        }
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "message",
            "description": "Send a message",
            "parameters": nested_schema,
        },
    }]
    request_body = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Send a message to Bob"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=request_body)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:500]}")
            record("nested bad properties", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed nested properties=[] to {{}}")
        record("nested bad properties", True, "Middleware fixed nested properties.properties=[] to {}")
# ── 9. OpenClaw full payload (chat_template_kwargs + tools) ─
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # safe_message() tolerates an empty 'choices' array; reasoning models
        # can also return content=None, so coerce before slicing to avoid a
        # TypeError on the print below.
        msg = safe_message(body)
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
# ── Main ─────────────────────────────────────────────────────
def main():
    """Run every test in order, then print a pass/fail summary of RESULTS."""
    print(f"\n{'='*60}")
    # Banner kept model-agnostic: this suite runs against any
    # OpenAI-compatible endpoint (see module docstring), not just Devstral.
    print(f"Universal Model Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")
    test_basic_nonstream()
    test_basic_stream()
    test_toolcall_nonstream()
    test_toolcall_stream()
    test_tool_response_flow()
    test_param_sweep()
    test_openclaw_tool_schema()
    test_nested_bad_properties()
    test_openclaw_full_payload()
    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        # visible pass/fail marker per line, matching the per-test output style
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")
# Script entry point: `python3 run_suite.py` (see module docstring for env vars).
if __name__ == "__main__":
    main()

View File

@@ -1,395 +0,0 @@
#!/usr/bin/env python3
"""
Test suite for vLLM GLM-5.1 streaming tool calls.
Reproduces the issue where long string parameters in tool calls
are buffered entirely before being emitted during streaming.
"""
import os
import time
import json
import httpx
from datetime import datetime
# Configuration - will be set via environment or direct assignment
# (override via VLLM_API_BASE / VLLM_API_KEY / VLLM_MODEL; defaults
# point at the dev test endpoint)
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
API_KEY = os.environ.get("VLLM_API_KEY", "none")  # sent verbatim as the Bearer token below
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
def timestamp():
    """Current wall-clock time as HH:MM:SS.mmm (millisecond precision)."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
def test_streaming_tool_call_with_code():
    """
    Test streaming a tool call with a long string parameter.
    This prompts the model to generate code via a tool call,
    which should stream incrementally if the patch works correctly.

    Returns a metrics dict: chunk counts, accumulated argument length,
    and total wall-clock time.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file. Use this to save code, text, or other content.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {
                            "type": "string",
                            "description": "Name of the file to write"
                        },
                        "content": {
                            "type": "string",
                            "description": "The content to write to the file"
                        }
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]
    messages = [
        {
            "role": "user",
            "content": "Write a Python implementation of a binary search tree with insert, search, and delete methods. Include docstrings and type hints. Save it to bst.py using the write_file tool."
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with long string parameter")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")
    # Track streaming events
    chunks_received = []
    first_chunk_time = None
    last_chunk_time = None  # tracked but not reported in the summary
    tool_call_chunks = []
    accumulated_content = ""
    start_time = time.time()
    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 4096,
                # NOTE(review): extra params below mirror what clients send;
                # presumably this endpoint tolerates them — verify.
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            print(f"[{timestamp()}] Response status: {response.status_code}")
            # SSE framing: each event line starts with "data: "; the stream
            # terminates with the literal "data: [DONE]" sentinel.
            for line in response.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    chunk_data = line[6:]
                    try:
                        chunk = json.loads(chunk_data)
                        if first_chunk_time is None:
                            first_chunk_time = time.time()
                            print(f"\n[{timestamp()}] FIRST CHUNK RECEIVED ({first_chunk_time - start_time:.3f}s)")
                        last_chunk_time = time.time()
                        chunks_received.append(chunk)
                        # Extract delta content
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})
                            # Check for tool calls in delta
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    tc_index = tc.get("index", 0)  # index within delta (unused here)
                                    tc_function = tc.get("function", {})
                                    if tc_function.get("name"):
                                        print(f"\n[{timestamp()}] Tool call name: {tc_function['name']}")
                                    if tc_function.get("arguments"):
                                        args_chunk = tc_function["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk
                                        # Print progress every ~500 chars
                                        if len(accumulated_content) % 500 < len(args_chunk):
                                            print(f"[{timestamp()}] Accumulated {len(accumulated_content)} chars...")
                            # Regular content
                            if delta.get("content"):
                                print(f"[{timestamp()}] Content chunk: {delta['content'][:50]}...")
                    except json.JSONDecodeError as e:
                        print(f"[{timestamp()}] JSON decode error: {e}")
    end_time = time.time()
    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total chunks received: {len(chunks_received)}")
    print(f"Total time: {end_time - start_time:.3f}s")
    if first_chunk_time:
        print(f"Time to first chunk: {first_chunk_time - start_time:.3f}s")
    if tool_call_chunks:
        print(f"Tool call chunks: {len(tool_call_chunks)}")
        print(f"Total tool call content: {len(accumulated_content)} chars")
        # Try to parse the accumulated arguments
        print(f"\nAttempting to parse tool call arguments...")
        try:
            args = json.loads(accumulated_content)
            print(f"Successfully parsed!")
            print(f" - filename: {args.get('filename', 'N/A')}")
            print(f" - content length: {len(args.get('content', ''))} chars")
        except json.JSONDecodeError as e:
            print(f"Failed to parse: {e}")
            print(f"Raw accumulated content (first 500 chars):\n{accumulated_content[:500]}")
    # Verdict: >1 argument chunk means true incremental streaming; a single
    # large chunk means the server buffered the whole argument string.
    print(f"\n{'='*60}")
    if len(tool_call_chunks) > 1:
        print("✓ PASS: Tool call arguments arrived in multiple chunks")
        print(f" Chunks: {len(tool_call_chunks)}, indicating incremental streaming")
    elif len(tool_call_chunks) == 1 and len(accumulated_content) > 1000:
        print("✗ FAIL: Tool call arguments arrived in a single chunk")
        print(" This indicates buffering, not true streaming")
    else:
        print("? INCONCLUSIVE: Not enough data or no tool call occurred")
    print(f"{'='*60}\n")
    return {
        "chunks_received": len(chunks_received),
        "tool_call_chunks": len(tool_call_chunks),
        "accumulated_length": len(accumulated_content),
        "total_time": end_time - start_time
    }
def test_streaming_tool_call_with_json():
    """
    Test streaming a tool call that returns structured JSON data.

    Prompts for a large nested config object so the 'config' argument is
    long enough to reveal whether argument deltas stream incrementally.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "save_config",
                "description": "Save a configuration object",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "config": {
                            "type": "object",
                            "description": "Configuration object with many fields"
                        }
                    },
                    "required": ["config"]
                }
            }
        }
    ]
    messages = [
        {
            "role": "user",
            "content": "Create a detailed configuration for a web server with the following sections: server (host, port, ssl), logging (level, format, outputs), cache (enabled, ttl, max_size), rate_limiting (enabled, requests_per_minute, burst), cors (enabled, origins, methods, headers), security (headers, csp, hsts). Use the save_config tool."
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with nested JSON")
    print(f"{'='*60}\n")
    tool_call_chunks = []
    accumulated_content = ""
    start_time = time.time()
    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 2048,
                # NOTE(review): extra client-style params; presumably
                # tolerated by this endpoint — verify.
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            # SSE framing: skip blanks and the "data: [DONE]" sentinel.
            for line in response.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    if tc.get("function", {}).get("arguments"):
                                        args_chunk = tc["function"]["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk
                                        print(f"[{timestamp()}] Chunk {len(tool_call_chunks)}: +{len(args_chunk)} chars (total: {len(accumulated_content)})")
                    except json.JSONDecodeError:
                        pass
    end_time = time.time()
    print(f"\n{'='*60}")
    print(f"Total chunks: {len(tool_call_chunks)}, Total content: {len(accumulated_content)} chars")
    print(f"Time: {end_time - start_time:.3f}s")
    # >1 chunk = incremental streaming; exactly 1 = server-side buffering.
    if len(tool_call_chunks) > 1:
        print("✓ PASS: Arguments streamed in multiple chunks")
    elif len(tool_call_chunks) == 1:
        print("✗ FAIL: Arguments arrived in single chunk (buffered)")
    else:
        print("? No tool call occurred")
    print(f"{'='*60}\n")
def test_non_streaming_tool_call():
    """
    Baseline test: non-streaming tool call for comparison.

    Issues the same kind of write_file tool request without streaming so
    the streaming tests have a latency/integrity reference point.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {"type": "string"},
                        "content": {"type": "string"}
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]
    messages = [
        {
            "role": "user",
            "content": "Write a simple Python hello world and save it using the write_file tool."
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Non-streaming tool call (baseline)")
    print(f"{'='*60}\n")
    start_time = time.time()
    with httpx.Client(timeout=120.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 1024,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
    end_time = time.time()
    print(f"Status: {response.status_code}")
    print(f"Time: {end_time - start_time:.3f}s")
    if result.get("choices"):
        message = result["choices"][0].get("message", {})
        if message.get("tool_calls"):
            for tc in message["tool_calls"]:
                print(f"Tool: {tc['function']['name']}")
                # Malformed arguments must not crash the baseline test —
                # report the parse failure and move on instead.
                try:
                    args = json.loads(tc["function"]["arguments"])
                except json.JSONDecodeError as e:
                    print(f"Arguments FAILED to parse: {e}")
                    continue
                print(f"Arguments parsed successfully")
                print(f" - filename: {args.get('filename')}")
                print(f" - content length: {len(args.get('content', ''))}")
        else:
            print("No tool call in response")
    print(f"{'='*60}\n")
def main():
    """Run the streaming tool-call test battery: baseline, JSON, long code."""
    banner = "=" * 60
    print("\n" + banner)
    print("vLLM GLM-5.1 Streaming Tool Call Tests")
    print(banner)
    # Check API connectivity
    print(f"\nChecking API at {API_BASE}...")
    try:
        with httpx.Client(timeout=10.0) as client:
            reply = client.get(f"{API_BASE.replace('/v1', '')}/health")
            print(f"Health check: {reply.status_code}")
    except Exception as e:
        print(f"Warning: Could not reach API - {e}")
    # Run tests
    print("\nRunning tests...\n")
    # Test 1: Non-streaming baseline
    test_non_streaming_tool_call()
    # Test 2: Streaming with nested JSON
    test_streaming_tool_call_with_json()
    # Test 3: Main test - streaming with long code
    _metrics = test_streaming_tool_call_with_code()
    print("\nAll tests complete.")
# Script entry point
if __name__ == "__main__":
    main()

View File

@@ -1,243 +0,0 @@
#!/usr/bin/env python3
"""
Focused test to diagnose GLM-5.1 tool response issue.
The issue: Model sees tool response as blank.
"""
import httpx
import json
# Hard-coded config: this focused repro script takes no env overrides.
API_BASE = "http://95.179.247.150/v1"
API_KEY = "whatever"  # sent as the Bearer token; presumably not validated — verify
MODEL = "HuggingFaceTB/SmolLM3-3B"
def test_simple_tool_response():
    """
    Minimal test: Send a tool response and see if the model can use it.

    Builds a canned 3-turn conversation (user -> assistant tool_call ->
    tool result) and checks whether the follow-up answer references the
    tool's "42" payload.
    """
    # Simulate a conversation where a tool was called
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]
    tools = [{
        "type": "function",
        "function": {
            "name": "test_func",
            "description": "A test function",
            "parameters": {"type": "object", "properties": {}}
        }
    }]
    print("=" * 60)
    print("Request messages:")
    print(json.dumps(messages, indent=2))
    print("=" * 60)
    with httpx.Client(timeout=60.0) as client:
        # Non-streaming to get full response
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
    print("\nFull response:")
    print(json.dumps(result, indent=2))
    if result.get("choices"):
        # Reasoning models can return content=None — coerce to "" so the
        # substring checks below cannot raise TypeError.
        content = result["choices"][0].get("message", {}).get("content") or ""
        print("\n" + "=" * 60)
        print("Model response content:")
        print(content)
        print("=" * 60)
        # Check if the tool result is referenced
        if "42" in content:
            print("\n✓ PASS: Model referenced the tool result (42)")
        else:
            print("\n✗ FAIL: Model did NOT reference the tool result (42)")
        # Check for signs the model didn't see the result
        if "don't have" in content.lower() or "cannot access" in content.lower():
            print("✗ Model indicates it cannot see tool result")
def test_without_tools_param():
    """
    Test what happens if we don't pass tools in the follow-up request.
    Some APIs need tools to be passed on every request.
    """
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]
    print("\n" + "=" * 60)
    print("Test WITHOUT tools param in follow-up")
    print("=" * 60)
    with httpx.Client(timeout=60.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                # No tools param
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
    if result.get("choices"):
        # content may be None for reasoning models — normalize before
        # slicing/substring-checking to avoid a TypeError.
        content = result["choices"][0].get("message", {}).get("content") or ""
        print("Model response:", content[:200])
        if "42" in content:
            print("✓ Model referenced the tool result")
def test_different_content_formats():
    """
    Test if the issue is with how content is formatted.

    Sends the same tool-result turn twice — once as a plain string and
    once as OpenAI-style content parts — and reports whether the model
    referenced the result in each case.
    """
    # Test 1: String content (standard)
    messages_string = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 4"
        }
    ]
    # Test 2: Content as array (OpenAI format)
    messages_array = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": [{"type": "text", "text": "The answer is 4"}]
        }
    ]
    tools = [{
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculator",
            "parameters": {"type": "object", "properties": {}}
        }
    }]
    print("\n" + "=" * 60)
    print("Test: String content vs Array content")
    print("=" * 60)
    with httpx.Client(timeout=60.0) as client:
        for name, msgs in [("String content", messages_string), ("Array content", messages_array)]:
            print(f"\n--- {name} ---")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": msgs,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 128,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )
            result = response.json()
            if result.get("choices"):
                # content may be None for reasoning models — normalize so
                # the substring check cannot raise TypeError.
                content = result["choices"][0].get("message", {}).get("content") or ""
                print(f"Response: {content[:150]}")
                if "4" in content:
                    print("✓ Referenced tool result")
                else:
                    print("✗ Did NOT reference tool result")
# Script entry point: run all three diagnosis tests in order.
if __name__ == "__main__":
    print("GLM-5.1 Tool Response Diagnosis")
    print("=" * 60)
    test_simple_tool_response()
    test_without_tools_param()
    test_different_content_formats()

View File

@@ -1,463 +0,0 @@
#!/usr/bin/env python3
"""
Test for tool call response handling in GLM-5.1.
Tests the multi-turn flow:
1. Send a prompt that triggers a tool call
2. Send back the tool result
3. Verify the model can see and use the tool response
This reproduces the issue where tool responses appear blank to the model.
"""
import os
import json
import httpx
from datetime import datetime
# Endpoint under test; override via VLLM_API_BASE / VLLM_API_KEY / VLLM_MODEL.
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
API_KEY = os.environ.get("VLLM_API_KEY", "none")  # sent verbatim as the Bearer token
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
def timestamp():
    """Millisecond-resolution HH:MM:SS.mmm string for log lines."""
    return f"{datetime.now():%H:%M:%S.%f}"[:-3]
def test_tool_call_response_flow(streaming: bool = True):
    """
    Test the full tool call -> response -> follow-up flow.
    This simulates:
    1. User asks for weather
    2. Model calls get_weather tool
    3. We send back the weather data
    4. Model should see and use that data

    Returns a dict: {"success": bool, ...} with either a "reason" (no tool
    call) or "issues" + "final_response" diagnostics.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g. 'New York, NY'"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]
    # Initial request that should trigger a tool call
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Tokyo right now?"
        }
    ]
    mode = "STREAMING" if streaming else "NON-STREAMING"
    print(f"\n{'='*60}")
    print(f"TEST: Tool call response flow ({mode})")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")
    with httpx.Client(timeout=120.0) as client:
        # Step 1: Send initial request, expect tool call
        print(f"[{timestamp()}] Step 1: Sending initial request...")
        if streaming:
            tool_calls = []
            tool_call_id = None
            tool_call_name = None
            accumulated_args = ""
            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            ) as response:
                print(f"[{timestamp()}] Response status: {response.status_code}")
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})
                                if delta.get("tool_calls"):
                                    # Reassemble the call from deltas: id and
                                    # name arrive once, arguments in pieces.
                                    for tc in delta["tool_calls"]:
                                        if tc.get("id"):
                                            tool_call_id = tc["id"]
                                        if tc.get("function", {}).get("name"):
                                            tool_call_name = tc["function"]["name"]
                                            print(f"[{timestamp()}] Tool call: {tool_call_name}")
                                        if tc.get("function", {}).get("arguments"):
                                            accumulated_args += tc["function"]["arguments"]
                                if delta.get("content"):
                                    print(f"[{timestamp()}] Content: {delta['content'][:100]}")
                        except json.JSONDecodeError as e:
                            print(f"[{timestamp()}] JSON error: {e}")
            if tool_call_name:
                tool_calls.append({
                    "id": tool_call_id or "call_0",
                    "type": "function",
                    "function": {
                        "name": tool_call_name,
                        "arguments": accumulated_args
                    }
                })
        else:
            # Non-streaming
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )
            result = response.json()
            print(f"[{timestamp()}] Response status: {response.status_code}")
            tool_calls = []
            if result.get("choices"):
                message = result["choices"][0].get("message", {})
                if message.get("tool_calls"):
                    tool_calls = message["tool_calls"]
                    for tc in tool_calls:
                        print(f"[{timestamp()}] Tool call: {tc['function']['name']}")
                        print(f"[{timestamp()}] Args: {tc['function']['arguments']}")
        # Check if we got a tool call
        if not tool_calls:
            print(f"\n[{timestamp()}] No tool call received - model didn't call the tool")
            return {"success": False, "reason": "no_tool_call"}
        # Step 2: Parse tool call and prepare response
        tc = tool_calls[0]
        tc_id = tc.get("id", "call_0")
        tc_name = tc["function"]["name"]
        # Streamed args can be empty or malformed — don't crash the test,
        # fall back to an empty arg dict and let the flow continue.
        try:
            tc_args = json.loads(tc["function"]["arguments"] or "{}")
        except json.JSONDecodeError as e:
            print(f"[{timestamp()}] WARNING: could not parse tool args: {e}")
            tc_args = {}
        print(f"\n[{timestamp()}] Step 2: Tool call received")
        print(f" Name: {tc_name}")
        print(f" Args: {tc_args}")
        # Simulate tool execution
        tool_result = {
            "location": tc_args.get("location", "Unknown"),
            "temperature": "22°C",
            "condition": "Partly cloudy",
            "humidity": "65%",
            "wind": "15 km/h NE"
        }
        # Step 3: Send the tool response back
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps(tool_result)
        })
        print(f"\n[{timestamp()}] Step 3: Sending tool response...")
        print(f" Tool call ID: {tc_id}")
        print(f" Tool result: {json.dumps(tool_result, indent=2)}")
        # Step 4: Get the model's follow-up response
        if streaming:
            final_response = ""
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (streaming)...")
            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": True,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            ) as response:
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})
                                if delta.get("content"):
                                    content = delta["content"]
                                    final_response += content
                                    print(f"[{timestamp()}] Content: {content}", end="", flush=True)
                        except json.JSONDecodeError:
                            pass
            print()  # newline after streaming output
        else:
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (non-streaming)...")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )
            result = response.json()
            final_response = ""
            if result.get("choices"):
                # Reasoning models may return content=None — coerce to ""
                # so the substring checks below cannot raise TypeError.
                final_response = result["choices"][0].get("message", {}).get("content") or ""
        print(f"\n[{timestamp()}] Final response:\n{final_response}")
        # Check if the model used the tool data
        success = True
        issues = []
        # The response should mention the weather data
        if "22" not in final_response and "22°C" not in final_response:
            issues.append("Temperature (22°C) not mentioned in response")
            success = False
        if "cloudy" not in final_response.lower() and "partly cloudy" not in final_response.lower():
            issues.append("Condition (Partly cloudy) not mentioned in response")
            success = False
        # Check for signs the model didn't see the data
        blank_indicators = [
            "i don't have",
            "i cannot access",
            "i'm unable to",
            "i am unable to",
            "don't have access",
            "don't have real-time",
            "cannot provide real-time"
        ]
        for indicator in blank_indicators:
            if indicator in final_response.lower():
                issues.append(f"Model seems unaware of tool result (found: '{indicator}')")
                success = False
                break
        print(f"\n{'='*60}")
        if success:
            print("✓ PASS: Model correctly used tool response data")
        else:
            print("✗ FAIL: Model did not use tool response correctly")
            for issue in issues:
                print(f" - {issue}")
        print(f"{'='*60}\n")
        return {
            "success": success,
            "issues": issues,
            "final_response": final_response
        }
def test_tool_response_with_debug_info():
    """
    Test with detailed logging to capture exactly what the model sees.

    Dumps the full messages array and both raw responses so a transcript
    of the request/response pair is available for debugging.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_time",
                "description": "Get the current time",
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": []
                }
            }
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Tool response with debug info (non-streaming)")
    print(f"{'='*60}\n")
    messages = [
        {"role": "user", "content": "What time is it?"}
    ]
    with httpx.Client(timeout=120.0) as client:
        # Get tool call
        print(f"[{timestamp()}] Sending initial request...")
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
        if not result.get("choices") or not result["choices"][0].get("message", {}).get("tool_calls"):
            print("No tool call - skipping test")
            return
        tool_call = result["choices"][0]["message"]["tool_calls"][0]
        tc_id = tool_call["id"]
        print(f"[{timestamp()}] Tool call: {tool_call['function']['name']}")
        print(f"[{timestamp()}] Tool call ID: {tc_id}")
        # Add tool response
        messages.append({
            "role": "assistant",
            "tool_calls": [tool_call]
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": "The current time is 3:45 PM on Thursday, April 9, 2026."
        })
        # Debug: print the full messages array we're about to send
        print(f"\n[{timestamp()}] Sending follow-up with these messages:")
        print(json.dumps(messages, indent=2))
        # Get follow-up
        response2 = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result2 = response2.json()
        print(f"\n[{timestamp()}] Full response:")
        print(json.dumps(result2, indent=2))
        if result2.get("choices"):
            # content may be None for reasoning models — normalize before
            # the substring check so it cannot raise TypeError.
            content = result2["choices"][0].get("message", {}).get("content") or ""
            print(f"\n[{timestamp()}] Model response content: {content}")
            # Check if time is mentioned
            if "3:45" in content or "3:45 PM" in content:
                print("\n✓ Model used the tool response (time mentioned)")
            else:
                print("\n✗ Model may not have seen the tool response (time not mentioned)")
def main():
    """Run the tool-response test battery: non-streaming, streaming, debug."""
    line = "=" * 60
    print("\n" + line)
    print("GLM-5.1 Tool Call Response Tests")
    print(line)
    # Non-streaming first (simpler to debug)
    print("\n--- Test 1: Non-streaming tool response flow ---")
    test_tool_call_response_flow(streaming=False)
    # Then the streaming variant of the same flow
    print("\n--- Test 2: Streaming tool response flow ---")
    test_tool_call_response_flow(streaming=True)
    # Finally the verbose transcript-dumping variant
    print("\n--- Test 3: Debug info test ---")
    test_tool_response_with_debug_info()
    print("\nAll tests complete.")
# Script entry point
if __name__ == "__main__":
    main()