consolidate to run_suite.py: single pluggable test suite, all models 84/84
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1 +1,3 @@
|
|||||||
.env
|
.env
|
||||||
|
models.env
|
||||||
|
__pycache__/
|
||||||
|
|||||||
815
run_suite.py
Normal file
815
run_suite.py
Normal file
@@ -0,0 +1,815 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Universal model tool-call test suite.
|
||||||
|
|
||||||
|
Tests any OpenAI-compatible endpoint for:
|
||||||
|
1. Basic chat (non-streaming + streaming)
|
||||||
|
2. Tool calls (non-streaming + streaming)
|
||||||
|
3. Multi-turn tool response flow (non-streaming + streaming)
|
||||||
|
4. Nested/bad tool schema handling (SGLang compatibility)
|
||||||
|
5. Streaming tool call chunking (are args actually streamed?)
|
||||||
|
6. Param sweep (what vLLM params does the endpoint accept?)
|
||||||
|
|
||||||
|
Handles reasoning models (content in 'reasoning' field, null 'content'),
|
||||||
|
different finish_reason values, and empty/tool_calls arrays gracefully.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
TOOLTEST_API_BASE=... TOOLTEST_API_KEY=... TOOLTEST_MODEL=... python3 run_suite.py
|
||||||
|
python3 run_suite.py --all
|
||||||
|
python3 run_suite.py --model 1
|
||||||
|
python3 run_suite.py --filter Devstral
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import httpx
|
||||||
|
import argparse
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def ts():
    """Wall-clock timestamp for log lines, formatted HH:MM:SS.mmm."""
    now = datetime.now()
    # %f gives microseconds; drop the last three digits to keep milliseconds.
    return now.strftime("%H:%M:%S.%f")[:-3]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_choice(body: dict, index: int = 0) -> dict:
    """Safely get a choice from a response body.

    Returns ``{}`` when ``choices`` is missing, null, or shorter than *index*.
    """
    choices = body.get("choices") or []
    return choices[index] if index < len(choices) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def safe_message(body: dict) -> dict:
    """Safely get the message dict from the first choice (``{}`` on any gap)."""
    choices = body.get("choices") or []
    first = choices[0] if choices else {}
    return first.get("message") or {}
|
||||||
|
|
||||||
|
|
||||||
|
def safe_delta(chunk: dict) -> dict:
    """Safely get the delta from the first choice of a streaming chunk."""
    for choice in chunk.get("choices") or []:
        # Only the first choice matters; a null delta becomes {}.
        return choice.get("delta") or {}
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_content(msg: dict) -> tuple[str, str]:
    """Normalize a message into ``(content, reasoning)``, mapping null/missing to ''.

    Reasoning models may put their text in a ``reasoning`` field and leave
    ``content`` null; both fields are always returned as strings.
    """
    return (msg.get("content") or "", msg.get("reasoning") or "")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Config ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
class ModelConfig:
    """One OpenAI-compatible endpoint + model to run the suite against."""

    api_base: str  # base URL, e.g. "https://host/v1"
    api_key: str   # bearer token for the endpoint
    model: str     # full model id, possibly "org/name"

    @property
    def label(self):
        """Short display name: everything after the final '/'."""
        return self.model.rsplit("/", 1)[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def load_models_env(path: Path) -> list[ModelConfig]:
    """Load models from the models.env file (pipe-delimited).

    Each usable line has the form ``api_base | api_key | model``; blank
    lines, ``#`` comments, and lines with fewer than three fields are skipped.
    """
    configs: list[ModelConfig] = []
    for raw in path.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue  # blank or comment line
        fields = [f.strip() for f in entry.split("|")]
        if len(fields) < 3:
            continue  # malformed line — not enough fields
        configs.append(ModelConfig(api_base=fields[0], api_key=fields[1], model=fields[2]))
    return configs
|
||||||
|
|
||||||
|
|
||||||
|
def config_from_env() -> ModelConfig | None:
    """Build a single config from TOOLTEST_* env vars, or None if any is unset."""
    base = os.environ.get("TOOLTEST_API_BASE")
    key = os.environ.get("TOOLTEST_API_KEY")
    model = os.environ.get("TOOLTEST_MODEL")
    if not (base and key and model):
        return None
    return ModelConfig(api_base=base, api_key=key, model=model)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Test result types ────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
class TestResult:
    """Outcome of a single test run against one endpoint."""

    name: str                # short test identifier used in summaries
    passed: bool             # True iff the test succeeded
    detail: str = ""         # human-readable pass/fail explanation
    duration_s: float = 0.0  # wall-clock runtime of the test, in seconds
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SuiteResult:
    """All per-test results collected for a single model."""

    model: str  # full model id the suite ran against
    results: list[TestResult] = field(default_factory=list)

    @property
    def passed(self):
        """Count of tests that passed."""
        return sum(r.passed for r in self.results)

    @property
    def total(self):
        """Total number of recorded tests."""
        return len(self.results)
|
||||||
|
|
||||||
|
|
||||||
|
def make_client(cfg: ModelConfig) -> httpx.Client:
    """Build an httpx client preconfigured with bearer auth + JSON headers.

    The generous 120 s timeout accommodates slow model endpoints.
    """
    headers = {
        "Authorization": f"Bearer {cfg.api_key}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=headers)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Shared tool definitions ──────────────────────────────────
|
||||||
|
|
||||||
|
# Well-formed tool: one required string parameter. Baseline for tool-call tests.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
            },
            "required": ["location"]
        }
    }
}

# Well-formed tool with two required string parameters; the long "content"
# argument is used to check whether streaming endpoints actually chunk the
# tool-call arguments or buffer them into one piece.
WRITE_FILE_TOOL = {
    "type": "function",
    "function": {
        "name": "write_file",
        "description": "Write content to a file.",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {"type": "string", "description": "Name of the file"},
                "content": {"type": "string", "description": "The content to write"}
            },
            "required": ["filename", "content"]
        }
    }
}

# Deliberately malformed: JSON Schema requires "properties" to be an object,
# but some clients emit an empty array. Exercises endpoint-side schema
# validation / middleware fixup.
BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Search the web",
        "parameters": {
            "type": "object",
            "properties": []  # Invalid — should be {}
        }
    }
}

# Same malformation, but buried inside an array item schema — harder for
# naive fixups to catch.
NESTED_BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "message",
        "description": "Send a message",
        "parameters": {
            "type": "object",
            "properties": {
                "fields": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": []  # Invalid — should be {}
                    }
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Test functions ───────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_basic_nonstream(cfg: ModelConfig) -> TestResult:
    """1. Basic non-streaming chat."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Say hello in one word."}],
                "stream": False,
                "max_tokens": 64,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code != 200:
                return TestResult("basic non-stream", False, f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", dur)
            content, reasoning = extract_content(safe_message(body))
            fr = safe_choice(body).get("finish_reason", "?")
            # Visible content is a pass; reasoning-only output also counts
            # (reasoning models may leave 'content' null).
            if content:
                return TestResult("basic non-stream", True, f"Got: {content[:80]}", dur)
            if reasoning:
                return TestResult("basic non-stream", True, f"Reasoning-only (finish: {fr}): {reasoning[:80]}", dur)
            return TestResult("basic non-stream", False, f"Empty response (finish: {fr})", dur)
        except Exception as e:
            return TestResult("basic non-stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_basic_stream(cfg: ModelConfig) -> TestResult:
    """2. Basic streaming chat."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            with client.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Count from 1 to 5."}],
                "stream": True,
                "max_tokens": 64,
            }) as resp:
                if resp.status_code != 200:
                    raw = "".join(resp.iter_lines())
                    dur = time.time() - start
                    return TestResult("basic stream", False, f"HTTP {resp.status_code}: {raw[:200]}", dur)
                text = ""
                thought = ""
                # Accumulate SSE deltas; malformed chunks are skipped.
                for line in resp.iter_lines():
                    if not line or line == "data: [DONE]" or not line.startswith("data: "):
                        continue
                    try:
                        delta = safe_delta(json.loads(line[6:]))
                    except json.JSONDecodeError:
                        continue
                    text += delta.get("content") or ""
                    thought += delta.get("reasoning") or ""
                dur = time.time() - start
                if text:
                    return TestResult("basic stream", True, f"Got: {text[:80]}", dur)
                if thought:
                    return TestResult("basic stream", True, f"Reasoning-only: {thought[:80]}", dur)
                return TestResult("basic stream", False, "No content or reasoning received", dur)
        except Exception as e:
            return TestResult("basic stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_toolcall_nonstream(cfg: ModelConfig) -> TestResult:
    """3. Tool call — non-streaming."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code != 200:
                return TestResult("tool call non-stream", False, f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", dur)
            msg = safe_message(body)
            calls = msg.get("tool_calls") or []
            if not calls:
                # Model answered in prose instead of calling the tool.
                content, reasoning = extract_content(msg)
                shown = content or reasoning or "(empty)"
                return TestResult("tool call non-stream", False, f"No tool call. Response: {shown[:100]}", dur)
            fn = calls[0].get("function", {})
            return TestResult("tool call non-stream", True,
                              f"Tool: {fn.get('name','?')}, args: {fn.get('arguments','')[:60]}", dur)
        except Exception as e:
            return TestResult("tool call non-stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_toolcall_stream(cfg: ModelConfig) -> TestResult:
    """4. Tool call — streaming.

    Streams the response, reassembling the tool name and arguments from
    the per-chunk deltas; passes iff a tool name was ever seen.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 256,
            }) as r:
                if r.status_code != 200:
                    body = "".join(r.iter_lines())
                    dur = time.time() - start
                    # FIX: the error body was read but never reported; include a
                    # snippet in the detail, consistent with test_basic_stream.
                    return TestResult("tool call stream", False, f"HTTP {r.status_code}: {body[:200]}", dur)
                tool_name = None
                accumulated_args = ""
                content_parts = ""
                reasoning_parts = ""
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            # Tool name may arrive once; arguments arrive in pieces.
                            tc_list = delta.get("tool_calls") or []
                            for tc in tc_list:
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_parts += delta["content"]
                            if delta.get("reasoning"):
                                reasoning_parts += delta["reasoning"]
                        except json.JSONDecodeError:
                            pass
                dur = time.time() - start
                if tool_name:
                    return TestResult("tool call stream", True,
                                      f"Tool: {tool_name}, args: {accumulated_args[:60]}", dur)
                else:
                    out = content_parts or reasoning_parts or "(empty)"
                    return TestResult("tool call stream", False, f"No tool call. Response: {out[:100]}", dur)
        except Exception as e:
            return TestResult("tool call stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tool_response_flow(cfg: ModelConfig, streaming: bool = False) -> TestResult:
    """5/6. Full tool call → response → follow-up flow.

    Three steps: (1) elicit a tool call (streamed or not, per *streaming*);
    (2) append a fabricated tool result (Tokyo, 22°C) to the conversation;
    (3) ask for a follow-up and check the model actually used the tool data
    (the string "22" appears and no "I can't access..." disclaimer does).
    """
    label = "tool response flow (stream)" if streaming else "tool response flow"
    with make_client(cfg) as c:
        start = time.time()
        try:
            messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

            # Step 1: Get tool call
            if not streaming:
                r = c.post(f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 256,
                })
                body = r.json()
                if r.status_code != 200:
                    return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                msg = safe_message(body)
            else:
                # Streamed variant: reassemble the assistant tool-call message
                # from the deltas so step 2 can replay it verbatim.
                tool_name = None
                tool_id = None
                accumulated_args = ""
                with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 256,
                }) as r:
                    if r.status_code != 200:
                        return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                    for line in r.iter_lines():
                        if not line or line == "data: [DONE]":
                            continue
                        if line.startswith("data: "):
                            try:
                                chunk = json.loads(line[6:])
                                delta = safe_delta(chunk)
                                for tc in (delta.get("tool_calls") or []):
                                    if tc.get("id"):
                                        tool_id = tc["id"]
                                    fn = tc.get("function") or {}
                                    if fn.get("name"):
                                        tool_name = fn["name"]
                                    if fn.get("arguments"):
                                        accumulated_args += fn["arguments"]
                            except json.JSONDecodeError:
                                pass
                if not tool_name:
                    return TestResult(label, False, "No tool call in step 1", time.time() - start)
                # Synthesize the assistant message the non-streaming path
                # would have received.
                msg = {
                    "role": "assistant",
                    "tool_calls": [{
                        "id": tool_id or "call_0",
                        "type": "function",
                        "function": {"name": tool_name, "arguments": accumulated_args}
                    }]
                }

            tool_calls = msg.get("tool_calls") or []
            if not tool_calls:
                return TestResult(label, False, "No tool call in step 1", time.time() - start)

            tc = tool_calls[0]
            tc_id = tc.get("id", "call_0")

            # Step 2: Send tool response
            messages.append(msg)
            messages.append({
                "role": "tool",
                "tool_call_id": tc_id,
                "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
            })

            # Step 3: Get follow-up
            r2 = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": messages,
                "tools": [WEATHER_TOOL],
                "stream": False,
                "max_tokens": 256,
            })
            body2 = r2.json()
            dur = time.time() - start
            if r2.status_code != 200:
                return TestResult(label, False, f"Step 3 HTTP {r2.status_code}", dur)

            final_msg = safe_message(body2)
            final_content, final_reasoning = extract_content(final_msg)
            final = final_content or final_reasoning or ""

            # Check the model actually used the tool data: "22" must appear,
            # and canned "no access to real-time data" disclaimers fail it.
            ok = "22" in final
            indicators = ["i don't have", "i cannot access", "don't have access", "cannot provide real-time"]
            for ind in indicators:
                if ind in final.lower():
                    ok = False
                    break
            if not final_content and final_reasoning:
                return TestResult(label, ok, f"Reasoning-only (used data: {'yes' if ok else 'no'}) — {final[:100]}", dur)
            return TestResult(label, ok, f"{'Used' if ok else 'Did NOT use'} tool result — {final[:100]}", dur)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bad_tool_schema(cfg: ModelConfig) -> TestResult:
    """7. OpenClaw-style tool with properties=[] (tests schema validation/middleware)."""
    title = "bad tool schema (properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Search for cats"}],
                "tools": [BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code == 200:
                # Pass: endpoint tolerated (or silently repaired) the schema.
                return TestResult(title, True, "Endpoint accepted/fixed bad schema", dur)
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(title, False, f"HTTP {resp.status_code}: {err}", dur)
        except Exception as e:
            return TestResult(title, False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_nested_bad_schema(cfg: ModelConfig) -> TestResult:
    """8. Nested properties=[] inside items (the Tool 21 bug)."""
    title = "nested bad schema (items.properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Send a message to Bob"}],
                "tools": [NESTED_BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code == 200:
                # Pass: endpoint tolerated (or silently repaired) the nesting.
                return TestResult(title, True, "Endpoint accepted/fixed nested bad schema", dur)
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(title, False, f"HTTP {resp.status_code}: {err}", dur)
        except Exception as e:
            return TestResult(title, False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_streaming_tool_chunks(cfg: ModelConfig) -> TestResult:
    """9. Streaming tool call chunking — are args actually streamed in multiple chunks?

    Asks for a write_file call (whose 'content' argument tends to be long)
    and counts how many delta chunks carry argument fragments. Multiple
    chunks = genuinely streamed; a single large chunk = buffered server-side.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{
                    "role": "user",
                    "content": "Write a Python hello world and save it using the write_file tool."
                }],
                "tools": [WRITE_FILE_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 1024,
            }) as r:
                if r.status_code != 200:
                    dur = time.time() - start
                    return TestResult("streaming tool chunking", False, f"HTTP {r.status_code}", dur)

                tool_name = None
                arg_chunks = 0          # number of deltas carrying argument text
                accumulated_args = ""
                content_chunks = 0      # plain-text deltas (model ignored the tool)
                reasoning_chunks = 0    # reasoning deltas (reasoning models)
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            for tc in (delta.get("tool_calls") or []):
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    arg_chunks += 1
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_chunks += 1
                            if delta.get("reasoning"):
                                reasoning_chunks += 1
                        except json.JSONDecodeError:
                            pass

                dur = time.time() - start
                if not tool_name:
                    if content_chunks > 0 or reasoning_chunks > 0:
                        return TestResult("streaming tool chunking", False,
                            f"No tool call — model produced {content_chunks} content + {reasoning_chunks} reasoning chunks", dur)
                    return TestResult("streaming tool chunking", False, "No tool call and no content", dur)

                # Evaluate chunking quality
                if arg_chunks > 1:
                    # Genuinely streamed.
                    return TestResult("streaming tool chunking", True,
                        f"Args streamed in {arg_chunks} chunks ({len(accumulated_args)} chars)", dur)
                elif arg_chunks == 1 and len(accumulated_args) > 500:
                    # One big chunk for a long payload: server buffered the call.
                    return TestResult("streaming tool chunking", False,
                        f"Args in 1 chunk but {len(accumulated_args)} chars — buffered, not streamed", dur)
                elif arg_chunks == 1:
                    # Short args can legitimately fit a single chunk.
                    return TestResult("streaming tool chunking", True,
                        f"Args in 1 chunk ({len(accumulated_args)} chars — may be too short to stream)", dur)
                else:
                    return TestResult("streaming tool chunking", False, "Tool name only, no arg chunks", dur)
        except Exception as e:
            return TestResult("streaming tool chunking", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_param_sweep(cfg: ModelConfig) -> list[TestResult]:
    """10. Parameter sweep — which vLLM params does the endpoint accept?

    Sends one minimal chat request per extra parameter and records whether
    the endpoint returns HTTP 200. Entries whose name contains '+' are a
    bundle of several request fields merged together (they must be sent in
    the same request to be meaningful, e.g. logprobs + top_logprobs).
    """
    results = []
    base_req = {
        "model": cfg.model,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
    ]

    with make_client(cfg) as c:
        for name, val in extra_params:
            start = time.time()
            try:
                # BUG FIX: previously ANY dict value was spread into the request
                # (req = {**base_req, **val}), so "chat_template_kwargs" and
                # "response_format" were never actually sent under their own
                # names — their contents leaked in as top-level fields instead.
                # Only "+"-named bundles are meant to be merged.
                if "+" in name and isinstance(val, dict):
                    req = {**base_req, **val}
                else:
                    req = {**base_req, name: val}
                r = c.post(f"{cfg.api_base}/chat/completions", json=req)
                dur = time.time() - start
                ok = r.status_code == 200
                detail = f"HTTP {r.status_code}"
                if not ok:
                    try:
                        detail += f": {r.json().get('error', {}).get('message', '')[:80]}"
                    except Exception:
                        pass  # non-JSON error body — keep the bare status
                results.append(TestResult(f"param: {name}", ok, detail, dur))
            except Exception as e:
                results.append(TestResult(f"param: {name}", False, f"Exception: {e}", time.time() - start))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
# ── Suite runner ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _tool_flow_nonstream(cfg):
    """5. Full tool call → response → follow-up flow (non-streaming)."""
    return test_tool_response_flow(cfg, streaming=False)


def _tool_flow_stream(cfg):
    """6. Full tool call → response → follow-up flow (streaming)."""
    return test_tool_response_flow(cfg, streaming=True)


# Ordered list of test callables run by run_suite(). run_suite labels each
# test by its docstring (falling back to __name__), so the parameterized
# flow variants use named, documented wrappers instead of lambdas — the
# previous lambdas had no docstring and printed as "<lambda>".
ALL_TESTS = [
    test_basic_nonstream,
    test_basic_stream,
    test_toolcall_nonstream,
    test_toolcall_stream,
    _tool_flow_nonstream,
    _tool_flow_stream,
    test_bad_tool_schema,
    test_nested_bad_schema,
    test_streaming_tool_chunks,
]
|
||||||
|
|
||||||
|
|
||||||
|
def run_suite(cfg: ModelConfig, verbose: bool = True) -> SuiteResult:
    """Run the full test suite against one model config.

    Executes every callable in ALL_TESTS in order, then the parameter
    sweep, collecting everything into one SuiteResult. With *verbose*,
    prints a per-test progress line and pass/fail detail.
    """
    result = SuiteResult(model=cfg.model)

    print(f"\n{'='*60}")
    print(f"Testing: {cfg.model}")
    print(f"API: {cfg.api_base}")
    print(f"{'='*60}")

    for test_fn in ALL_TESTS:
        # Label comes from the first docstring line; falls back to __name__
        # (lambdas in ALL_TESTS therefore display as "<lambda>").
        name = (test_fn.__doc__ or "").strip().split("\n")[0] or test_fn.__name__
        if verbose:
            print(f"\n[{ts()}] Running: {name}...")

        # A test may return a single TestResult or a list of them.
        tr = test_fn(cfg)
        if isinstance(tr, list):
            result.results.extend(tr)
        else:
            result.results.append(tr)

        if verbose:
            if isinstance(tr, list):
                for r in tr:
                    s = "✓" if r.passed else "✗"
                    print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)")
            else:
                s = "✓" if tr.passed else "✗"
                print(f" {s} {tr.name}: {tr.detail} ({tr.duration_s:.1f}s)")

    # Param sweep
    if verbose:
        print(f"\n[{ts()}] Running: parameter sweep...")
    sweep_results = test_param_sweep(cfg)
    result.results.extend(sweep_results)
    if verbose:
        for r in sweep_results:
            s = "✓" if r.passed else "✗"
            print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)")

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(results: list[SuiteResult]):
    """Print a final summary across all models.

    Two sections: a per-model pass count (with each failure listed), then a
    cross-model ✓/✗ comparison table over the key named tests.
    """
    print(f"\n\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")

    for sr in results:
        passed = sr.passed
        total = sr.total
        pct = (passed / total * 100) if total else 0
        label = sr.model.split("/")[-1]
        print(f"\n {label}: {passed}/{total} passed ({pct:.0f}%)")

        # List only the failures — passes are implied by the count.
        for r in sr.results:
            if not r.passed:
                print(f" ✗ {r.name}: {r.detail[:80]}")

    # Cross-model comparison for key tests
    print(f"\n{'─'*60}")
    print("CROSS-MODEL COMPARISON")
    print(f"{'─'*60}")
    # These names must match the TestResult.name strings the tests emit.
    key_tests = [
        "basic non-stream",
        "basic stream",
        "tool call non-stream",
        "tool call stream",
        "tool response flow",
        "tool response flow (stream)",
        "streaming tool chunking",
        "bad tool schema (properties=[])",
        "nested bad schema (items.properties=[])",
    ]

    # Calculate column width
    labels = [sr.model.split("/")[-1][:18] for sr in results]
    col_w = max(len(l) for l in labels) if labels else 16
    col_w = max(col_w, 16)  # floor so the table stays readable

    header = f"{'Test':<40}"
    for l in labels:
        header += f" {l:>{col_w}}"
    print(header)
    print("─" * len(header))

    for test_name in key_tests:
        row = f"{test_name:<40}"
        for sr in results:
            # First result with a matching name; "—" when the model has none.
            match = [r for r in sr.results if r.name == test_name]
            if match:
                status = "✓" if match[0].passed else "✗"
                row += f" {status:>{col_w}}"
            else:
                row += f" {'—':>{col_w}}"
        print(row)

    print(f"\n{'='*60}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── CLI ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: pick model configs, run the suite, exit non-zero on failure.

    Selection precedence: --all > --model N > --filter NAME > TOOLTEST_*
    environment variables. The first three read models.env next to this file.
    """
    parser = argparse.ArgumentParser(description="Universal model tool-call test suite")
    parser.add_argument("--all", action="store_true", help="Test all models from models.env")
    parser.add_argument("--model", type=int, help="Test model by 1-based index from models.env")
    parser.add_argument("--filter", type=str, help="Test models matching substring")
    parser.add_argument("--quiet", action="store_true", help="Less output per test")
    args = parser.parse_args()

    models_path = Path(__file__).parent / "models.env"

    def require_models_env():
        # Shared guard for every mode that needs models.env.
        if not models_path.exists():
            print("ERROR: models.env not found")
            sys.exit(1)

    configs: list[ModelConfig] = []

    if args.all:
        require_models_env()
        configs = load_models_env(models_path)
    elif args.model is not None:
        # BUG FIX: was `elif args.model:` — `--model 0` is falsy, so it fell
        # through to the env-var path instead of reporting an out-of-range index.
        require_models_env()
        all_configs = load_models_env(models_path)
        if args.model < 1 or args.model > len(all_configs):
            print(f"ERROR: --model index {args.model} out of range (1-{len(all_configs)})")
            sys.exit(1)
        configs = [all_configs[args.model - 1]]
    elif args.filter:
        require_models_env()
        all_configs = load_models_env(models_path)
        configs = [c for c in all_configs if args.filter.lower() in c.model.lower()]
        if not configs:
            print(f"No models matching '{args.filter}'")
            sys.exit(1)
    else:
        # No CLI selection — fall back to TOOLTEST_* environment variables.
        cfg = config_from_env()
        if cfg:
            configs = [cfg]
        else:
            print("No model specified. Use --all, --model N, --filter NAME, or set TOOLTEST_* env vars.")
            if models_path.exists():
                print("\nAvailable models from models.env:")
                for i, c in enumerate(load_models_env(models_path), 1):
                    print(f" {i}. {c.model} @ {c.api_base}")
            sys.exit(1)

    all_results: list[SuiteResult] = []
    for cfg in configs:
        sr = run_suite(cfg, verbose=not args.quiet)
        all_results.append(sr)

    print_summary(all_results)

    # Non-zero exit for CI when any test failed anywhere.
    if any(sr.passed < sr.total for sr in all_results):
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point — run the CLI when executed directly.
if __name__ == "__main__":
    main()
|
||||||
23
run_tests.sh
23
run_tests.sh
@@ -1,19 +1,14 @@
|
|||||||
#!/bin/bash
|
#!/usr/bin/env bash
|
||||||
# Run the streaming tool call tests
|
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
# Default values
|
# Usage:
|
||||||
export VLLM_API_BASE="${VLLM_API_BASE:-http://95.179.247.150/v1}"
|
# ./run_tests.sh # Test all models from models.env
|
||||||
export VLLM_API_KEY="${VLLM_API_KEY:-none}"
|
# ./run_tests.sh --model 1 # Test model #1
|
||||||
export VLLM_MODEL="${VLLM_MODEL:-HuggingFaceTB/SmolLM3-3B}"
|
# ./run_tests.sh --filter Devstral # Test matching models
|
||||||
|
# ./run_tests.sh --all # Same as no args
|
||||||
|
# ./run_tests.sh --quiet # Less output
|
||||||
|
|
||||||
echo "Configuration:"
|
cd "$SCRIPT_DIR"
|
||||||
echo " API_BASE: $VLLM_API_BASE"
|
python3 -u run_suite.py "$@"
|
||||||
echo " MODEL: $VLLM_MODEL"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Run the test
|
|
||||||
python3 "$SCRIPT_DIR/test_streaming_tool_calls.py"
|
|
||||||
|
|||||||
546
test_devstral.py
546
test_devstral.py
@@ -1,546 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
|
|
||||||
|
|
||||||
These tests send EXACTLY what OpenClaw would send to vLLM — including
|
|
||||||
chat_template_kwargs, logprobs, weird tool schemas, the works.
|
|
||||||
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
|
|
||||||
|
|
||||||
Architecture: this test → middleware (strips bad params) → SGLang
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import httpx
|
|
||||||
from datetime import datetime
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Load .env if present (don't hardcode keys)
|
|
||||||
_env_file = Path(__file__).parent / ".env"
|
|
||||||
if _env_file.exists():
|
|
||||||
for line in _env_file.read_text().splitlines():
|
|
||||||
line = line.strip()
|
|
||||||
if not line or line.startswith("#") or "=" not in line:
|
|
||||||
continue
|
|
||||||
k, v = line.split("=", 1)
|
|
||||||
os.environ.setdefault(k.strip(), v.strip())
|
|
||||||
|
|
||||||
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
|
|
||||||
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
|
|
||||||
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")
|
|
||||||
|
|
||||||
RESULTS = []
|
|
||||||
|
|
||||||
|
|
||||||
def ts():
    """Return the current wall-clock time as ``HH:MM:SS.mmm`` (millisecond precision)."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
|
|
||||||
|
|
||||||
|
|
||||||
def record(name, ok, detail=""):
    """Print a PASS/FAIL line for *name* and append the outcome to the global RESULTS list."""
    marker = "✓ PASS" if ok else "✗ FAIL"
    print(f"\n{marker}: {name}")
    if detail:
        print(f" {detail}")
    RESULTS.append({"name": name, "pass": ok, "detail": detail})
|
|
||||||
|
|
||||||
|
|
||||||
def make_client():
    """Build an ``httpx.Client`` preconfigured with bearer auth, JSON content type, and a 120s timeout."""
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=headers)
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1. Basic non-streaming chat ──────────────────────────────
|
|
||||||
|
|
||||||
def test_basic_nonstream():
    """Basic non-streaming chat: expect HTTP 200 with some assistant text.

    Fix: reasoning models can return ``"content": null`` (text lives in a
    separate 'reasoning' field), and some error paths return empty choices.
    The original indexed ``body["choices"][0]["message"]["content"]`` and
    sliced it directly, which raises IndexError/TypeError in those cases.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
            "max_tokens": 32,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        choices = body.get("choices") or []
        if not choices:
            record("basic non-stream", False, "Response had no choices")
            return
        # 'content' may be JSON null — coalesce before slicing.
        content = choices[0].get("message", {}).get("content") or ""
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 2. Basic streaming chat ──────────────────────────────────
|
|
||||||
|
|
||||||
def test_basic_stream():
    """Basic streaming chat over SSE: concatenate content deltas, expect some text.

    Passes whenever the stream returns HTTP 200 — the accumulated text is
    only reported, not asserted on.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Count from 1 to 5."}],
            "stream": True,
            "max_tokens": 64,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                # On error the body is not SSE — drain it as plain lines for the log.
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            full = ""
            for line in r.iter_lines():
                # Skip keep-alive blanks and the SSE terminator sentinel.
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        # Some servers emit chunks with no choices (e.g. usage-only frames).
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("content"):
                            full += delta["content"]
                    except json.JSONDecodeError:
                        # Tolerate malformed frames rather than aborting the stream.
                        pass
            print(f"[{ts()}] Reply: {full[:100]}")
            record("basic stream", True, f"Got: {full[:80]}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
|
|
||||||
|
|
||||||
def test_toolcall_nonstream():
    """Non-streaming tool call: the model should invoke get_weather for Tokyo.

    Fix: ``msg.get("content", "")`` still returns None when the response
    carries an explicit JSON null (common for reasoning models), so the
    subsequent ``content[:200]`` raised TypeError. Coalesce with ``or ""``.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            # 'content' may be JSON null — coalesce so slicing is safe.
            content = msg.get("content") or ""
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 4. Tool call — streaming ────────────────────────────────
|
|
||||||
|
|
||||||
def test_toolcall_stream():
    """Streaming tool call: accumulate name/argument deltas; pass iff a tool name was seen.

    The tool-call name typically arrives in the first delta and the JSON
    arguments arrive as string fragments across subsequent deltas.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                # Error bodies are plain JSON, not SSE — drain for the log.
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            tool_name = None                 # last tool-call name seen in any delta
            accumulated_args = ""            # concatenated argument fragments
            content_parts = ""               # plain-text fallback if no tool call happens
            for line in r.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("tool_calls"):
                            for tc in delta["tool_calls"]:
                                if tc.get("function", {}).get("name"):
                                    tool_name = tc["function"]["name"]
                                if tc.get("function", {}).get("arguments"):
                                    accumulated_args += tc["function"]["arguments"]
                        if delta.get("content"):
                            content_parts += delta["content"]
                    except json.JSONDecodeError:
                        # Skip malformed SSE frames instead of aborting.
                        pass

            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
                record("tool call stream", False, "Model did not call the tool")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 5. Full tool response flow (non-streaming) ──────────────
|
|
||||||
|
|
||||||
def test_tool_response_flow():
    """Two-turn tool flow: model calls get_weather, we feed back a result, model must use it.

    Step 1 asks for Tokyo weather; step 2 appends the assistant tool_call
    message plus a synthetic tool result (22°C) and checks "22" appears in
    the final reply.

    Fix: ``.get("content", "")`` on the final message can still return None
    when the server sends an explicit JSON null (reasoning models), making
    ``final[:200]`` and ``"22" in final`` raise TypeError. Coalesce with
    ``or ""``.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return

        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")

        # Echo the assistant message back verbatim, then attach our tool result.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })

        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return

        # 'content' may be JSON null — coalesce before slicing / substring test.
        final = body2["choices"][0]["message"].get("content") or ""
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
|
|
||||||
|
|
||||||
def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.

    Note: only failures are recorded — an accepted param produces a log
    line but no RESULTS entry, so this test can only subtract from the
    pass count, never add to it.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")

    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }

    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        # Composite entry: the dict value is merged in as two real params below
        # only if the endpoint treats the literal key as unknown — NOTE(review):
        # as written this sends the key "logprobs+top_logprobs" verbatim; verify
        # that is intentional.
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
        ("top_logprobs", 5),
    ]

    with make_client() as c:
        # baseline
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")

        for name, val in extra_params:
            # Each request carries the base payload plus exactly one extra param.
            req = {**base_req, name: val}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
|
|
||||||
|
|
||||||
def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")

    # Exact shape OpenClaw emits for a tool that takes no parameters.
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            }
        }
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Search for cats"}],
        "tools": [bad_tool],
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed the bad schema")
        record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 8. Nested properties=[] in tool schema (Tool 21 bug) ────
|
|
||||||
|
|
||||||
def test_nested_bad_properties():
    """
    Reproduce the exact Tool 21 400 error:
        schema['properties']['fields']['items']['properties'] = []

    This happens when a tool has an array-of-objects parameter where
    the items' properties field is [] instead of {}. The middleware
    must recurse into the schema to fix ALL properties fields.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Nested properties=[] in tool schema (Tool 21 bug)")
    print(f"{'='*60}")

    # This is the exact shape that causes: "Tool 21 function has invalid 'parameters' schema:
    # [] is not of type 'object' ... On schema['properties']['fields']['items']['properties']"
    tools = [{
        "type": "function",
        "function": {
            "name": "message",
            "description": "Send a message",
            "parameters": {
                "type": "object",
                "properties": {
                    "fields": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": []  # <-- THIS causes the 400
                        }
                    }
                }
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Send a message to Bob"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 128,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:500]}")
            record("nested bad properties", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # Any 200 means the middleware sanitized the nested schema before SGLang saw it.
        print(f"[{ts()}] Success — middleware fixed nested properties=[] to {{}}")
        record("nested bad properties", True, "Middleware fixed nested properties.properties=[] to {}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 9. OpenClaw full payload (chat_template_kwargs + tools) ─
|
|
||||||
|
|
||||||
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.

    Fix: ``msg.get('content', '')`` can still yield None when the server
    sends an explicit JSON null (reasoning models), so slicing it crashed.
    Coalesce with ``or ''`` before slicing.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            # 'content' may be JSON null — coalesce before slicing.
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
|
|
||||||
|
|
||||||
|
|
||||||
# ── Main ─────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def main():
    """Run every test in order, then print an aggregate pass/fail report."""
    print(f"\n{'='*60}")
    print(f"Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")

    # Fixed execution order; each test appends its outcome to RESULTS.
    suite = (
        test_basic_nonstream,
        test_basic_stream,
        test_toolcall_nonstream,
        test_toolcall_stream,
        test_tool_response_flow,
        test_param_sweep,
        test_openclaw_tool_schema,
        test_nested_bad_properties,
        test_openclaw_full_payload,
    )
    for test_fn in suite:
        test_fn()

    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")
|
|
||||||
@@ -1,395 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test suite for vLLM GLM-5.1 streaming tool calls.
|
|
||||||
|
|
||||||
Reproduces the issue where long string parameters in tool calls
|
|
||||||
are buffered entirely before being emitted during streaming.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import httpx
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
# Configuration - will be set via environment or direct assignment
|
|
||||||
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
|
|
||||||
API_KEY = os.environ.get("VLLM_API_KEY", "none")
|
|
||||||
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
|
|
||||||
|
|
||||||
|
|
||||||
def timestamp():
    """Return the current local time formatted ``HH:MM:SS.mmm``."""
    full = datetime.now().strftime("%H:%M:%S.%f")
    return full[:-3]
|
|
||||||
|
|
||||||
|
|
||||||
def test_streaming_tool_call_with_code():
    """
    Test streaming a tool call with a long string parameter.

    This prompts the model to generate code via a tool call,
    which should stream incrementally if the patch works correctly.

    Verdict logic: >1 argument chunk → PASS (incremental streaming);
    exactly 1 chunk over 1000 chars → FAIL (server buffered the whole
    argument string); anything else → inconclusive.
    Returns a summary dict with chunk counts and timing.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file. Use this to save code, text, or other content.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {
                            "type": "string",
                            "description": "Name of the file to write"
                        },
                        "content": {
                            "type": "string",
                            "description": "The content to write to the file"
                        }
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Write a Python implementation of a binary search tree with insert, search, and delete methods. Include docstrings and type hints. Save it to bst.py using the write_file tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with long string parameter")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")

    # Track streaming events
    chunks_received = []       # every parsed SSE chunk, for the final count
    first_chunk_time = None    # wall time of first chunk (latency metric)
    last_chunk_time = None
    tool_call_chunks = []      # raw argument fragments, in arrival order
    accumulated_content = ""   # concatenation of all argument fragments

    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 4096,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            print(f"[{timestamp()}] Response status: {response.status_code}")

            for line in response.iter_lines():
                # Skip SSE keep-alives and the terminator sentinel.
                if not line or line == "data: [DONE]":
                    continue

                if line.startswith("data: "):
                    chunk_data = line[6:]
                    try:
                        chunk = json.loads(chunk_data)

                        if first_chunk_time is None:
                            first_chunk_time = time.time()
                            print(f"\n[{timestamp()}] FIRST CHUNK RECEIVED ({first_chunk_time - start_time:.3f}s)")

                        last_chunk_time = time.time()
                        chunks_received.append(chunk)

                        # Extract delta content
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})

                            # Check for tool calls in delta
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    tc_index = tc.get("index", 0)
                                    tc_function = tc.get("function", {})

                                    if tc_function.get("name"):
                                        print(f"\n[{timestamp()}] Tool call name: {tc_function['name']}")

                                    if tc_function.get("arguments"):
                                        args_chunk = tc_function["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk

                                        # Print progress every ~500 chars
                                        # (true exactly when the running total just crossed a 500-char boundary)
                                        if len(accumulated_content) % 500 < len(args_chunk):
                                            print(f"[{timestamp()}] Accumulated {len(accumulated_content)} chars...")

                            # Regular content
                            if delta.get("content"):
                                print(f"[{timestamp()}] Content chunk: {delta['content'][:50]}...")

                    except json.JSONDecodeError as e:
                        print(f"[{timestamp()}] JSON decode error: {e}")

    end_time = time.time()

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total chunks received: {len(chunks_received)}")
    print(f"Total time: {end_time - start_time:.3f}s")

    if first_chunk_time:
        print(f"Time to first chunk: {first_chunk_time - start_time:.3f}s")

    if tool_call_chunks:
        print(f"Tool call chunks: {len(tool_call_chunks)}")
        print(f"Total tool call content: {len(accumulated_content)} chars")

        # Try to parse the accumulated arguments
        print(f"\nAttempting to parse tool call arguments...")
        try:
            args = json.loads(accumulated_content)
            print(f"Successfully parsed!")
            print(f" - filename: {args.get('filename', 'N/A')}")
            print(f" - content length: {len(args.get('content', ''))} chars")
        except json.JSONDecodeError as e:
            print(f"Failed to parse: {e}")
            print(f"Raw accumulated content (first 500 chars):\n{accumulated_content[:500]}")

    # Verdict
    print(f"\n{'='*60}")
    if len(tool_call_chunks) > 1:
        print("✓ PASS: Tool call arguments arrived in multiple chunks")
        print(f" Chunks: {len(tool_call_chunks)}, indicating incremental streaming")
    elif len(tool_call_chunks) == 1 and len(accumulated_content) > 1000:
        print("✗ FAIL: Tool call arguments arrived in a single chunk")
        print(" This indicates buffering, not true streaming")
    else:
        print("? INCONCLUSIVE: Not enough data or no tool call occurred")
    print(f"{'='*60}\n")

    return {
        "chunks_received": len(chunks_received),
        "tool_call_chunks": len(tool_call_chunks),
        "accumulated_length": len(accumulated_content),
        "total_time": end_time - start_time
    }
|
|
||||||
|
|
||||||
|
|
||||||
def test_streaming_tool_call_with_json():
    """
    Test streaming a tool call that returns structured JSON data.

    Same buffering check as the code-generation variant, but the long
    argument is a deeply nested JSON config object instead of source code.
    Verdict: >1 argument chunk → PASS; exactly 1 → FAIL (buffered).
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "save_config",
                "description": "Save a configuration object",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "config": {
                            "type": "object",
                            "description": "Configuration object with many fields"
                        }
                    },
                    "required": ["config"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Create a detailed configuration for a web server with the following sections: server (host, port, ssl), logging (level, format, outputs), cache (enabled, ttl, max_size), rate_limiting (enabled, requests_per_minute, burst), cors (enabled, origins, methods, headers), security (headers, csp, hsts). Use the save_config tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with nested JSON")
    print(f"{'='*60}\n")

    tool_call_chunks = []      # argument fragments in arrival order
    accumulated_content = ""   # full argument string, rebuilt from fragments
    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 2048,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            for line in response.iter_lines():
                # Skip keep-alive blanks and the SSE terminator.
                if not line or line == "data: [DONE]":
                    continue

                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    if tc.get("function", {}).get("arguments"):
                                        args_chunk = tc["function"]["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk
                                        print(f"[{timestamp()}] Chunk {len(tool_call_chunks)}: +{len(args_chunk)} chars (total: {len(accumulated_content)})")
                    except json.JSONDecodeError:
                        # Tolerate malformed frames rather than aborting the stream.
                        pass

    end_time = time.time()

    print(f"\n{'='*60}")
    print(f"Total chunks: {len(tool_call_chunks)}, Total content: {len(accumulated_content)} chars")
    print(f"Time: {end_time - start_time:.3f}s")

    if len(tool_call_chunks) > 1:
        print("✓ PASS: Arguments streamed in multiple chunks")
    elif len(tool_call_chunks) == 1:
        print("✗ FAIL: Arguments arrived in single chunk (buffered)")
    else:
        print("? No tool call occurred")
    print(f"{'='*60}\n")
|
|
||||||
|
|
||||||
|
|
||||||
def test_non_streaming_tool_call():
    """
    Baseline test: non-streaming tool call for comparison.

    Sends a single write_file request with stream=False and reports whether
    the model produced a parseable tool call. Network endpoint, key, and
    model come from the module-level API_BASE / API_KEY / MODEL constants.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {"type": "string"},
                        "content": {"type": "string"}
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Write a simple Python hello world and save it using the write_file tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Non-streaming tool call (baseline)")
    print(f"{'='*60}\n")

    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 1024,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()
        end_time = time.time()

        print(f"Status: {response.status_code}")
        print(f"Time: {end_time - start_time:.3f}s")

        if result.get("choices"):
            message = result["choices"][0].get("message", {})
            if message.get("tool_calls"):
                for tc in message["tool_calls"]:
                    print(f"Tool: {tc['function']['name']}")
                    # BUG FIX: malformed arguments from the model previously
                    # raised json.JSONDecodeError and aborted the whole suite;
                    # report the failure for this call and keep going instead.
                    try:
                        args = json.loads(tc["function"]["arguments"])
                    except json.JSONDecodeError as e:
                        print(f"Arguments failed to parse: {e}")
                        continue
                    print(f"Arguments parsed successfully")
                    print(f" - filename: {args.get('filename')}")
                    print(f" - content length: {len(args.get('content', ''))}")
            else:
                print("No tool call in response")

        print(f"{'='*60}\n")
|
|
||||||
|
|
||||||
def main():
    """
    Entry point: check API connectivity (best-effort), then run the
    streaming tool-call tests in order.
    """
    print("\n" + "="*60)
    print("vLLM GLM-5.1 Streaming Tool Call Tests")
    print("="*60)

    # Check API connectivity. Failure is only a warning — the tests below
    # still run so their own errors can be observed.
    print(f"\nChecking API at {API_BASE}...")
    try:
        with httpx.Client(timeout=10.0) as client:
            response = client.get(f"{API_BASE.replace('/v1', '')}/health")
            print(f"Health check: {response.status_code}")
    except Exception as e:
        print(f"Warning: Could not reach API - {e}")

    # Run tests
    print("\nRunning tests...\n")

    # Test 1: Non-streaming baseline
    test_non_streaming_tool_call()

    # Test 2: Streaming with nested JSON
    test_streaming_tool_call_with_json()

    # Test 3: Main test - streaming with long code.
    # (The return value was previously bound to an unused local; dropped.)
    test_streaming_tool_call_with_code()

    print("\nAll tests complete.")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point.
if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,243 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Focused test to diagnose GLM-5.1 tool response issue.
|
|
||||||
|
|
||||||
The issue: Model sees tool response as blank.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
import json
|
|
||||||
|
|
||||||
API_BASE = "http://95.179.247.150/v1"
|
|
||||||
API_KEY = "whatever"
|
|
||||||
MODEL = "HuggingFaceTB/SmolLM3-3B"
|
|
||||||
|
|
||||||
|
|
||||||
def test_simple_tool_response():
    """
    Minimal test: send a pre-built tool response and see if the model can
    use it.

    A fake assistant tool_call plus its tool result ("value 42") are placed
    in the history; the model's reply should reference 42 if it actually
    saw the tool message.
    """
    # Simulate a conversation where a tool was called
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "test_func",
            "description": "A test function",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("=" * 60)
    print("Request messages:")
    print(json.dumps(messages, indent=2))
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        # Non-streaming to get full response
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        print("\nFull response:")
        print(json.dumps(result, indent=2))

        if result.get("choices"):
            # BUG FIX: reasoning models may return content as null (the key
            # exists, so .get's default doesn't apply), which made the
            # substring checks below raise TypeError. Coerce None to "".
            content = result["choices"][0].get("message", {}).get("content", "") or ""
            print("\n" + "=" * 60)
            print("Model response content:")
            print(content)
            print("=" * 60)

            # Check if the tool result is referenced
            if "42" in content:
                print("\n✓ PASS: Model referenced the tool result (42)")
            else:
                print("\n✗ FAIL: Model did NOT reference the tool result (42)")

            # Check for signs the model didn't see the result
            if "don't have" in content.lower() or "cannot access" in content.lower():
                print("✗ Model indicates it cannot see tool result")
|
|
||||||
|
|
||||||
|
|
||||||
def test_without_tools_param():
    """
    Test what happens if we don't pass tools in the follow-up request.

    Some APIs need tools to be passed on every request; this checks whether
    the model can still read the tool result without the tools param.
    """
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    print("\n" + "=" * 60)
    print("Test WITHOUT tools param in follow-up")
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                # No tools param
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        if result.get("choices"):
            # BUG FIX: content may be null for reasoning models (key present
            # with None value), which crashed the slice/substring checks.
            content = result["choices"][0].get("message", {}).get("content", "") or ""
            print("Model response:", content[:200])

            if "42" in content:
                print("✓ Model referenced the tool result")
|
|
||||||
|
|
||||||
|
|
||||||
def test_different_content_formats():
    """
    Test if the issue is with how the tool message content is formatted:
    plain string content vs. OpenAI-style array-of-parts content.
    """
    # Test 1: String content (standard)
    messages_string = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 4"
        }
    ]

    # Test 2: Content as array (OpenAI format)
    messages_array = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": [{"type": "text", "text": "The answer is 4"}]
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculator",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("\n" + "=" * 60)
    print("Test: String content vs Array content")
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        for name, msgs in [("String content", messages_string), ("Array content", messages_array)]:
            print(f"\n--- {name} ---")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": msgs,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 128,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )

            result = response.json()
            if result.get("choices"):
                # BUG FIX: content can be null (reasoning models); coerce to
                # "" so the slice and substring checks below cannot raise.
                content = result["choices"][0].get("message", {}).get("content", "") or ""
                print(f"Response: {content[:150]}")
                if "4" in content:
                    print("✓ Referenced tool result")
                else:
                    print("✗ Did NOT reference tool result")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the three diagnosis tests in order.
if __name__ == "__main__":
    print("GLM-5.1 Tool Response Diagnosis")
    print("=" * 60)

    test_simple_tool_response()
    test_without_tools_param()
    test_different_content_formats()
|
|
||||||
@@ -1,463 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test for tool call response handling in GLM-5.1.
|
|
||||||
|
|
||||||
Tests the multi-turn flow:
|
|
||||||
1. Send a prompt that triggers a tool call
|
|
||||||
2. Send back the tool result
|
|
||||||
3. Verify the model can see and use the tool response
|
|
||||||
|
|
||||||
This reproduces the issue where tool responses appear blank to the model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import httpx
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
|
|
||||||
API_KEY = os.environ.get("VLLM_API_KEY", "none")
|
|
||||||
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
|
|
||||||
|
|
||||||
|
|
||||||
def timestamp():
    """Return the current wall-clock time formatted as HH:MM:SS.mmm."""
    now = datetime.now()
    # strftime yields microseconds; drop the last three digits for millis.
    return now.strftime("%H:%M:%S.%f")[:-3]
|
|
||||||
|
|
||||||
|
|
||||||
def test_tool_call_response_flow(streaming: bool = True):
    """
    Test the full tool call -> response -> follow-up flow.

    This simulates:
    1. User asks for weather
    2. Model calls get_weather tool
    3. We send back the weather data
    4. Model should see and use that data

    Args:
        streaming: run both requests with stream=True when True, else
            plain POSTs.

    Returns:
        dict with "success" plus either "reason" (early exit) or
        "issues"/"final_response" (full run).
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g. 'New York, NY'"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    # Initial request that should trigger a tool call
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Tokyo right now?"
        }
    ]

    mode = "STREAMING" if streaming else "NON-STREAMING"
    print(f"\n{'='*60}")
    print(f"TEST: Tool call response flow ({mode})")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")

    # Hoisted out of the four request sites below (previously duplicated).
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    base_payload = {
        "model": MODEL,
        "tools": tools,
        "max_tokens": 512,
        "chat_template_kwargs": {"enable_thinking": False},
        "logprobs": True,
        "top_logprobs": 5
    }

    with httpx.Client(timeout=120.0) as client:
        # Step 1: Send initial request, expect tool call
        print(f"[{timestamp()}] Step 1: Sending initial request...")

        if streaming:
            tool_calls = []
            tool_call_id = None
            tool_call_name = None
            accumulated_args = ""

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages,
                      "tool_choice": "auto", "stream": True}
            ) as response:
                print(f"[{timestamp()}] Response status: {response.status_code}")

                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if not line.startswith("data: "):
                        continue
                    try:
                        chunk = json.loads(line[6:])
                    except json.JSONDecodeError as e:
                        print(f"[{timestamp()}] JSON error: {e}")
                        continue
                    if not chunk.get("choices"):
                        continue
                    delta = chunk["choices"][0].get("delta", {})

                    # Tool-call fragments: id/name arrive once, arguments
                    # arrive as incremental text chunks.
                    for tc in delta.get("tool_calls") or []:
                        if tc.get("id"):
                            tool_call_id = tc["id"]
                        fn = tc.get("function", {})
                        if fn.get("name"):
                            tool_call_name = fn["name"]
                            print(f"[{timestamp()}] Tool call: {tool_call_name}")
                        if fn.get("arguments"):
                            accumulated_args += fn["arguments"]

                    if delta.get("content"):
                        print(f"[{timestamp()}] Content: {delta['content'][:100]}")

            if tool_call_name:
                tool_calls.append({
                    "id": tool_call_id or "call_0",
                    "type": "function",
                    "function": {
                        "name": tool_call_name,
                        "arguments": accumulated_args
                    }
                })
        else:
            # Non-streaming
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages,
                      "tool_choice": "auto", "stream": False}
            )

            result = response.json()
            print(f"[{timestamp()}] Response status: {response.status_code}")

            tool_calls = []
            if result.get("choices"):
                message = result["choices"][0].get("message", {})
                if message.get("tool_calls"):
                    tool_calls = message["tool_calls"]
                    for tc in tool_calls:
                        print(f"[{timestamp()}] Tool call: {tc['function']['name']}")
                        print(f"[{timestamp()}] Args: {tc['function']['arguments']}")

        # Check if we got a tool call
        if not tool_calls:
            print(f"\n[{timestamp()}] No tool call received - model didn't call the tool")
            return {"success": False, "reason": "no_tool_call"}

        # Step 2: Parse tool call and prepare response
        tc = tool_calls[0]
        tc_id = tc.get("id", "call_0")
        tc_name = tc["function"]["name"]
        # BUG FIX: malformed argument JSON previously raised and killed the
        # suite; fail this test gracefully instead.
        try:
            tc_args = json.loads(tc["function"]["arguments"])
        except json.JSONDecodeError as e:
            print(f"\n[{timestamp()}] Tool arguments failed to parse: {e}")
            return {"success": False, "reason": "bad_arguments"}

        print(f"\n[{timestamp()}] Step 2: Tool call received")
        print(f" Name: {tc_name}")
        print(f" Args: {tc_args}")

        # Simulate tool execution
        tool_result = {
            "location": tc_args.get("location", "Unknown"),
            "temperature": "22°C",
            "condition": "Partly cloudy",
            "humidity": "65%",
            "wind": "15 km/h NE"
        }

        # Step 3: Send the tool response back
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps(tool_result)
        })

        print(f"\n[{timestamp()}] Step 3: Sending tool response...")
        print(f" Tool call ID: {tc_id}")
        print(f" Tool result: {json.dumps(tool_result, indent=2)}")

        # Step 4: Get the model's follow-up response
        if streaming:
            final_response = ""
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (streaming)...")

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages, "stream": True}
            ) as response:
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if not line.startswith("data: "):
                        continue
                    try:
                        chunk = json.loads(line[6:])
                    except json.JSONDecodeError:
                        continue
                    if not chunk.get("choices"):
                        continue
                    delta = chunk["choices"][0].get("delta", {})
                    if delta.get("content"):
                        content = delta["content"]
                        final_response += content
                        print(f"[{timestamp()}] Content: {content}", end="", flush=True)

            print()  # newline after streaming output
        else:
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (non-streaming)...")

            response = client.post(
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages, "stream": False}
            )

            result = response.json()
            final_response = ""
            if result.get("choices"):
                # BUG FIX: content may be null (reasoning models); coerce to
                # "" so the .lower()/substring checks below cannot raise.
                final_response = result["choices"][0].get("message", {}).get("content", "") or ""

        print(f"\n[{timestamp()}] Final response:\n{final_response}")

        # Check if the model used the tool data
        success = True
        issues = []

        # The response should mention the weather data. ("22°C" contains
        # "22", so one substring check per datum suffices — the old second
        # clause was redundant.)
        if "22" not in final_response:
            issues.append("Temperature (22°C) not mentioned in response")
            success = False

        if "cloudy" not in final_response.lower():
            issues.append("Condition (Partly cloudy) not mentioned in response")
            success = False

        # Check for signs the model didn't see the data
        blank_indicators = [
            "i don't have",
            "i cannot access",
            "i'm unable to",
            "i am unable to",
            "don't have access",
            "don't have real-time",
            "cannot provide real-time"
        ]

        for indicator in blank_indicators:
            if indicator in final_response.lower():
                issues.append(f"Model seems unaware of tool result (found: '{indicator}')")
                success = False
                break

        print(f"\n{'='*60}")
        if success:
            print("✓ PASS: Model correctly used tool response data")
        else:
            print("✗ FAIL: Model did not use tool response correctly")
            for issue in issues:
                print(f" - {issue}")
        print(f"{'='*60}\n")

        return {
            "success": success,
            "issues": issues,
            "final_response": final_response
        }
|
|
||||||
|
|
||||||
|
|
||||||
def test_tool_response_with_debug_info():
    """
    Test with detailed logging to capture exactly what the model sees.

    Triggers a get_time tool call, feeds back a canned time string, dumps
    the full request/response JSON, and checks the follow-up mentions it.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_time",
                "description": "Get the current time",
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": []
                }
            }
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Tool response with debug info (non-streaming)")
    print(f"{'='*60}\n")

    messages = [
        {"role": "user", "content": "What time is it?"}
    ]

    with httpx.Client(timeout=120.0) as client:
        # Get tool call
        print(f"[{timestamp()}] Sending initial request...")
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        if not result.get("choices") or not result["choices"][0].get("message", {}).get("tool_calls"):
            print("No tool call - skipping test")
            return

        tool_call = result["choices"][0]["message"]["tool_calls"][0]
        tc_id = tool_call["id"]

        print(f"[{timestamp()}] Tool call: {tool_call['function']['name']}")
        print(f"[{timestamp()}] Tool call ID: {tc_id}")

        # Add tool response
        messages.append({
            "role": "assistant",
            "tool_calls": [tool_call]
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": "The current time is 3:45 PM on Thursday, April 9, 2026."
        })

        # Debug: print the full messages array we're about to send
        print(f"\n[{timestamp()}] Sending follow-up with these messages:")
        print(json.dumps(messages, indent=2))

        # Get follow-up
        response2 = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result2 = response2.json()
        print(f"\n[{timestamp()}] Full response:")
        print(json.dumps(result2, indent=2))

        if result2.get("choices"):
            # BUG FIX: content may be null (reasoning models put text in a
            # 'reasoning' field); coerce None to "" so the substring check
            # below cannot raise TypeError.
            content = result2["choices"][0].get("message", {}).get("content", "") or ""

            print(f"\n[{timestamp()}] Model response content: {content}")

            # Check if time is mentioned. (The old extra '3:45 PM' clause
            # was redundant — it contains '3:45'.)
            if "3:45" in content:
                print("\n✓ Model used the tool response (time mentioned)")
            else:
                print("\n✗ Model may not have seen the tool response (time not mentioned)")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the three GLM-5.1 tool-response tests in sequence."""
    print("\n" + "=" * 60)
    print("GLM-5.1 Tool Call Response Tests")
    print("=" * 60)

    # Non-streaming first (simpler to debug), then streaming, then the
    # verbose debug variant.
    suites = [
        ("Test 1: Non-streaming tool response flow",
         lambda: test_tool_call_response_flow(streaming=False)),
        ("Test 2: Streaming tool response flow",
         lambda: test_tool_call_response_flow(streaming=True)),
        ("Test 3: Debug info test",
         test_tool_response_with_debug_info),
    ]
    for label, run in suites:
        print(f"\n--- {label} ---")
        run()

    print("\nAll tests complete.")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point.
if __name__ == "__main__":
    main()
|
|
||||||
Reference in New Issue
Block a user