consolidate to run_suite.py: single pluggable test suite, all models 84/84
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1 +1,3 @@
|
||||
.env
|
||||
models.env
|
||||
__pycache__/
|
||||
|
||||
815
run_suite.py
Normal file
815
run_suite.py
Normal file
@@ -0,0 +1,815 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Universal model tool-call test suite.
|
||||
|
||||
Tests any OpenAI-compatible endpoint for:
|
||||
1. Basic chat (non-streaming + streaming)
|
||||
2. Tool calls (non-streaming + streaming)
|
||||
3. Multi-turn tool response flow (non-streaming + streaming)
|
||||
4. Nested/bad tool schema handling (SGLang compatibility)
|
||||
5. Streaming tool call chunking (are args actually streamed?)
|
||||
6. Param sweep (what vLLM params does the endpoint accept?)
|
||||
|
||||
Handles reasoning models (content in 'reasoning' field, null 'content'),
|
||||
different finish_reason values, and empty/tool_calls arrays gracefully.
|
||||
|
||||
Usage:
|
||||
TOOLTEST_API_BASE=... TOOLTEST_API_KEY=... TOOLTEST_MODEL=... python3 run_suite.py
|
||||
python3 run_suite.py --all
|
||||
python3 run_suite.py --model 1
|
||||
python3 run_suite.py --filter Devstral
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import httpx
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────
|
||||
|
||||
def ts() -> str:
    """Current wall-clock time as ``HH:MM:SS.mmm`` for log prefixes."""
    stamp = datetime.now().strftime("%H:%M:%S.%f")
    # strftime only offers microseconds; drop the last three digits.
    return stamp[:-3]
|
||||
|
||||
|
||||
def safe_choice(body: dict, index: int = 0) -> dict:
    """Return choice *index* from a response body, or {} when unavailable.

    Tolerates a missing or null ``choices`` key and an out-of-range index.
    """
    available = body.get("choices") or []
    return available[index] if index < len(available) else {}
|
||||
|
||||
|
||||
def safe_message(body: dict) -> dict:
    """Return the first choice's ``message`` dict, or {} when missing/null."""
    choices = body.get("choices") or []
    first = choices[0] if choices else {}
    return first.get("message") or {}
|
||||
|
||||
|
||||
def safe_delta(chunk: dict) -> dict:
    """Return the first choice's ``delta`` from a streaming chunk, or {}.

    Tolerates chunks with a missing/null ``choices`` list and a null delta.
    """
    for choice in (chunk.get("choices") or []):
        return choice.get("delta") or {}
    return {}
|
||||
|
||||
|
||||
def extract_content(msg: dict) -> tuple[str, str]:
    """Normalize a message into ``(content, reasoning)``, mapping null to "".

    Reasoning models may put their text in ``reasoning`` and leave
    ``content`` null; callers check both fields.
    """
    return (msg.get("content") or "", msg.get("reasoning") or "")
|
||||
|
||||
|
||||
# ── Config ───────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class ModelConfig:
    """One OpenAI-compatible endpoint under test."""

    api_base: str  # base URL ending in /v1, no trailing slash
    api_key: str   # bearer token for the Authorization header
    model: str     # full model id, possibly "org/name"

    @property
    def label(self):
        """Short display name: the segment after the last '/'."""
        return self.model.rsplit("/", 1)[-1]
|
||||
|
||||
|
||||
def load_models_env(path: Path) -> list[ModelConfig]:
    """Parse models.env: one ``api_base | api_key | model`` entry per line.

    Blank lines and '#' comments are skipped; lines with fewer than three
    pipe-separated fields are silently ignored.
    """
    configs: list[ModelConfig] = []
    for raw in path.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        fields = [f.strip() for f in entry.split("|")]
        if len(fields) >= 3:
            configs.append(
                ModelConfig(api_base=fields[0], api_key=fields[1], model=fields[2])
            )
    return configs
|
||||
|
||||
|
||||
def config_from_env() -> ModelConfig | None:
    """Build a config from TOOLTEST_* env vars, or None if any is unset/empty."""
    base = os.environ.get("TOOLTEST_API_BASE")
    key = os.environ.get("TOOLTEST_API_KEY")
    model = os.environ.get("TOOLTEST_MODEL")
    if not (base and key and model):
        return None
    return ModelConfig(api_base=base, api_key=key, model=model)
|
||||
|
||||
|
||||
# ── Test result types ────────────────────────────────────────
|
||||
|
||||
@dataclass
class TestResult:
    """Outcome of a single test against one endpoint."""

    name: str                # short test identifier, e.g. "basic stream"
    passed: bool             # True when the behavior under test worked
    detail: str = ""         # human-readable explanation / response excerpt
    duration_s: float = 0.0  # wall-clock duration of the test, in seconds
|
||||
|
||||
|
||||
@dataclass
class SuiteResult:
    """All test results collected for one model."""

    model: str  # full model id of the endpoint that was tested
    results: list[TestResult] = field(default_factory=list)

    @property
    def passed(self):
        """Number of tests that passed."""
        return len([r for r in self.results if r.passed])

    @property
    def total(self):
        """Total number of tests recorded."""
        return len(self.results)
|
||||
|
||||
|
||||
def make_client(cfg: ModelConfig) -> httpx.Client:
    """Build an httpx client with auth + JSON headers for *cfg*.

    The generous 120 s timeout accommodates slow, cold-started model
    backends; callers use the client as a context manager.
    """
    return httpx.Client(
        timeout=120.0,
        headers={
            "Authorization": f"Bearer {cfg.api_key}",
            "Content-Type": "application/json",
        },
    )
|
||||
|
||||
|
||||
# ── Shared tool definitions ──────────────────────────────────
|
||||
|
||||
# Well-formed tool: a single required string parameter.  Used by the
# basic tool-call and tool-response-flow tests.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
            },
            "required": ["location"]
        }
    }
}

# Well-formed tool whose arguments can get long (file content) — used to
# observe whether streamed tool-call arguments arrive chunked or buffered.
WRITE_FILE_TOOL = {
    "type": "function",
    "function": {
        "name": "write_file",
        "description": "Write content to a file.",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {"type": "string", "description": "Name of the file"},
                "content": {"type": "string", "description": "The content to write"}
            },
            "required": ["filename", "content"]
        }
    }
}

# Deliberately malformed: JSON Schema requires "properties" to be an
# object, not an array.  Exercises endpoint-side schema validation or
# middleware repair.
BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Search the web",
        "parameters": {
            "type": "object",
            "properties": []  # Invalid — should be {}
        }
    }
}

# Same malformation, but nested inside an array item schema — catches
# validators that only sanitize the top level.
NESTED_BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "message",
        "description": "Send a message",
        "parameters": {
            "type": "object",
            "properties": {
                "fields": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": []  # Invalid — should be {}
                    }
                }
            }
        }
    }
}
|
||||
|
||||
|
||||
# ── Test functions ───────────────────────────────────────────
|
||||
|
||||
def test_basic_nonstream(cfg: ModelConfig) -> TestResult:
    """1. Basic non-streaming chat.

    Passes when the model returns non-empty content; a reasoning-only
    reply (null content, text in 'reasoning') also counts as a pass.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            r = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Say hello in one word."}],
                "stream": False,
                "max_tokens": 64,
            })
            body = r.json()
            dur = time.time() - start
            if r.status_code != 200:
                return TestResult("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}", dur)
            content, reasoning = extract_content(safe_message(body))
            fr = safe_choice(body).get("finish_reason", "?")
            if content:
                return TestResult("basic non-stream", True, f"Got: {content[:80]}", dur)
            elif reasoning:
                # Reasoning models may put everything in 'reasoning'.
                return TestResult("basic non-stream", True, f"Reasoning-only (finish: {fr}): {reasoning[:80]}", dur)
            else:
                return TestResult("basic non-stream", False, f"Empty response (finish: {fr})", dur)
        except Exception as e:
            # Any transport/parse error counts as a test failure, not a crash.
            return TestResult("basic non-stream", False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def test_basic_stream(cfg: ModelConfig) -> TestResult:
    """2. Basic streaming chat.

    Accumulates SSE deltas; passes when any content (or, for reasoning
    models, any 'reasoning' text) is streamed back.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Count from 1 to 5."}],
                "stream": True,
                "max_tokens": 64,
            }) as r:
                if r.status_code != 200:
                    body = "".join(r.iter_lines())
                    dur = time.time() - start
                    return TestResult("basic stream", False, f"HTTP {r.status_code}: {body[:200]}", dur)
                full_content = ""
                full_reasoning = ""
                for line in r.iter_lines():
                    # SSE framing: skip keep-alives and the terminator.
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            if delta.get("content"):
                                full_content += delta["content"]
                            if delta.get("reasoning"):
                                full_reasoning += delta["reasoning"]
                        except json.JSONDecodeError:
                            # Tolerate malformed SSE payloads.
                            pass
                dur = time.time() - start
                if full_content:
                    return TestResult("basic stream", True, f"Got: {full_content[:80]}", dur)
                elif full_reasoning:
                    return TestResult("basic stream", True, f"Reasoning-only: {full_reasoning[:80]}", dur)
                else:
                    return TestResult("basic stream", False, "No content or reasoning received", dur)
        except Exception as e:
            return TestResult("basic stream", False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def test_toolcall_nonstream(cfg: ModelConfig) -> TestResult:
    """3. Tool call — non-streaming.

    Passes when the model emits at least one tool_call for a prompt that
    clearly requires the weather tool; a prose answer is a failure.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            r = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
            })
            body = r.json()
            dur = time.time() - start
            if r.status_code != 200:
                return TestResult("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}", dur)
            msg = safe_message(body)
            tool_calls = msg.get("tool_calls") or []
            if tool_calls:
                tc = tool_calls[0]
                fn = tc.get("function", {})
                return TestResult("tool call non-stream", True,
                                  f"Tool: {fn.get('name','?')}, args: {fn.get('arguments','')[:60]}", dur)
            else:
                # Model answered in prose instead of calling the tool.
                content, reasoning = extract_content(msg)
                out = content or reasoning or "(empty)"
                return TestResult("tool call non-stream", False, f"No tool call. Response: {out[:100]}", dur)
        except Exception as e:
            return TestResult("tool call non-stream", False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def test_toolcall_stream(cfg: ModelConfig) -> TestResult:
    """4. Tool call — streaming.

    Accumulates tool-call deltas from the SSE stream; passes when a tool
    name is observed.  Content/reasoning text is collected only so a
    failure can report what the model said instead.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 256,
            }) as r:
                if r.status_code != 200:
                    # FIX: the error body was read but never reported; include
                    # it in the detail like the non-streaming variant does.
                    body = "".join(r.iter_lines())
                    dur = time.time() - start
                    return TestResult("tool call stream", False, f"HTTP {r.status_code}: {body[:200]}", dur)
                tool_name = None
                accumulated_args = ""
                content_parts = ""
                reasoning_parts = ""
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            tc_list = delta.get("tool_calls") or []
                            for tc in tc_list:
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_parts += delta["content"]
                            if delta.get("reasoning"):
                                reasoning_parts += delta["reasoning"]
                        except json.JSONDecodeError:
                            # Tolerate malformed SSE payloads.
                            pass
                dur = time.time() - start
                if tool_name:
                    return TestResult("tool call stream", True,
                                      f"Tool: {tool_name}, args: {accumulated_args[:60]}", dur)
                else:
                    out = content_parts or reasoning_parts or "(empty)"
                    return TestResult("tool call stream", False, f"No tool call. Response: {out[:100]}", dur)
        except Exception as e:
            return TestResult("tool call stream", False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def test_tool_response_flow(cfg: ModelConfig, streaming: bool = False) -> TestResult:
    """5/6. Full tool call → response → follow-up flow.

    Step 1 elicits a tool call (streamed or not per *streaming*); step 2
    appends a synthetic tool result ("22°C"); step 3 asks for a follow-up
    and checks the model actually used the tool data ("22" appears and no
    "I can't access" disclaimer).
    """
    label = "tool response flow (stream)" if streaming else "tool response flow"
    with make_client(cfg) as c:
        start = time.time()
        try:
            messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

            # Step 1: Get tool call
            if not streaming:
                r = c.post(f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 256,
                })
                body = r.json()
                if r.status_code != 200:
                    return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                msg = safe_message(body)
            else:
                tool_name = None
                tool_id = None
                accumulated_args = ""
                with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 256,
                }) as r:
                    if r.status_code != 200:
                        return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                    for line in r.iter_lines():
                        if not line or line == "data: [DONE]":
                            continue
                        if line.startswith("data: "):
                            try:
                                chunk = json.loads(line[6:])
                                delta = safe_delta(chunk)
                                for tc in (delta.get("tool_calls") or []):
                                    if tc.get("id"):
                                        tool_id = tc["id"]
                                    fn = tc.get("function") or {}
                                    if fn.get("name"):
                                        tool_name = fn["name"]
                                    if fn.get("arguments"):
                                        accumulated_args += fn["arguments"]
                            except json.JSONDecodeError:
                                pass
                if not tool_name:
                    return TestResult(label, False, "No tool call in step 1", time.time() - start)
                # Reassemble an assistant message matching the non-streaming
                # shape so the rest of the flow is identical for both paths.
                msg = {
                    "role": "assistant",
                    "tool_calls": [{
                        "id": tool_id or "call_0",
                        "type": "function",
                        "function": {"name": tool_name, "arguments": accumulated_args}
                    }]
                }

            tool_calls = msg.get("tool_calls") or []
            if not tool_calls:
                return TestResult(label, False, "No tool call in step 1", time.time() - start)

            tc = tool_calls[0]
            tc_id = tc.get("id", "call_0")

            # Step 2: Send tool response
            messages.append(msg)
            messages.append({
                "role": "tool",
                "tool_call_id": tc_id,
                "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
            })

            # Step 3: Get follow-up
            r2 = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": messages,
                "tools": [WEATHER_TOOL],
                "stream": False,
                "max_tokens": 256,
            })
            body2 = r2.json()
            dur = time.time() - start
            if r2.status_code != 200:
                return TestResult(label, False, f"Step 3 HTTP {r2.status_code}", dur)

            final_msg = safe_message(body2)
            final_content, final_reasoning = extract_content(final_msg)
            final = final_content or final_reasoning or ""

            # Check the model actually used the tool data: the temperature
            # must surface and no "can't access" disclaimer may appear.
            ok = "22" in final
            indicators = ["i don't have", "i cannot access", "don't have access", "cannot provide real-time"]
            for ind in indicators:
                if ind in final.lower():
                    ok = False
                    break
            if not final_content and final_reasoning:
                return TestResult(label, ok, f"Reasoning-only (used data: {'yes' if ok else 'no'}) — {final[:100]}", dur)
            return TestResult(label, ok, f"{'Used' if ok else 'Did NOT use'} tool result — {final[:100]}", dur)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def test_bad_tool_schema(cfg: ModelConfig) -> TestResult:
    """7. OpenClaw-style tool with properties=[] (tests schema validation/middleware).

    Pass = the endpoint tolerates (or silently repairs) the invalid
    schema and returns HTTP 200; the response content is not inspected.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            r = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Search for cats"}],
                "tools": [BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = r.json()
            dur = time.time() - start
            if r.status_code != 200:
                # Extract a short error message; fall back to the raw body
                # when the error payload has an unexpected shape.
                err = ""
                try:
                    err = body.get("error", {}).get("message", "")[:150]
                except Exception:
                    err = json.dumps(body)[:150]
                return TestResult("bad tool schema (properties=[])", False, f"HTTP {r.status_code}: {err}", dur)
            return TestResult("bad tool schema (properties=[])", True, "Endpoint accepted/fixed bad schema", dur)
        except Exception as e:
            return TestResult("bad tool schema (properties=[])", False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def test_nested_bad_schema(cfg: ModelConfig) -> TestResult:
    """8. Nested properties=[] inside items (the Tool 21 bug).

    Same contract as test_bad_tool_schema, but the invalid "properties"
    is buried inside an array item schema to catch validators/middleware
    that only sanitize the top level.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            r = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Send a message to Bob"}],
                "tools": [NESTED_BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = r.json()
            dur = time.time() - start
            if r.status_code != 200:
                # Extract a short error message; fall back to the raw body.
                err = ""
                try:
                    err = body.get("error", {}).get("message", "")[:150]
                except Exception:
                    err = json.dumps(body)[:150]
                return TestResult("nested bad schema (items.properties=[])", False, f"HTTP {r.status_code}: {err}", dur)
            return TestResult("nested bad schema (items.properties=[])", True, "Endpoint accepted/fixed nested bad schema", dur)
        except Exception as e:
            return TestResult("nested bad schema (items.properties=[])", False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def test_streaming_tool_chunks(cfg: ModelConfig) -> TestResult:
    """9. Streaming tool call chunking — are args actually streamed in multiple chunks?

    Prompts for a write_file call (long arguments), then counts how many
    SSE deltas carried argument fragments.  >1 chunk = truly streamed;
    1 large chunk (>500 chars) = buffered server-side; 1 small chunk is
    given the benefit of the doubt.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{
                    "role": "user",
                    "content": "Write a Python hello world and save it using the write_file tool."
                }],
                "tools": [WRITE_FILE_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 1024,
            }) as r:
                if r.status_code != 200:
                    dur = time.time() - start
                    return TestResult("streaming tool chunking", False, f"HTTP {r.status_code}", dur)

                tool_name = None
                arg_chunks = 0          # number of deltas carrying argument text
                accumulated_args = ""
                content_chunks = 0      # prose deltas (used only in failure detail)
                reasoning_chunks = 0
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            for tc in (delta.get("tool_calls") or []):
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    arg_chunks += 1
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_chunks += 1
                            if delta.get("reasoning"):
                                reasoning_chunks += 1
                        except json.JSONDecodeError:
                            pass

                dur = time.time() - start
                if not tool_name:
                    if content_chunks > 0 or reasoning_chunks > 0:
                        return TestResult("streaming tool chunking", False,
                                          f"No tool call — model produced {content_chunks} content + {reasoning_chunks} reasoning chunks", dur)
                    return TestResult("streaming tool chunking", False, "No tool call and no content", dur)

                # Evaluate chunking quality
                if arg_chunks > 1:
                    return TestResult("streaming tool chunking", True,
                                      f"Args streamed in {arg_chunks} chunks ({len(accumulated_args)} chars)", dur)
                elif arg_chunks == 1 and len(accumulated_args) > 500:
                    return TestResult("streaming tool chunking", False,
                                      f"Args in 1 chunk but {len(accumulated_args)} chars — buffered, not streamed", dur)
                elif arg_chunks == 1:
                    return TestResult("streaming tool chunking", True,
                                      f"Args in 1 chunk ({len(accumulated_args)} chars — may be too short to stream)", dur)
                else:
                    return TestResult("streaming tool chunking", False, "Tool name only, no arg chunks", dur)
        except Exception as e:
            return TestResult("streaming tool chunking", False, f"Exception: {e}", time.time() - start)
|
||||
|
||||
|
||||
def _sweep_request(base_req: dict, name: str, val) -> dict:
    """Build one sweep request from a (name, val) entry.

    Composite entries — name contains '+' and val is a dict of several
    top-level params (e.g. "logprobs+top_logprobs") — are merged into the
    request wholesale.  Every other entry is a single parameter sent under
    its own name, even when its value happens to be a dict.
    """
    if "+" in name and isinstance(val, dict):
        return {**base_req, **val}
    return {**base_req, name: val}


def test_param_sweep(cfg: ModelConfig) -> list[TestResult]:
    """10. Parameter sweep — which vLLM params does the endpoint accept?

    Sends one minimal request per parameter and records whether the
    endpoint returns HTTP 200.  Returns one TestResult per parameter.
    """
    results = []
    base_req = {
        "model": cfg.model,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    # (param name, value) pairs.  A '+' in the name marks a composite
    # entry whose dict value holds several top-level params to merge.
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
    ]

    with make_client(cfg) as c:
        for name, val in extra_params:
            start = time.time()
            try:
                # BUG FIX: dict values were previously spread into the request
                # top level, so "chat_template_kwargs" sent a bare
                # enable_thinking and "response_format" sent a bare "type" —
                # neither actually tested the named parameter.
                req = _sweep_request(base_req, name, val)
                r = c.post(f"{cfg.api_base}/chat/completions", json=req)
                dur = time.time() - start
                ok = r.status_code == 200
                detail = f"HTTP {r.status_code}"
                if not ok:
                    try:
                        detail += f": {r.json().get('error', {}).get('message', '')[:80]}"
                    except Exception:
                        pass
                results.append(TestResult(f"param: {name}", ok, detail, dur))
            except Exception as e:
                results.append(TestResult(f"param: {name}", False, f"Exception: {e}", time.time() - start))

    return results
|
||||
|
||||
|
||||
# ── Suite runner ─────────────────────────────────────────────
|
||||
|
||||
def _test_tool_flow_nonstream(cfg):
    """5. Full tool call → response → follow-up flow (non-streaming)."""
    return test_tool_response_flow(cfg, streaming=False)


def _test_tool_flow_stream(cfg):
    """6. Full tool call → response → follow-up flow (streaming)."""
    return test_tool_response_flow(cfg, streaming=True)


# Ordered list of test callables run by run_suite(); each takes a
# ModelConfig and returns a TestResult.  Named wrappers replace the former
# lambdas so run_suite's docstring/__name__-based progress label no longer
# prints "<lambda>" for the two flow tests.
ALL_TESTS = [
    test_basic_nonstream,
    test_basic_stream,
    test_toolcall_nonstream,
    test_toolcall_stream,
    _test_tool_flow_nonstream,
    _test_tool_flow_stream,
    test_bad_tool_schema,
    test_nested_bad_schema,
    test_streaming_tool_chunks,
]
|
||||
|
||||
|
||||
def run_suite(cfg: ModelConfig, verbose: bool = True) -> SuiteResult:
    """Run the full test suite against one model config.

    Executes every entry in ALL_TESTS plus the parameter sweep, printing
    per-test progress when *verbose*, and returns the aggregated results.
    """
    result = SuiteResult(model=cfg.model)

    print(f"\n{'='*60}")
    print(f"Testing: {cfg.model}")
    print(f"API: {cfg.api_base}")
    print(f"{'='*60}")

    for test_fn in ALL_TESTS:
        # Display name: first docstring line, falling back to __name__.
        name = (test_fn.__doc__ or "").strip().split("\n")[0] or test_fn.__name__
        if verbose:
            print(f"\n[{ts()}] Running: {name}...")

        tr = test_fn(cfg)
        # A test may return one TestResult or a list of them.
        if isinstance(tr, list):
            result.results.extend(tr)
        else:
            result.results.append(tr)

        if verbose:
            if isinstance(tr, list):
                for r in tr:
                    s = "✓" if r.passed else "✗"
                    print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)")
            else:
                s = "✓" if tr.passed else "✗"
                print(f" {s} {tr.name}: {tr.detail} ({tr.duration_s:.1f}s)")

    # Param sweep (returns a list, run separately from ALL_TESTS)
    if verbose:
        print(f"\n[{ts()}] Running: parameter sweep...")
    sweep_results = test_param_sweep(cfg)
    result.results.extend(sweep_results)
    if verbose:
        for r in sweep_results:
            s = "✓" if r.passed else "✗"
            print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)")

    return result
|
||||
|
||||
|
||||
def print_summary(results: list[SuiteResult]):
    """Print a final summary across all models.

    First a per-model pass count with failing tests listed, then a
    cross-model ✓/✗ comparison table over the key (non-sweep) tests.
    """
    print(f"\n\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")

    for sr in results:
        passed = sr.passed
        total = sr.total
        pct = (passed / total * 100) if total else 0
        label = sr.model.split("/")[-1]
        print(f"\n {label}: {passed}/{total} passed ({pct:.0f}%)")

        for r in sr.results:
            if not r.passed:
                print(f" ✗ {r.name}: {r.detail[:80]}")

    # Cross-model comparison for key tests
    print(f"\n{'─'*60}")
    print("CROSS-MODEL COMPARISON")
    print(f"{'─'*60}")
    # Must match the TestResult.name strings the test functions emit.
    key_tests = [
        "basic non-stream",
        "basic stream",
        "tool call non-stream",
        "tool call stream",
        "tool response flow",
        "tool response flow (stream)",
        "streaming tool chunking",
        "bad tool schema (properties=[])",
        "nested bad schema (items.properties=[])",
    ]

    # Calculate column width: widest (truncated) model label, min 16.
    labels = [sr.model.split("/")[-1][:18] for sr in results]
    col_w = max(len(l) for l in labels) if labels else 16
    col_w = max(col_w, 16)

    header = f"{'Test':<40}"
    for l in labels:
        header += f" {l:>{col_w}}"
    print(header)
    print("─" * len(header))

    for test_name in key_tests:
        row = f"{test_name:<40}"
        for sr in results:
            match = [r for r in sr.results if r.name == test_name]
            if match:
                status = "✓" if match[0].passed else "✗"
                row += f" {status:>{col_w}}"
            else:
                # "—" = this model has no result recorded for the test.
                row += f" {'—':>{col_w}}"
        print(row)

    print(f"\n{'='*60}")
|
||||
|
||||
|
||||
# ── CLI ──────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: select model configs, run the suite, summarize.

    Selection precedence: --all > --model N > --filter NAME > TOOLTEST_*
    env vars.  Exits with status 1 when any model fails any test (so the
    suite is CI-friendly) or when no usable config can be found.
    """
    parser = argparse.ArgumentParser(description="Universal model tool-call test suite")
    parser.add_argument("--all", action="store_true", help="Test all models from models.env")
    parser.add_argument("--model", type=int, help="Test model by 1-based index from models.env")
    parser.add_argument("--filter", type=str, help="Test models matching substring")
    parser.add_argument("--quiet", action="store_true", help="Less output per test")
    args = parser.parse_args()

    models_path = Path(__file__).parent / "models.env"

    configs: list[ModelConfig] = []

    if args.all:
        if not models_path.exists():
            print("ERROR: models.env not found")
            sys.exit(1)
        configs = load_models_env(models_path)
    elif args.model is not None:
        # BUG FIX: this was `elif args.model:` — a truthiness test, so
        # `--model 0` silently fell through to the env-var branch instead
        # of reporting an out-of-range index.
        if not models_path.exists():
            print("ERROR: models.env not found")
            sys.exit(1)
        all_configs = load_models_env(models_path)
        if args.model < 1 or args.model > len(all_configs):
            print(f"ERROR: --model index {args.model} out of range (1-{len(all_configs)})")
            sys.exit(1)
        configs = [all_configs[args.model - 1]]
    elif args.filter:
        if not models_path.exists():
            print("ERROR: models.env not found")
            sys.exit(1)
        all_configs = load_models_env(models_path)
        configs = [c for c in all_configs if args.filter.lower() in c.model.lower()]
        if not configs:
            print(f"No models matching '{args.filter}'")
            sys.exit(1)
    else:
        cfg = config_from_env()
        if cfg:
            configs = [cfg]
        else:
            print("No model specified. Use --all, --model N, --filter NAME, or set TOOLTEST_* env vars.")
            if models_path.exists():
                # Help the user pick a --model index.
                print("\nAvailable models from models.env:")
                for i, c in enumerate(load_models_env(models_path), 1):
                    print(f" {i}. {c.model} @ {c.api_base}")
            sys.exit(1)

    all_results: list[SuiteResult] = []
    for cfg in configs:
        sr = run_suite(cfg, verbose=not args.quiet)
        all_results.append(sr)

    print_summary(all_results)

    # Non-zero exit when anything failed, for CI pipelines.
    if any(sr.passed < sr.total for sr in all_results):
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
23
run_tests.sh
23
run_tests.sh
@@ -1,19 +1,14 @@
|
||||
#!/bin/bash
|
||||
# Run the streaming tool call tests
|
||||
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
# Default values
|
||||
export VLLM_API_BASE="${VLLM_API_BASE:-http://95.179.247.150/v1}"
|
||||
export VLLM_API_KEY="${VLLM_API_KEY:-none}"
|
||||
export VLLM_MODEL="${VLLM_MODEL:-HuggingFaceTB/SmolLM3-3B}"
|
||||
# Usage:
|
||||
# ./run_tests.sh # Test all models from models.env
|
||||
# ./run_tests.sh --model 1 # Test model #1
|
||||
# ./run_tests.sh --filter Devstral # Test matching models
|
||||
# ./run_tests.sh --all # Same as no args
|
||||
# ./run_tests.sh --quiet # Less output
|
||||
|
||||
echo "Configuration:"
|
||||
echo " API_BASE: $VLLM_API_BASE"
|
||||
echo " MODEL: $VLLM_MODEL"
|
||||
echo ""
|
||||
|
||||
# Run the test
|
||||
python3 "$SCRIPT_DIR/test_streaming_tool_calls.py"
|
||||
cd "$SCRIPT_DIR"
|
||||
python3 -u run_suite.py "$@"
|
||||
|
||||
546
test_devstral.py
546
test_devstral.py
@@ -1,546 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
|
||||
|
||||
These tests send EXACTLY what OpenClaw would send to vLLM — including
|
||||
chat_template_kwargs, logprobs, weird tool schemas, the works.
|
||||
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
|
||||
|
||||
Architecture: this test → middleware (strips bad params) → SGLang
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import httpx
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Load .env if present (don't hardcode keys)
|
||||
_env_file = Path(__file__).parent / ".env"
|
||||
if _env_file.exists():
|
||||
for line in _env_file.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#") or "=" not in line:
|
||||
continue
|
||||
k, v = line.split("=", 1)
|
||||
os.environ.setdefault(k.strip(), v.strip())
|
||||
|
||||
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
|
||||
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
|
||||
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")
|
||||
|
||||
RESULTS = []
|
||||
|
||||
|
||||
def ts():
|
||||
return datetime.now().strftime("%H:%M:%S.%f")[:-3]
|
||||
|
||||
|
||||
def record(name, ok, detail=""):
|
||||
status = "✓ PASS" if ok else "✗ FAIL"
|
||||
print(f"\n{status}: {name}")
|
||||
if detail:
|
||||
print(f" {detail}")
|
||||
RESULTS.append({"name": name, "pass": ok, "detail": detail})
|
||||
|
||||
|
||||
def make_client():
|
||||
return httpx.Client(
|
||||
timeout=120.0,
|
||||
headers={
|
||||
"Authorization": f"Bearer {API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# ── 1. Basic non-streaming chat ──────────────────────────────

def test_basic_nonstream():
    """Smoke-test a plain non-streaming chat completion.

    Records a failure on any non-200 response; otherwise records the
    (truncated) reply text.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
            "max_tokens": 32,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # BUG FIX: reasoning models can return content=None (text lives in a
        # 'reasoning' field) — coerce to "" so slicing below cannot raise.
        # Also access choices/message defensively instead of raising KeyError.
        msg = (body.get("choices") or [{}])[0].get("message") or {}
        content = msg.get("content") or ""
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
|
||||
|
||||
|
||||
# ── 2. Basic streaming chat ──────────────────────────────────

def test_basic_stream():
    """Smoke-test a streaming chat completion, accumulating delta content."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Count from 1 to 5."}],
        "stream": True,
        "max_tokens": 64,
    }
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json=payload) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            full = ""
            for line in r.iter_lines():
                # Only well-formed SSE data frames contribute content.
                if not line or line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                choices = chunk.get("choices")
                if not choices:
                    continue
                piece = choices[0].get("delta", {}).get("content")
                if piece:
                    full += piece
            print(f"[{ts()}] Reply: {full[:100]}")
            record("basic stream", True, f"Got: {full[:80]}")
|
||||
|
||||
|
||||
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───

def test_toolcall_nonstream():
    """Check that the model emits a tool call for a weather question (non-streaming)."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            # BUG FIX: 'content' can be present-but-None (reasoning models);
            # .get("content", "") returns None in that case and content[:200]
            # would raise TypeError. Coerce with `or ""`.
            content = msg.get("content") or ""
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
|
||||
|
||||
|
||||
# ── 4. Tool call — streaming ────────────────────────────────

def test_toolcall_stream():
    """Check that the model emits a tool call for a weather question (streaming)."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": True,
        "max_tokens": 256,
    }

    tool_name = None
    accumulated_args = ""
    content_parts = ""

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json=payload) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            for line in r.iter_lines():
                if not line or line == "data: [DONE]" or not line.startswith("data: "):
                    continue
                try:
                    chunk = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                if not chunk.get("choices"):
                    continue
                delta = chunk["choices"][0].get("delta", {})
                # Tool-call deltas may carry the name, argument fragments, or both.
                for tc in delta.get("tool_calls") or []:
                    fn = tc.get("function", {})
                    if fn.get("name"):
                        tool_name = fn["name"]
                    if fn.get("arguments"):
                        accumulated_args += fn["arguments"]
                if delta.get("content"):
                    content_parts += delta["content"]

    if tool_name:
        print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
        record("tool call stream", True, f"Got tool call: {tool_name}")
    else:
        print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
        record("tool call stream", False, "Model did not call the tool")
|
||||
|
||||
|
||||
# ── 5. Full tool response flow (non-streaming) ──────────────

def test_tool_response_flow():
    """Two-step flow: obtain a tool call, feed back a tool result, check it is used.

    Passes only when the model's final answer echoes the injected
    temperature ("22"), proving the tool message was actually seen.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

    with make_client() as c:
        # Step 1: elicit a tool call.
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return

        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")

        # Step 2: append the assistant turn plus a synthetic tool result.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })

        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return

        # BUG FIX: .get("content", "") returns None when the key is present
        # with a null value (reasoning models) — `"22" in None` would raise
        # TypeError. Coerce with `or ""`.
        final = body2["choices"][0]["message"].get("content") or ""
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}")
|
||||
|
||||
|
||||
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────

def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.

    Only failures are recorded; params the endpoint accepts just log a ✓.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")

    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }

    # Params that OpenClaw/vLLM might send — some SGLang rejects.
    # An entry whose name contains "+" carries a dict of several params
    # that must be merged into the request together (e.g. logprobs needs
    # top_logprobs alongside it).
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
        ("top_logprobs", 5),
    ]

    with make_client() as c:
        # baseline
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")

        for name, val in extra_params:
            # BUG FIX: combined entries ("a+b") used to be sent as a literal
            # "a+b" JSON key, so the combination was never actually tested.
            # Spread the dict into the request instead.
            if "+" in name and isinstance(val, dict):
                req = {**base_req, **val}
            else:
                req = {**base_req, name: val}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}")
|
||||
|
||||
|
||||
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─

def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")

    # This is the exact shape OpenClaw sends for tools with no params
    bad_parameters = {
        "type": "object",
        "properties": []  # <-- THIS is what causes the 400
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": bad_parameters,
        }
    }]

    request_body = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Search for cats"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=request_body)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code == 200:
            print(f"[{ts()}] Success — middleware fixed the bad schema")
            record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
        else:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
|
||||
|
||||
|
||||
# ── 8. Nested properties=[] in tool schema (Tool 21 bug) ────

def test_nested_bad_properties():
    """
    Reproduce the exact Tool 21 400 error:
      schema['properties']['fields']['items']['properties'] = []

    This happens when a tool has an array-of-objects parameter where
    the items' properties field is [] instead of {}. The middleware
    must recurse into the schema to fix ALL properties fields.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Nested properties=[] in tool schema (Tool 21 bug)")
    print(f"{'='*60}")

    # This is the exact shape that causes: "Tool 21 function has invalid 'parameters' schema:
    # [] is not of type 'object' ... On schema['properties']['fields']['items']['properties']"
    bad_items = {
        "type": "object",
        "properties": []  # <-- THIS causes the 400
    }
    parameters = {
        "type": "object",
        "properties": {
            "fields": {
                "type": "array",
                "items": bad_items,
            }
        }
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "message",
            "description": "Send a message",
            "parameters": parameters,
        }
    }]

    request_body = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Send a message to Bob"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=request_body)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code == 200:
            print(f"[{ts()}] Success — middleware fixed nested properties=[] to {{}}")
            record("nested bad properties", True, "Middleware fixed nested properties.properties=[] to {}")
        else:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:500]}")
            record("nested bad properties", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
|
||||
|
||||
|
||||
# ── 9. OpenClaw full payload (chat_template_kwargs + tools) ─

def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            # BUG FIX: msg.get('content', '') is None when the key holds null
            # (reasoning models) and slicing None raises TypeError.
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────

def main():
    """Run every test in order, then print a PASS/FAIL summary."""
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{banner}")

    suite = (
        test_basic_nonstream,
        test_basic_stream,
        test_toolcall_nonstream,
        test_toolcall_stream,
        test_tool_response_flow,
        test_param_sweep,
        test_openclaw_tool_schema,
        test_nested_bad_properties,
        test_openclaw_full_payload,
    )
    for run_test in suite:
        run_test()

    print(f"\n\n{banner}")
    print("FINAL RESULTS")
    print(f"{banner}")
    for r in RESULTS:
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{banner}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,395 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test suite for vLLM GLM-5.1 streaming tool calls.
|
||||
|
||||
Reproduces the issue where long string parameters in tool calls
|
||||
are buffered entirely before being emitted during streaming.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import httpx
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Configuration - will be set via environment or direct assignment
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")  # OpenAI-compatible base URL
API_KEY = os.environ.get("VLLM_API_KEY", "none")                        # bearer token (unused on open endpoints)
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")        # served model id
|
||||
|
||||
|
||||
def timestamp():
    """Wall-clock timestamp as HH:MM:SS.mmm (millisecond precision)."""
    now = datetime.now()
    return f"{now:%H:%M:%S}.{now.microsecond // 1000:03d}"
|
||||
|
||||
|
||||
def test_streaming_tool_call_with_code():
    """
    Test streaming a tool call with a long string parameter.

    This prompts the model to generate code via a tool call,
    which should stream incrementally if the patch works correctly.

    Returns a summary dict with keys: chunks_received, tool_call_chunks,
    accumulated_length, total_time.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file. Use this to save code, text, or other content.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {
                            "type": "string",
                            "description": "Name of the file to write"
                        },
                        "content": {
                            "type": "string",
                            "description": "The content to write to the file"
                        }
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Write a Python implementation of a binary search tree with insert, search, and delete methods. Include docstrings and type hints. Save it to bst.py using the write_file tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with long string parameter")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")

    # Track streaming events
    chunks_received = []
    first_chunk_time = None
    tool_call_chunks = []
    accumulated_content = ""

    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 4096,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            print(f"[{timestamp()}] Response status: {response.status_code}")

            # BUG FIX: previously a non-200 response fell through into the
            # SSE loop, silently consumed a non-SSE error body, and the test
            # reported "INCONCLUSIVE". Bail out explicitly instead.
            if response.status_code != 200:
                error_body = "".join(response.iter_lines())
                print(f"[{timestamp()}] HTTP error body: {error_body[:300]}")
                return {
                    "chunks_received": 0,
                    "tool_call_chunks": 0,
                    "accumulated_length": 0,
                    "total_time": time.time() - start_time
                }

            for line in response.iter_lines():
                if not line or line == "data: [DONE]":
                    continue

                if line.startswith("data: "):
                    chunk_data = line[6:]
                    try:
                        chunk = json.loads(chunk_data)

                        if first_chunk_time is None:
                            first_chunk_time = time.time()
                            print(f"\n[{timestamp()}] FIRST CHUNK RECEIVED ({first_chunk_time - start_time:.3f}s)")

                        chunks_received.append(chunk)

                        # Extract delta content
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})

                            # Check for tool calls in delta
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    tc_function = tc.get("function", {})

                                    if tc_function.get("name"):
                                        print(f"\n[{timestamp()}] Tool call name: {tc_function['name']}")

                                    if tc_function.get("arguments"):
                                        args_chunk = tc_function["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk

                                        # Print progress every ~500 chars
                                        if len(accumulated_content) % 500 < len(args_chunk):
                                            print(f"[{timestamp()}] Accumulated {len(accumulated_content)} chars...")

                            # Regular content
                            if delta.get("content"):
                                print(f"[{timestamp()}] Content chunk: {delta['content'][:50]}...")

                    except json.JSONDecodeError as e:
                        print(f"[{timestamp()}] JSON decode error: {e}")

    end_time = time.time()

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total chunks received: {len(chunks_received)}")
    print(f"Total time: {end_time - start_time:.3f}s")

    if first_chunk_time:
        print(f"Time to first chunk: {first_chunk_time - start_time:.3f}s")

    if tool_call_chunks:
        print(f"Tool call chunks: {len(tool_call_chunks)}")
        print(f"Total tool call content: {len(accumulated_content)} chars")

        # Try to parse the accumulated arguments
        print(f"\nAttempting to parse tool call arguments...")
        try:
            args = json.loads(accumulated_content)
            print(f"Successfully parsed!")
            print(f" - filename: {args.get('filename', 'N/A')}")
            print(f" - content length: {len(args.get('content', ''))} chars")
        except json.JSONDecodeError as e:
            print(f"Failed to parse: {e}")
            print(f"Raw accumulated content (first 500 chars):\n{accumulated_content[:500]}")

    # Verdict: >1 argument chunk proves incremental streaming; one big
    # chunk (for a long payload) proves server-side buffering.
    print(f"\n{'='*60}")
    if len(tool_call_chunks) > 1:
        print("✓ PASS: Tool call arguments arrived in multiple chunks")
        print(f" Chunks: {len(tool_call_chunks)}, indicating incremental streaming")
    elif len(tool_call_chunks) == 1 and len(accumulated_content) > 1000:
        print("✗ FAIL: Tool call arguments arrived in a single chunk")
        print(" This indicates buffering, not true streaming")
    else:
        print("? INCONCLUSIVE: Not enough data or no tool call occurred")
    print(f"{'='*60}\n")

    return {
        "chunks_received": len(chunks_received),
        "tool_call_chunks": len(tool_call_chunks),
        "accumulated_length": len(accumulated_content),
        "total_time": end_time - start_time
    }
|
||||
|
||||
|
||||
def test_streaming_tool_call_with_json():
    """
    Test streaming a tool call that returns structured JSON data.

    Prints a PASS/FAIL verdict based on whether the tool-call arguments
    arrived in multiple incremental chunks.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "save_config",
                "description": "Save a configuration object",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "config": {
                            "type": "object",
                            "description": "Configuration object with many fields"
                        }
                    },
                    "required": ["config"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Create a detailed configuration for a web server with the following sections: server (host, port, ssl), logging (level, format, outputs), cache (enabled, ttl, max_size), rate_limiting (enabled, requests_per_minute, burst), cors (enabled, origins, methods, headers), security (headers, csp, hsts). Use the save_config tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with nested JSON")
    print(f"{'='*60}\n")

    tool_call_chunks = []
    accumulated_content = ""
    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 2048,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            # BUG FIX: previously there was no status check, so HTTP errors
            # silently produced "No tool call occurred". Abort explicitly.
            if response.status_code != 200:
                print(f"[{timestamp()}] HTTP {response.status_code} — aborting test")
                print(f"{'='*60}\n")
                return

            for line in response.iter_lines():
                if not line or line == "data: [DONE]":
                    continue

                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    if tc.get("function", {}).get("arguments"):
                                        args_chunk = tc["function"]["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk
                                        print(f"[{timestamp()}] Chunk {len(tool_call_chunks)}: +{len(args_chunk)} chars (total: {len(accumulated_content)})")
                    except json.JSONDecodeError:
                        pass

    end_time = time.time()

    print(f"\n{'='*60}")
    print(f"Total chunks: {len(tool_call_chunks)}, Total content: {len(accumulated_content)} chars")
    print(f"Time: {end_time - start_time:.3f}s")

    if len(tool_call_chunks) > 1:
        print("✓ PASS: Arguments streamed in multiple chunks")
    elif len(tool_call_chunks) == 1:
        print("✗ FAIL: Arguments arrived in single chunk (buffered)")
    else:
        print("? No tool call occurred")
    print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def test_non_streaming_tool_call():
    """
    Baseline test: non-streaming tool call for comparison.

    Prints the tool call (if any) and whether its JSON arguments parse.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {"type": "string"},
                        "content": {"type": "string"}
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Write a simple Python hello world and save it using the write_file tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Non-streaming tool call (baseline)")
    print(f"{'='*60}\n")

    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 1024,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()
        end_time = time.time()

        print(f"Status: {response.status_code}")
        print(f"Time: {end_time - start_time:.3f}s")

        # BUG FIX: HTTP errors used to fall through and print the misleading
        # "No tool call in response" — surface them explicitly.
        if response.status_code != 200:
            print(f"Error body: {json.dumps(result, indent=2)[:300]}")
            print(f"{'='*60}\n")
            return

        if result.get("choices"):
            message = result["choices"][0].get("message", {})
            if message.get("tool_calls"):
                for tc in message["tool_calls"]:
                    print(f"Tool: {tc['function']['name']}")
                    # BUG FIX: malformed arguments previously raised an
                    # unhandled JSONDecodeError and crashed the whole run.
                    try:
                        args = json.loads(tc["function"]["arguments"])
                    except json.JSONDecodeError as e:
                        print(f"Failed to parse arguments: {e}")
                        continue
                    print(f"Arguments parsed successfully")
                    print(f" - filename: {args.get('filename')}")
                    print(f" - content length: {len(args.get('content', ''))}")
            else:
                print("No tool call in response")

    print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def main():
    """Run a connectivity check, then the three streaming tool-call tests."""
    print("\n" + "="*60)
    print("vLLM GLM-5.1 Streaming Tool Call Tests")
    print("="*60)

    # Check API connectivity (best-effort: a failure only warns, since the
    # endpoint may not expose /health even when /v1 works).
    print(f"\nChecking API at {API_BASE}...")
    try:
        with httpx.Client(timeout=10.0) as client:
            response = client.get(f"{API_BASE.replace('/v1', '')}/health")
            print(f"Health check: {response.status_code}")
    except Exception as e:
        print(f"Warning: Could not reach API - {e}")

    # Run tests
    print("\nRunning tests...\n")

    # Test 1: Non-streaming baseline
    test_non_streaming_tool_call()

    # Test 2: Streaming with nested JSON
    test_streaming_tool_call_with_json()

    # Test 3: Main test - streaming with long code
    # (FIX: the return value was bound to an unused local; drop the binding.)
    test_streaming_tool_call_with_code()

    print("\nAll tests complete.")


if __name__ == "__main__":
    main()
|
||||
@@ -1,243 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Focused test to diagnose GLM-5.1 tool response issue.
|
||||
|
||||
The issue: Model sees tool response as blank.
|
||||
"""
|
||||
|
||||
import httpx
|
||||
import json
|
||||
|
||||
# Fixed target endpoint for this focused repro (deliberately hardcoded —
# the bug report was against this exact server/model pair).
API_BASE = "http://95.179.247.150/v1"
API_KEY = "whatever"
MODEL = "HuggingFaceTB/SmolLM3-3B"
|
||||
|
||||
|
||||
def test_simple_tool_response():
    """
    Minimal test: Send a tool response and see if the model can use it.

    Builds a fake prior turn (assistant tool call + tool result "42") and
    checks whether the model's next reply references the result.
    """

    # Simulate a conversation where a tool was called
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "test_func",
            "description": "A test function",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("=" * 60)
    print("Request messages:")
    print(json.dumps(messages, indent=2))
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        # Non-streaming to get full response
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        print("\nFull response:")
        print(json.dumps(result, indent=2))

        if result.get("choices"):
            # BUG FIX: .get("content", "") returns None when 'content' is
            # present with a null value (reasoning models), which made
            # `"42" in content` and content.lower() raise TypeError.
            content = result["choices"][0].get("message", {}).get("content") or ""
            print("\n" + "=" * 60)
            print("Model response content:")
            print(content)
            print("=" * 60)

            # Check if the tool result is referenced
            if "42" in content:
                print("\n✓ PASS: Model referenced the tool result (42)")
            else:
                print("\n✗ FAIL: Model did NOT reference the tool result (42)")

                # Check for signs the model didn't see the result
                if "don't have" in content.lower() or "cannot access" in content.lower():
                    print("✗ Model indicates it cannot see tool result")
|
||||
|
||||
|
||||
def test_without_tools_param():
    """
    Test what happens if we don't pass tools in the follow-up request.
    Some APIs need tools to be passed on every request.

    Prints the model's follow-up response and whether it referenced the
    tool result; returns nothing (diagnostic side effects only).
    """

    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    print("\n" + "=" * 60)
    print("Test WITHOUT tools param in follow-up")
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                # No tools param
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        if result.get("choices"):
            # BUGFIX: reasoning models can return "content": null (text lives
            # in a 'reasoning' field). dict.get's default only applies when the
            # key is MISSING, so .get("content", "") could still yield None and
            # crash content[:200] / the "42" in content check. Coalesce with `or`.
            content = result["choices"][0].get("message", {}).get("content") or ""
            print("Model response:", content[:200])

            if "42" in content:
                print("✓ Model referenced the tool result")
|
||||
|
||||
|
||||
def test_different_content_formats():
    """
    Test if the issue is with how content is formatted.

    Sends the same tool-result conversation twice — once with the tool
    message content as a plain string, once as an OpenAI-style content
    array — and reports whether the model referenced the result in each.
    """

    # Test 1: String content (standard)
    messages_string = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 4"
        }
    ]

    # Test 2: Content as array (OpenAI format)
    messages_array = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": [{"type": "text", "text": "The answer is 4"}]
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculator",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("\n" + "=" * 60)
    print("Test: String content vs Array content")
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        for name, msgs in [("String content", messages_string), ("Array content", messages_array)]:
            print(f"\n--- {name} ---")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": msgs,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 128,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )

            result = response.json()
            if result.get("choices"):
                # BUGFIX: "content" may be present but null (reasoning models),
                # in which case .get("content", "") returns None and both
                # content[:150] and the "4" in content check raise TypeError.
                content = result["choices"][0].get("message", {}).get("content") or ""
                print(f"Response: {content[:150]}")
                if "4" in content:
                    print("✓ Referenced tool result")
                else:
                    print("✗ Did NOT reference tool result")
|
||||
|
||||
|
||||
# Manual diagnostic entry point: run each tool-response visibility scenario
# in turn. Each test prints its own PASS/FAIL banner to stdout.
if __name__ == "__main__":
    print("GLM-5.1 Tool Response Diagnosis")
    print("=" * 60)

    test_simple_tool_response()
    test_without_tools_param()
    test_different_content_formats()
|
||||
@@ -1,463 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test for tool call response handling in GLM-5.1.
|
||||
|
||||
Tests the multi-turn flow:
|
||||
1. Send a prompt that triggers a tool call
|
||||
2. Send back the tool result
|
||||
3. Verify the model can see and use the tool response
|
||||
|
||||
This reproduces the issue where tool responses appear blank to the model.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Endpoint configuration, overridable via environment variables.
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")  # OpenAI-compatible base URL
API_KEY = os.environ.get("VLLM_API_KEY", "none")  # bearer token; "none" works for unauthenticated servers
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")  # model name sent in each request
|
||||
|
||||
|
||||
def timestamp():
    """Return the current wall-clock time formatted as 'HH:MM:SS.mmm'."""
    now = datetime.now()
    with_micros = now.strftime("%H:%M:%S.%f")
    # Trim microseconds down to milliseconds (drop the last three digits).
    return with_micros[:len(with_micros) - 3]
|
||||
|
||||
|
||||
def test_tool_call_response_flow(streaming: bool = True):
    """
    Test the full tool call -> response -> follow-up flow.

    This simulates:
    1. User asks for weather
    2. Model calls get_weather tool
    3. We send back the weather data
    4. Model should see and use that data

    Args:
        streaming: when True, both the initial request and the follow-up use
            SSE streaming; otherwise plain POSTs are used.

    Returns:
        A dict: {"success": False, "reason": "no_tool_call"} if the model never
        called the tool, otherwise {"success": bool, "issues": list[str],
        "final_response": str} describing the follow-up turn.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g. 'New York, NY'"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    # Initial request that should trigger a tool call
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Tokyo right now?"
        }
    ]

    mode = "STREAMING" if streaming else "NON-STREAMING"
    print(f"\n{'='*60}")
    print(f"TEST: Tool call response flow ({mode})")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")

    with httpx.Client(timeout=120.0) as client:
        # Step 1: Send initial request, expect tool call
        print(f"[{timestamp()}] Step 1: Sending initial request...")

        if streaming:
            # Accumulate the tool call across SSE delta chunks: the id and
            # name typically arrive once, the arguments arrive in fragments.
            tool_calls = []
            tool_call_id = None
            tool_call_name = None
            accumulated_args = ""

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            ) as response:
                print(f"[{timestamp()}] Response status: {response.status_code}")

                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue

                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})

                                if delta.get("tool_calls"):
                                    for tc in delta["tool_calls"]:
                                        # (removed unused `idx = tc.get("index", 0)`;
                                        # this test only tracks a single tool call)
                                        if tc.get("id"):
                                            tool_call_id = tc["id"]

                                        if tc.get("function", {}).get("name"):
                                            tool_call_name = tc["function"]["name"]
                                            print(f"[{timestamp()}] Tool call: {tool_call_name}")

                                        if tc.get("function", {}).get("arguments"):
                                            accumulated_args += tc["function"]["arguments"]

                                if delta.get("content"):
                                    print(f"[{timestamp()}] Content: {delta['content'][:100]}")

                        except json.JSONDecodeError as e:
                            print(f"[{timestamp()}] JSON error: {e}")

            # Reassemble the accumulated fragments into one tool_call dict.
            if tool_call_name:
                tool_calls.append({
                    "id": tool_call_id or "call_0",
                    "type": "function",
                    "function": {
                        "name": tool_call_name,
                        "arguments": accumulated_args
                    }
                })
        else:
            # Non-streaming
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )

            result = response.json()
            print(f"[{timestamp()}] Response status: {response.status_code}")

            tool_calls = []
            if result.get("choices"):
                message = result["choices"][0].get("message", {})
                if message.get("tool_calls"):
                    tool_calls = message["tool_calls"]
                    for tc in tool_calls:
                        print(f"[{timestamp()}] Tool call: {tc['function']['name']}")
                        print(f"[{timestamp()}] Args: {tc['function']['arguments']}")

        # Check if we got a tool call
        if not tool_calls:
            print(f"\n[{timestamp()}] No tool call received - model didn't call the tool")
            return {"success": False, "reason": "no_tool_call"}

        # Step 2: Parse tool call and prepare response
        tc = tool_calls[0]
        tc_id = tc.get("id", "call_0")
        tc_name = tc["function"]["name"]
        tc_args = json.loads(tc["function"]["arguments"])

        print(f"\n[{timestamp()}] Step 2: Tool call received")
        print(f"  Name: {tc_name}")
        print(f"  Args: {tc_args}")

        # Simulate tool execution
        tool_result = {
            "location": tc_args.get("location", "Unknown"),
            "temperature": "22°C",
            "condition": "Partly cloudy",
            "humidity": "65%",
            "wind": "15 km/h NE"
        }

        # Step 3: Send the tool response back
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps(tool_result)
        })

        print(f"\n[{timestamp()}] Step 3: Sending tool response...")
        print(f"  Tool call ID: {tc_id}")
        print(f"  Tool result: {json.dumps(tool_result, indent=2)}")

        # Step 4: Get the model's follow-up response
        if streaming:
            final_response = ""
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (streaming)...")

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": True,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            ) as response:
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue

                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})
                                if delta.get("content"):
                                    content = delta["content"]
                                    final_response += content
                                    print(f"[{timestamp()}] Content: {content}", end="", flush=True)
                        except json.JSONDecodeError:
                            pass

            print()  # newline after streaming output
        else:
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (non-streaming)...")

            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )

            result = response.json()
            final_response = ""
            if result.get("choices"):
                # BUGFIX: reasoning models can return "content": null, and
                # .get("content", "") only defaults on a MISSING key — it can
                # still yield None and make the substring/lower() checks below
                # raise TypeError. Coalesce with `or ""`.
                final_response = result["choices"][0].get("message", {}).get("content") or ""

        print(f"\n[{timestamp()}] Final response:\n{final_response}")

        # Check if the model used the tool data
        success = True
        issues = []

        # The response should mention the weather data
        # (NOTE: the second clause is redundant — "22°C" contains "22" — kept
        # for parity with the original check.)
        if "22" not in final_response and "22°C" not in final_response:
            issues.append("Temperature (22°C) not mentioned in response")
            success = False

        if "cloudy" not in final_response.lower() and "partly cloudy" not in final_response.lower():
            issues.append("Condition (Partly cloudy) not mentioned in response")
            success = False

        # Check for signs the model didn't see the data
        blank_indicators = [
            "i don't have",
            "i cannot access",
            "i'm unable to",
            "i am unable to",
            "don't have access",
            "don't have real-time",
            "cannot provide real-time"
        ]

        for indicator in blank_indicators:
            if indicator in final_response.lower():
                issues.append(f"Model seems unaware of tool result (found: '{indicator}')")
                success = False
                break

        print(f"\n{'='*60}")
        if success:
            print("✓ PASS: Model correctly used tool response data")
        else:
            print("✗ FAIL: Model did not use tool response correctly")
            for issue in issues:
                print(f"  - {issue}")
        print(f"{'='*60}\n")

        return {
            "success": success,
            "issues": issues,
            "final_response": final_response
        }
|
||||
|
||||
|
||||
def test_tool_response_with_debug_info():
    """
    Test with detailed logging to capture exactly what the model sees.

    Triggers a get_time tool call, feeds back a fixed time string, dumps the
    full messages array and raw follow-up response, then checks whether the
    model echoed the time. Returns None; all output goes to stdout.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_time",
                "description": "Get the current time",
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": []
                }
            }
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Tool response with debug info (non-streaming)")
    print(f"{'='*60}\n")

    messages = [
        {"role": "user", "content": "What time is it?"}
    ]

    with httpx.Client(timeout=120.0) as client:
        # Get tool call
        print(f"[{timestamp()}] Sending initial request...")
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        if not result.get("choices") or not result["choices"][0].get("message", {}).get("tool_calls"):
            print("No tool call - skipping test")
            return

        tool_call = result["choices"][0]["message"]["tool_calls"][0]
        tc_id = tool_call["id"]

        print(f"[{timestamp()}] Tool call: {tool_call['function']['name']}")
        print(f"[{timestamp()}] Tool call ID: {tc_id}")

        # Add tool response
        messages.append({
            "role": "assistant",
            "tool_calls": [tool_call]
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": "The current time is 3:45 PM on Thursday, April 9, 2026."
        })

        # Debug: print the full messages array we're about to send
        print(f"\n[{timestamp()}] Sending follow-up with these messages:")
        print(json.dumps(messages, indent=2))

        # Get follow-up
        response2 = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result2 = response2.json()
        print(f"\n[{timestamp()}] Full response:")
        print(json.dumps(result2, indent=2))

        if result2.get("choices"):
            # BUGFIX: "content" may be present but null (reasoning models),
            # so .get("content", "") can return None and crash the
            # `"3:45" in content` check below. Coalesce with `or ""`.
            content = result2["choices"][0].get("message", {}).get("content") or ""

            print(f"\n[{timestamp()}] Model response content: {content}")

            # Check if time is mentioned
            # (second clause is redundant — "3:45 PM" contains "3:45" —
            # kept for parity with the original check)
            if "3:45" in content or "3:45 PM" in content:
                print("\n✓ Model used the tool response (time mentioned)")
            else:
                print("\n✗ Model may not have seen the tool response (time not mentioned)")
|
||||
|
||||
|
||||
def main():
    """Run the GLM-5.1 tool-call response test sequence, printing each phase."""
    banner = "=" * 60
    print("\n" + banner)
    print("GLM-5.1 Tool Call Response Tests")
    print(banner)

    # Ordered test plan: header line paired with the callable to execute.
    # Non-streaming runs first because it is simpler to debug.
    plan = [
        ("\n--- Test 1: Non-streaming tool response flow ---",
         lambda: test_tool_call_response_flow(streaming=False)),
        ("\n--- Test 2: Streaming tool response flow ---",
         lambda: test_tool_call_response_flow(streaming=True)),
        ("\n--- Test 3: Debug info test ---",
         test_tool_response_with_debug_info),
    ]

    for header, run_test in plan:
        print(header)
        run_test()

    print("\nAll tests complete.")
|
||||
|
||||
|
||||
# Script entry point: run the full diagnostic sequence when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user