From 1beaa23c5820550e5a89a7967c3c05826ba12057 Mon Sep 17 00:00:00 2001 From: Jinx Date: Sun, 12 Apr 2026 21:59:03 +0000 Subject: [PATCH] consolidate to run_suite.py: single pluggable test suite, all models 84/84 --- .gitignore | 2 + run_suite.py | 815 +++++++++++++++++++++++++++++++++++ run_tests.sh | 23 +- test_devstral.py | 546 ----------------------- test_streaming_tool_calls.py | 395 ----------------- test_tool_diagnosis.py | 243 ----------- test_tool_response.py | 463 -------------------- 7 files changed, 826 insertions(+), 1661 deletions(-) create mode 100644 run_suite.py delete mode 100644 test_devstral.py delete mode 100644 test_streaming_tool_calls.py delete mode 100644 test_tool_diagnosis.py delete mode 100644 test_tool_response.py diff --git a/.gitignore b/.gitignore index 4c49bd7..bca6a3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ .env +models.env +__pycache__/ diff --git a/run_suite.py b/run_suite.py new file mode 100644 index 0000000..c05df09 --- /dev/null +++ b/run_suite.py @@ -0,0 +1,815 @@ +#!/usr/bin/env python3 +""" +Universal model tool-call test suite. + +Tests any OpenAI-compatible endpoint for: + 1. Basic chat (non-streaming + streaming) + 2. Tool calls (non-streaming + streaming) + 3. Multi-turn tool response flow (non-streaming + streaming) + 4. Nested/bad tool schema handling (SGLang compatibility) + 5. Streaming tool call chunking (are args actually streamed?) + 6. Param sweep (what vLLM params does the endpoint accept?) + +Handles reasoning models (content in 'reasoning' field, null 'content'), +different finish_reason values, and empty/tool_calls arrays gracefully. + +Usage: + TOOLTEST_API_BASE=... TOOLTEST_API_KEY=... TOOLTEST_MODEL=... python3 run_suite.py + python3 run_suite.py --all + python3 run_suite.py --model 1 + python3 run_suite.py --filter Devstral +""" + +import os +import sys +import json +import time +import httpx +import argparse +from datetime import datetime +from pathlib import Path +from dataclasses import dataclass, field + + +# ── Helpers ────────────────────────────────────────────────── + +def ts(): + return datetime.now().strftime("%H:%M:%S.%f")[:-3] + + +def safe_choice(body: dict, index: int = 0) -> dict: + """Safely get a choice from a response body.""" + choices = body.get("choices") or [] + if index < len(choices): + return choices[index] + return {} + + +def safe_message(body: dict) -> dict: + """Safely get the message from the first choice.""" + return safe_choice(body).get("message") or {} + + +def safe_delta(chunk: dict) -> dict: + """Safely get the delta from the first choice of a streaming chunk.""" + choices = chunk.get("choices") or [] + if choices: + return choices[0].get("delta") or {} + return {} + + +def extract_content(msg: dict) -> tuple[str, str]: + """Extract (content, reasoning) from a message, handling nulls.""" + content = msg.get("content") or "" + reasoning = msg.get("reasoning") or "" + return content, reasoning + + +# ── Config ─────────────────────────────────────────────────── + +@dataclass +class ModelConfig: + api_base: str + api_key: str + model: str + + @property + def label(self): + return self.model.split("/")[-1] + + +def load_models_env(path: Path) -> list[ModelConfig]: + """Load models from the models.env file (pipe-delimited).""" + configs = [] + for line in path.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + parts = [p.strip() for p in line.split("|")] + if len(parts) >= 3: + configs.append(ModelConfig(api_base=parts[0], api_key=parts[1], model=parts[2])) + return configs + + +def config_from_env() -> ModelConfig | None: + """Get a single config from TOOLTEST_* environment variables.""" + base = os.environ.get("TOOLTEST_API_BASE") + key = os.environ.get("TOOLTEST_API_KEY") + model = os.environ.get("TOOLTEST_MODEL") + if base and key and model: + return ModelConfig(api_base=base, api_key=key, model=model) + return None + + +# ── Test result types ──────────────────────────────────────── + +@dataclass +class TestResult: + name: str + passed: bool + detail: str = "" + duration_s: float = 0.0 + + +@dataclass +class SuiteResult: + model: str + results: list[TestResult] = field(default_factory=list) + + @property + def passed(self): + return sum(1 for r in self.results if r.passed) + + @property + def total(self): + return len(self.results) + + +def make_client(cfg: ModelConfig) -> httpx.Client: + return httpx.Client( + timeout=120.0, + headers={ + "Authorization": f"Bearer {cfg.api_key}", + "Content-Type": "application/json", + }, + ) + + +# ── Shared tool definitions ────────────────────────────────── + +WEATHER_TOOL = { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City, e.g. 'Tokyo'"} + }, + "required": ["location"] + } + } +} + +WRITE_FILE_TOOL = { + "type": "function", + "function": { + "name": "write_file", + "description": "Write content to a file.", + "parameters": { + "type": "object", + "properties": { + "filename": {"type": "string", "description": "Name of the file"}, + "content": {"type": "string", "description": "The content to write"} + }, + "required": ["filename", "content"] + } + } +} + +BAD_SCHEMA_TOOL = { + "type": "function", + "function": { + "name": "web_search", + "description": "Search the web", + "parameters": { + "type": "object", + "properties": [] # Invalid — should be {} + } + } +} + +NESTED_BAD_SCHEMA_TOOL = { + "type": "function", + "function": { + "name": "message", + "description": "Send a message", + "parameters": { + "type": "object", + "properties": { + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": [] # Invalid — should be {} + } + } + } + } + } +} + + +# ── Test functions ─────────────────────────────────────────── + +def test_basic_nonstream(cfg: ModelConfig) -> TestResult: + """1. Basic non-streaming chat.""" + with make_client(cfg) as c: + start = time.time() + try: + r = c.post(f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": [{"role": "user", "content": "Say hello in one word."}], + "stream": False, + "max_tokens": 64, + }) + body = r.json() + dur = time.time() - start + if r.status_code != 200: + return TestResult("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}", dur) + content, reasoning = extract_content(safe_message(body)) + fr = safe_choice(body).get("finish_reason", "?") + if content: + return TestResult("basic non-stream", True, f"Got: {content[:80]}", dur) + elif reasoning: + return TestResult("basic non-stream", True, f"Reasoning-only (finish: {fr}): {reasoning[:80]}", dur) + else: + return TestResult("basic non-stream", False, f"Empty response (finish: {fr})", dur) + except Exception as e: + return TestResult("basic non-stream", False, f"Exception: {e}", time.time() - start) + + +def test_basic_stream(cfg: ModelConfig) -> TestResult: + """2. Basic streaming chat.""" + with make_client(cfg) as c: + start = time.time() + try: + with c.stream("POST", f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": [{"role": "user", "content": "Count from 1 to 5."}], + "stream": True, + "max_tokens": 64, + }) as r: + if r.status_code != 200: + body = "".join(r.iter_lines()) + dur = time.time() - start + return TestResult("basic stream", False, f"HTTP {r.status_code}: {body[:200]}", dur) + full_content = "" + full_reasoning = "" + for line in r.iter_lines(): + if not line or line == "data: [DONE]": + continue + if line.startswith("data: "): + try: + chunk = json.loads(line[6:]) + delta = safe_delta(chunk) + if delta.get("content"): + full_content += delta["content"] + if delta.get("reasoning"): + full_reasoning += delta["reasoning"] + except json.JSONDecodeError: + pass + dur = time.time() - start + if full_content: + return TestResult("basic stream", True, f"Got: {full_content[:80]}", dur) + elif full_reasoning: + return TestResult("basic stream", True, f"Reasoning-only: {full_reasoning[:80]}", dur) + else: + return TestResult("basic stream", False, "No content or reasoning received", dur) + except Exception as e: + return TestResult("basic stream", False, f"Exception: {e}", time.time() - start) + + +def test_toolcall_nonstream(cfg: ModelConfig) -> TestResult: + """3. Tool call — non-streaming.""" + with make_client(cfg) as c: + start = time.time() + try: + r = c.post(f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}], + "tools": [WEATHER_TOOL], + "tool_choice": "auto", + "stream": False, + "max_tokens": 256, + }) + body = r.json() + dur = time.time() - start + if r.status_code != 200: + return TestResult("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}", dur) + msg = safe_message(body) + tool_calls = msg.get("tool_calls") or [] + if tool_calls: + tc = tool_calls[0] + fn = tc.get("function", {}) + return TestResult("tool call non-stream", True, + f"Tool: {fn.get('name','?')}, args: {fn.get('arguments','')[:60]}", dur) + else: + content, reasoning = extract_content(msg) + out = content or reasoning or "(empty)" + return TestResult("tool call non-stream", False, f"No tool call. Response: {out[:100]}", dur) + except Exception as e: + return TestResult("tool call non-stream", False, f"Exception: {e}", time.time() - start) + + +def test_toolcall_stream(cfg: ModelConfig) -> TestResult: + """4. Tool call — streaming.""" + with make_client(cfg) as c: + start = time.time() + try: + with c.stream("POST", f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}], + "tools": [WEATHER_TOOL], + "tool_choice": "auto", + "stream": True, + "max_tokens": 256, + }) as r: + if r.status_code != 200: + body = "".join(r.iter_lines()) + dur = time.time() - start + return TestResult("tool call stream", False, f"HTTP {r.status_code}", dur) + tool_name = None + accumulated_args = "" + content_parts = "" + reasoning_parts = "" + for line in r.iter_lines(): + if not line or line == "data: [DONE]": + continue + if line.startswith("data: "): + try: + chunk = json.loads(line[6:]) + delta = safe_delta(chunk) + tc_list = delta.get("tool_calls") or [] + for tc in tc_list: + fn = tc.get("function") or {} + if fn.get("name"): + tool_name = fn["name"] + if fn.get("arguments"): + accumulated_args += fn["arguments"] + if delta.get("content"): + content_parts += delta["content"] + if delta.get("reasoning"): + reasoning_parts += delta["reasoning"] + except json.JSONDecodeError: + pass + dur = time.time() - start + if tool_name: + return TestResult("tool call stream", True, + f"Tool: {tool_name}, args: {accumulated_args[:60]}", dur) + else: + out = content_parts or reasoning_parts or "(empty)" + return TestResult("tool call stream", False, f"No tool call. Response: {out[:100]}", dur) + except Exception as e: + return TestResult("tool call stream", False, f"Exception: {e}", time.time() - start) + + +def test_tool_response_flow(cfg: ModelConfig, streaming: bool = False) -> TestResult: + """5/6. Full tool call → response → follow-up flow.""" + label = "tool response flow (stream)" if streaming else "tool response flow" + with make_client(cfg) as c: + start = time.time() + try: + messages = [{"role": "user", "content": "What's the weather in Tokyo?"}] + + # Step 1: Get tool call + if not streaming: + r = c.post(f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": messages, + "tools": [WEATHER_TOOL], + "tool_choice": "auto", + "stream": False, + "max_tokens": 256, + }) + body = r.json() + if r.status_code != 200: + return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start) + msg = safe_message(body) + else: + tool_name = None + tool_id = None + accumulated_args = "" + with c.stream("POST", f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": messages, + "tools": [WEATHER_TOOL], + "tool_choice": "auto", + "stream": True, + "max_tokens": 256, + }) as r: + if r.status_code != 200: + return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start) + for line in r.iter_lines(): + if not line or line == "data: [DONE]": + continue + if line.startswith("data: "): + try: + chunk = json.loads(line[6:]) + delta = safe_delta(chunk) + for tc in (delta.get("tool_calls") or []): + if tc.get("id"): + tool_id = tc["id"] + fn = tc.get("function") or {} + if fn.get("name"): + tool_name = fn["name"] + if fn.get("arguments"): + accumulated_args += fn["arguments"] + except json.JSONDecodeError: + pass + if not tool_name: + return TestResult(label, False, "No tool call in step 1", time.time() - start) + msg = { + "role": "assistant", + "tool_calls": [{ + "id": tool_id or "call_0", + "type": "function", + "function": {"name": tool_name, "arguments": accumulated_args} + }] + } + + tool_calls = msg.get("tool_calls") or [] + if not tool_calls: + return TestResult(label, False, "No tool call in step 1", time.time() - start) + + tc = tool_calls[0] + tc_id = tc.get("id", "call_0") + + # Step 2: Send tool response + messages.append(msg) + messages.append({ + "role": "tool", + "tool_call_id": tc_id, + "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}), + }) + + # Step 3: Get follow-up + r2 = c.post(f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": messages, + "tools": [WEATHER_TOOL], + "stream": False, + "max_tokens": 256, + }) + body2 = r2.json() + dur = time.time() - start + if r2.status_code != 200: + return TestResult(label, False, f"Step 3 HTTP {r2.status_code}", dur) + + final_msg = safe_message(body2) + final_content, final_reasoning = extract_content(final_msg) + final = final_content or final_reasoning or "" + + # Check the model actually used the tool data + ok = "22" in final + indicators = ["i don't have", "i cannot access", "don't have access", "cannot provide real-time"] + for ind in indicators: + if ind in final.lower(): + ok = False + break + if not final_content and final_reasoning: + return TestResult(label, ok, f"Reasoning-only (used data: {'yes' if ok else 'no'}) — {final[:100]}", dur) + return TestResult(label, ok, f"{'Used' if ok else 'Did NOT use'} tool result — {final[:100]}", dur) + except Exception as e: + return TestResult(label, False, f"Exception: {e}", time.time() - start) + + +def test_bad_tool_schema(cfg: ModelConfig) -> TestResult: + """7. OpenClaw-style tool with properties=[] (tests schema validation/middleware).""" + with make_client(cfg) as c: + start = time.time() + try: + r = c.post(f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": [{"role": "user", "content": "Search for cats"}], + "tools": [BAD_SCHEMA_TOOL], + "tool_choice": "auto", + "stream": False, + "max_tokens": 128, + }) + body = r.json() + dur = time.time() - start + if r.status_code != 200: + err = "" + try: + err = body.get("error", {}).get("message", "")[:150] + except Exception: + err = json.dumps(body)[:150] + return TestResult("bad tool schema (properties=[])", False, f"HTTP {r.status_code}: {err}", dur) + return TestResult("bad tool schema (properties=[])", True, "Endpoint accepted/fixed bad schema", dur) + except Exception as e: + return TestResult("bad tool schema (properties=[])", False, f"Exception: {e}", time.time() - start) + + +def test_nested_bad_schema(cfg: ModelConfig) -> TestResult: + """8. Nested properties=[] inside items (the Tool 21 bug).""" + with make_client(cfg) as c: + start = time.time() + try: + r = c.post(f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": [{"role": "user", "content": "Send a message to Bob"}], + "tools": [NESTED_BAD_SCHEMA_TOOL], + "tool_choice": "auto", + "stream": False, + "max_tokens": 128, + }) + body = r.json() + dur = time.time() - start + if r.status_code != 200: + err = "" + try: + err = body.get("error", {}).get("message", "")[:150] + except Exception: + err = json.dumps(body)[:150] + return TestResult("nested bad schema (items.properties=[])", False, f"HTTP {r.status_code}: {err}", dur) + return TestResult("nested bad schema (items.properties=[])", True, "Endpoint accepted/fixed nested bad schema", dur) + except Exception as e: + return TestResult("nested bad schema (items.properties=[])", False, f"Exception: {e}", time.time() - start) + + +def test_streaming_tool_chunks(cfg: ModelConfig) -> TestResult: + """9. Streaming tool call chunking — are args actually streamed in multiple chunks?""" + with make_client(cfg) as c: + start = time.time() + try: + with c.stream("POST", f"{cfg.api_base}/chat/completions", json={ + "model": cfg.model, + "messages": [{ + "role": "user", + "content": "Write a Python hello world and save it using the write_file tool." + }], + "tools": [WRITE_FILE_TOOL], + "tool_choice": "auto", + "stream": True, + "max_tokens": 1024, + }) as r: + if r.status_code != 200: + dur = time.time() - start + return TestResult("streaming tool chunking", False, f"HTTP {r.status_code}", dur) + + tool_name = None + arg_chunks = 0 + accumulated_args = "" + content_chunks = 0 + reasoning_chunks = 0 + for line in r.iter_lines(): + if not line or line == "data: [DONE]": + continue + if line.startswith("data: "): + try: + chunk = json.loads(line[6:]) + delta = safe_delta(chunk) + for tc in (delta.get("tool_calls") or []): + fn = tc.get("function") or {} + if fn.get("name"): + tool_name = fn["name"] + if fn.get("arguments"): + arg_chunks += 1 + accumulated_args += fn["arguments"] + if delta.get("content"): + content_chunks += 1 + if delta.get("reasoning"): + reasoning_chunks += 1 + except json.JSONDecodeError: + pass + + dur = time.time() - start + if not tool_name: + if content_chunks > 0 or reasoning_chunks > 0: + return TestResult("streaming tool chunking", False, + f"No tool call — model produced {content_chunks} content + {reasoning_chunks} reasoning chunks", dur) + return TestResult("streaming tool chunking", False, "No tool call and no content", dur) + + # Evaluate chunking quality + if arg_chunks > 1: + return TestResult("streaming tool chunking", True, + f"Args streamed in {arg_chunks} chunks ({len(accumulated_args)} chars)", dur) + elif arg_chunks == 1 and len(accumulated_args) > 500: + return TestResult("streaming tool chunking", False, + f"Args in 1 chunk but {len(accumulated_args)} chars — buffered, not streamed", dur) + elif arg_chunks == 1: + return TestResult("streaming tool chunking", True, + f"Args in 1 chunk ({len(accumulated_args)} chars — may be too short to stream)", dur) + else: + return TestResult("streaming tool chunking", False, "Tool name only, no arg chunks", dur) + except Exception as e: + return TestResult("streaming tool chunking", False, f"Exception: {e}", time.time() - start) + + +def test_param_sweep(cfg: ModelConfig) -> list[TestResult]: + """10. Parameter sweep — which vLLM params does the endpoint accept?""" + results = [] + base_req = { + "model": cfg.model, + "messages": [{"role": "user", "content": "Say hi."}], + "stream": False, + "max_tokens": 32, + } + extra_params = [ + ("chat_template_kwargs", {"enable_thinking": False}), + ("guided_json", None), + ("guided_regex", None), + ("response_format", {"type": "json_object"}), + ("n", 1), + ("presence_penalty", 0.0), + ("frequency_penalty", 0.0), + ("top_p", 1.0), + ("temperature", 0.7), + ("seed", 42), + ("stop", ["\n"]), + ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}), + ] + + with make_client(cfg) as c: + for name, val in extra_params: + start = time.time() + try: + if isinstance(val, dict): + req = {**base_req, **val} + else: + req = {**base_req, name: val} + r = c.post(f"{cfg.api_base}/chat/completions", json=req) + dur = time.time() - start + ok = r.status_code == 200 + detail = f"HTTP {r.status_code}" + if not ok: + try: + detail += f": {r.json().get('error', {}).get('message', '')[:80]}" + except Exception: + pass + results.append(TestResult(f"param: {name}", ok, detail, dur)) + except Exception as e: + results.append(TestResult(f"param: {name}", False, f"Exception: {e}", time.time() - start)) + + return results + + +# ── Suite runner ───────────────────────────────────────────── + +ALL_TESTS = [ + test_basic_nonstream, + test_basic_stream, + test_toolcall_nonstream, + test_toolcall_stream, + lambda cfg: test_tool_response_flow(cfg, streaming=False), + lambda cfg: test_tool_response_flow(cfg, streaming=True), + test_bad_tool_schema, + test_nested_bad_schema, + test_streaming_tool_chunks, +] + + +def run_suite(cfg: ModelConfig, verbose: bool = True) -> SuiteResult: + """Run the full test suite against one model config.""" + result = SuiteResult(model=cfg.model) + + print(f"\n{'='*60}") + print(f"Testing: {cfg.model}") + print(f"API: {cfg.api_base}") + print(f"{'='*60}") + + for test_fn in ALL_TESTS: + name = (test_fn.__doc__ or "").strip().split("\n")[0] or test_fn.__name__ + if verbose: + print(f"\n[{ts()}] Running: {name}...") + + tr = test_fn(cfg) + if isinstance(tr, list): + result.results.extend(tr) + else: + result.results.append(tr) + + if verbose: + if isinstance(tr, list): + for r in tr: + s = "✓" if r.passed else "✗" + print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)") + else: + s = "✓" if tr.passed else "✗" + print(f" {s} {tr.name}: {tr.detail} ({tr.duration_s:.1f}s)") + + # Param sweep + if verbose: + print(f"\n[{ts()}] Running: parameter sweep...") + sweep_results = test_param_sweep(cfg) + result.results.extend(sweep_results) + if verbose: + for r in sweep_results: + s = "✓" if r.passed else "✗" + print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)") + + return result + + +def print_summary(results: list[SuiteResult]): + """Print a final summary across all models.""" + print(f"\n\n{'='*60}") + print("FINAL SUMMARY") + print(f"{'='*60}") + + for sr in results: + passed = sr.passed + total = sr.total + pct = (passed / total * 100) if total else 0 + label = sr.model.split("/")[-1] + print(f"\n {label}: {passed}/{total} passed ({pct:.0f}%)") + + for r in sr.results: + if not r.passed: + print(f" ✗ {r.name}: {r.detail[:80]}") + + # Cross-model comparison for key tests + print(f"\n{'─'*60}") + print("CROSS-MODEL COMPARISON") + print(f"{'─'*60}") + key_tests = [ + "basic non-stream", + "basic stream", + "tool call non-stream", + "tool call stream", + "tool response flow", + "tool response flow (stream)", + "streaming tool chunking", + "bad tool schema (properties=[])", + "nested bad schema (items.properties=[])", + ] + + # Calculate column width + labels = [sr.model.split("/")[-1][:18] for sr in results] + col_w = max(len(l) for l in labels) if labels else 16 + col_w = max(col_w, 16) + + header = f"{'Test':<40}" + for l in labels: + header += f" {l:>{col_w}}" + print(header) + print("─" * len(header)) + + for test_name in key_tests: + row = f"{test_name:<40}" + for sr in results: + match = [r for r in sr.results if r.name == test_name] + if match: + status = "✓" if match[0].passed else "✗" + row += f" {status:>{col_w}}" + else: + row += f" {'—':>{col_w}}" + print(row) + + print(f"\n{'='*60}") + + +# ── CLI ────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="Universal model tool-call test suite") + parser.add_argument("--all", action="store_true", help="Test all models from models.env") + parser.add_argument("--model", type=int, help="Test model by 1-based index from models.env") + parser.add_argument("--filter", type=str, help="Test models matching substring") + parser.add_argument("--quiet", action="store_true", help="Less output per test") + args = parser.parse_args() + + models_path = Path(__file__).parent / "models.env" + + configs: list[ModelConfig] = [] + + if args.all: + if not models_path.exists(): + print("ERROR: models.env not found") + sys.exit(1) + configs = load_models_env(models_path) + elif args.model: + if not models_path.exists(): + print("ERROR: models.env not found") + sys.exit(1) + all_configs = load_models_env(models_path) + if args.model < 1 or args.model > len(all_configs): + print(f"ERROR: --model index {args.model} out of range (1-{len(all_configs)})") + sys.exit(1) + configs = [all_configs[args.model - 1]] + elif args.filter: + if not models_path.exists(): + print("ERROR: models.env not found") + sys.exit(1) + all_configs = load_models_env(models_path) + configs = [c for c in all_configs if args.filter.lower() in c.model.lower()] + if not configs: + print(f"No models matching '{args.filter}'") + sys.exit(1) + else: + cfg = config_from_env() + if cfg: + configs = [cfg] + else: + print("No model specified. Use --all, --model N, --filter NAME, or set TOOLTEST_* env vars.") + if models_path.exists(): + print("\nAvailable models from models.env:") + for i, c in enumerate(load_models_env(models_path), 1): + print(f" {i}. {c.model} @ {c.api_base}") + sys.exit(1) + + all_results: list[SuiteResult] = [] + for cfg in configs: + sr = run_suite(cfg, verbose=not args.quiet) + all_results.append(sr) + + print_summary(all_results) + + if any(sr.passed < sr.total for sr in all_results): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/run_tests.sh b/run_tests.sh index 6af8dfd..af590dd 100644 --- a/run_tests.sh +++ b/run_tests.sh @@ -1,19 +1,14 @@ -#!/bin/bash -# Run the streaming tool call tests - +#!/usr/bin/env bash set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# Default values -export VLLM_API_BASE="${VLLM_API_BASE:-http://95.179.247.150/v1}" -export VLLM_API_KEY="${VLLM_API_KEY:-none}" -export VLLM_MODEL="${VLLM_MODEL:-HuggingFaceTB/SmolLM3-3B}" +# Usage: +# ./run_tests.sh # Test all models from models.env +# ./run_tests.sh --model 1 # Test model #1 +# ./run_tests.sh --filter Devstral # Test matching models +# ./run_tests.sh --all # Same as no args +# ./run_tests.sh --quiet # Less output -echo "Configuration:" -echo " API_BASE: $VLLM_API_BASE" -echo " MODEL: $VLLM_MODEL" -echo "" - -# Run the test -python3 "$SCRIPT_DIR/test_streaming_tool_calls.py" +cd "$SCRIPT_DIR" +python3 -u run_suite.py "$@" diff --git a/test_devstral.py b/test_devstral.py deleted file mode 100644 index 338babd..0000000 --- a/test_devstral.py +++ /dev/null @@ -1,546 +0,0 @@ -#!/usr/bin/env python3 -""" -Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware. - -These tests send EXACTLY what OpenClaw would send to vLLM — including -chat_template_kwargs, logprobs, weird tool schemas, the works. -The middleware's job is to strip/fix all of it so SGLang doesn't choke. - -Architecture: this test → middleware (strips bad params) → SGLang -""" - -import os -import time -import json -import httpx -from datetime import datetime -from pathlib import Path - -# Load .env if present (don't hardcode keys) -_env_file = Path(__file__).parent / ".env" -if _env_file.exists(): - for line in _env_file.read_text().splitlines(): - line = line.strip() - if not line or line.startswith("#") or "=" not in line: - continue - k, v = line.split("=", 1) - os.environ.setdefault(k.strip(), v.strip()) - -API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1") -API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever") -MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512") - -RESULTS = [] - - -def ts(): - return datetime.now().strftime("%H:%M:%S.%f")[:-3] - - -def record(name, ok, detail=""): - status = "✓ PASS" if ok else "✗ FAIL" - print(f"\n{status}: {name}") - if detail: - print(f" {detail}") - RESULTS.append({"name": name, "pass": ok, "detail": detail}) - - -def make_client(): - return httpx.Client( - timeout=120.0, - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json", - }, - ) - - -# ── 1. Basic non-streaming chat ────────────────────────────── - -def test_basic_nonstream(): - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: Basic non-streaming chat") - print(f"{'='*60}") - - with make_client() as c: - r = c.post(f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": [{"role": "user", "content": "Say hello in one word."}], - "stream": False, - "max_tokens": 32, - }) - print(f"[{ts()}] Status: {r.status_code}") - body = r.json() - if r.status_code != 200: - print(f"[{ts()}] Error: {json.dumps(body, indent=2)}") - record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}") - return - content = body["choices"][0]["message"]["content"] - print(f"[{ts()}] Reply: {content[:100]}") - record("basic non-stream", True, f"Got: {content[:80]}") - - -# ── 2. Basic streaming chat ────────────────────────────────── - -def test_basic_stream(): - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: Basic streaming chat") - print(f"{'='*60}") - - with make_client() as c: - with c.stream("POST", f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": [{"role": "user", "content": "Count from 1 to 5."}], - "stream": True, - "max_tokens": 64, - }) as r: - print(f"[{ts()}] Status: {r.status_code}") - if r.status_code != 200: - body = "".join(r.iter_lines()) - print(f"[{ts()}] Error: {body[:300]}") - record("basic stream", False, f"HTTP {r.status_code}") - return - full = "" - for line in r.iter_lines(): - if not line or line == "data: [DONE]": - continue - if line.startswith("data: "): - try: - chunk = json.loads(line[6:]) - if not chunk.get("choices"): continue - delta = chunk["choices"][0].get("delta", {}) - if delta.get("content"): - full += delta["content"] - except json.JSONDecodeError: - pass - print(f"[{ts()}] Reply: {full[:100]}") - record("basic stream", True, f"Got: {full[:80]}") - - -# ── 3. Tool call — non-streaming (vLLM-style tool schema) ─── - -def test_toolcall_nonstream(): - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)") - print(f"{'='*60}") - - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City, e.g. 'Tokyo'"} - }, - "required": ["location"] - } - } - }] - - with make_client() as c: - r = c.post(f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}], - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 256, - }) - print(f"[{ts()}] Status: {r.status_code}") - body = r.json() - if r.status_code != 200: - print(f"[{ts()}] Error: {json.dumps(body, indent=2)}") - record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}") - return - msg = body["choices"][0]["message"] - if msg.get("tool_calls"): - tc = msg["tool_calls"][0] - print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}") - record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}") - else: - content = msg.get("content", "") - print(f"[{ts()}] No tool call. Content: {content[:200]}") - record("tool call non-stream", False, "Model did not call the tool") - - -# ── 4. Tool call — streaming ──────────────────────────────── - -def test_toolcall_stream(): - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: Tool call streaming") - print(f"{'='*60}") - - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City, e.g. 'Tokyo'"} - }, - "required": ["location"] - } - } - }] - - with make_client() as c: - with c.stream("POST", f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}], - "tools": tools, - "tool_choice": "auto", - "stream": True, - "max_tokens": 256, - }) as r: - print(f"[{ts()}] Status: {r.status_code}") - if r.status_code != 200: - body = "".join(r.iter_lines()) - print(f"[{ts()}] Error: {body[:300]}") - record("tool call stream", False, f"HTTP {r.status_code}") - return - tool_name = None - accumulated_args = "" - content_parts = "" - for line in r.iter_lines(): - if not line or line == "data: [DONE]": - continue - if line.startswith("data: "): - try: - chunk = json.loads(line[6:]) - if not chunk.get("choices"): continue - delta = chunk["choices"][0].get("delta", {}) - if delta.get("tool_calls"): - for tc in delta["tool_calls"]: - if tc.get("function", {}).get("name"): - tool_name = tc["function"]["name"] - if tc.get("function", {}).get("arguments"): - accumulated_args += tc["function"]["arguments"] - if delta.get("content"): - content_parts += delta["content"] - except json.JSONDecodeError: - pass - - if tool_name: - print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}") - record("tool call stream", True, f"Got tool call: {tool_name}") - else: - print(f"[{ts()}] No tool call. Content: {content_parts[:200]}") - record("tool call stream", False, "Model did not call the tool") - - -# ── 5. Full tool response flow (non-streaming) ────────────── - -def test_tool_response_flow(): - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: Full tool response flow (non-streaming)") - print(f"{'='*60}") - - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City, e.g. 'Tokyo'"} - }, - "required": ["location"] - } - } - }] - - messages = [{"role": "user", "content": "What's the weather in Tokyo?"}] - - with make_client() as c: - r = c.post(f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 256, - }) - body = r.json() - if r.status_code != 200: - record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}") - return - msg = body["choices"][0]["message"] - if not msg.get("tool_calls"): - record("tool response flow", False, "No tool call in step 1") - return - - tc = msg["tool_calls"][0] - tc_id = tc["id"] - print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})") - - messages.append(msg) - messages.append({ - "role": "tool", - "tool_call_id": tc_id, - "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}), - }) - - r2 = c.post(f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 256, - }) - body2 = r2.json() - if r2.status_code != 200: - print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}") - record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}") - return - - final = body2["choices"][0]["message"].get("content", "") - print(f"[{ts()}] Final: {final[:200]}") - ok = "22" in final - record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}") - - -# ── 6. Param sweep — everything OpenClaw/vLLM sends ───────── - -def test_param_sweep(): - """ - Sends EVERY param that OpenClaw or vLLM might include. - The middleware must strip/fix the ones SGLang rejects. - """ - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)") - print(f"{'='*60}") - - base_req = { - "model": MODEL, - "messages": [{"role": "user", "content": "Say hi."}], - "stream": False, - "max_tokens": 32, - } - - # Params that OpenClaw/vLLM might send — some SGLang rejects - extra_params = [ - ("chat_template_kwargs", {"enable_thinking": False}), - ("guided_json", None), - ("guided_regex", None), - ("response_format", {"type": "json_object"}), - ("n", 1), - ("presence_penalty", 0.0), - ("frequency_penalty", 0.0), - ("top_p", 1.0), - ("temperature", 0.7), - ("seed", 42), - ("stop", ["\n"]), - ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}), - ("top_logprobs", 5), - ] - - with make_client() as c: - # baseline - r = c.post(f"{API_BASE}/chat/completions", json=base_req) - print(f"[{ts()}] Baseline: {r.status_code}") - - for name, val in extra_params: - req = {**base_req, name: val} - r = c.post(f"{API_BASE}/chat/completions", json=req) - status = "✓" if r.status_code == 200 else "✗" - detail = "" - if r.status_code != 200: - try: - detail = r.json().get("error", {}).get("message", "")[:100] - except Exception: - detail = r.text[:100] - print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}") - if r.status_code != 200: - record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}") - - -# ── 7. OpenClaw-style tool schema (the one that caused 400) ─ - -def test_openclaw_tool_schema(): - """ - Reproduce the exact tool schema that OpenClaw sends which has - parameters.properties = [] instead of {}. Middleware must fix it. - """ - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)") - print(f"{'='*60}") - - # This is the exact shape OpenClaw sends for tools with no params - tools = [{ - "type": "function", - "function": { - "name": "web_search", - "description": "Search the web", - "parameters": { - "type": "object", - "properties": [] # <-- THIS is what causes the 400 - } - } - }] - - with make_client() as c: - r = c.post(f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": [{"role": "user", "content": "Search for cats"}], - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 128, - }) - print(f"[{ts()}] Status: {r.status_code}") - body = r.json() - if r.status_code != 200: - print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}") - record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}") - return - print(f"[{ts()}] Success — middleware fixed the bad schema") - record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}") - - -# ── 8. Nested properties=[] in tool schema (Tool 21 bug) ──── - -def test_nested_bad_properties(): - """ - Reproduce the exact Tool 21 400 error: - schema['properties']['fields']['items']['properties'] = [] - - This happens when a tool has an array-of-objects parameter where - the items' properties field is [] instead of {}. The middleware - must recurse into the schema to fix ALL properties fields. - """ - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: Nested properties=[] in tool schema (Tool 21 bug)") - print(f"{'='*60}") - - # This is the exact shape that causes: "Tool 21 function has invalid 'parameters' schema: - # [] is not of type 'object' ... On schema['properties']['fields']['items']['properties']" - tools = [{ - "type": "function", - "function": { - "name": "message", - "description": "Send a message", - "parameters": { - "type": "object", - "properties": { - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": [] # <-- THIS causes the 400 - } - } - } - } - } - }] - - with make_client() as c: - r = c.post(f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": [{"role": "user", "content": "Send a message to Bob"}], - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 128, - }) - print(f"[{ts()}] Status: {r.status_code}") - body = r.json() - if r.status_code != 200: - print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:500]}") - record("nested bad properties", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}") - return - print(f"[{ts()}] Success — middleware fixed nested properties=[] to {{}}") - record("nested bad properties", True, "Middleware fixed nested properties.properties=[] to {}") - - -# ── 9. OpenClaw full payload (chat_template_kwargs + tools) ─ - -def test_openclaw_full_payload(): - """ - The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas. - Exactly what OpenClaw sends through the pipe. - """ - print(f"\n{'='*60}") - print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)") - print(f"{'='*60}") - - tools = [{ - "type": "function", - "function": { - "name": "web_search", - "description": "Search the web using DuckDuckGo.", - "parameters": { - "type": "object", - "properties": [] # Bad — middleware must fix - } - } - }] - - with make_client() as c: - r = c.post(f"{API_BASE}/chat/completions", json={ - "model": MODEL, - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Search for the weather in NYC"}, - ], - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 256, - "chat_template_kwargs": {"enable_thinking": False}, # Bad — middleware must strip - "logprobs": True, # Bad — middleware must strip - "top_logprobs": 5, # Bad — middleware must strip - }) - print(f"[{ts()}] Status: {r.status_code}") - body = r.json() - if r.status_code != 200: - print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}") - record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}") - return - msg = body["choices"][0]["message"] - print(f"[{ts()}] Success — middleware cleaned everything") - if msg.get("tool_calls"): - tc = msg["tool_calls"][0] - print(f"[{ts()}] Tool call: {tc['function']['name']}") - else: - print(f"[{ts()}] No tool call, content: {msg.get('content', '')[:100]}") - record("openclaw full payload", True, "Full OpenClaw payload survived the middleware") - - -# ── Main ───────────────────────────────────────────────────── - -def main(): - print(f"\n{'='*60}") - print(f"Devstral-2-123B Test Suite (vLLM-compat, via middleware)") - print(f"API: {API_BASE}") - print(f"Model: {MODEL}") - print(f"{'='*60}") - - test_basic_nonstream() - test_basic_stream() - test_toolcall_nonstream() - test_toolcall_stream() - test_tool_response_flow() - test_param_sweep() - test_openclaw_tool_schema() - test_nested_bad_properties() - test_openclaw_full_payload() - - print(f"\n\n{'='*60}") - print("FINAL RESULTS") - print(f"{'='*60}") - for r in RESULTS: - s = "✓" if r["pass"] else "✗" - print(f" {s} {r['name']}: {r['detail']}") - passed = sum(1 for r in RESULTS if r["pass"]) - print(f"\n {passed}/{len(RESULTS)} passed") - print(f"{'='*60}") - - -if __name__ == "__main__": - main() diff --git a/test_streaming_tool_calls.py b/test_streaming_tool_calls.py deleted file mode 100644 index 3bddb7b..0000000 --- a/test_streaming_tool_calls.py +++ /dev/null @@ -1,395 +0,0 @@ -#!/usr/bin/env python3 -""" -Test suite for vLLM GLM-5.1 streaming tool calls. - -Reproduces the issue where long string parameters in tool calls -are buffered entirely before being emitted during streaming. -""" - -import os -import time -import json -import httpx -from datetime import datetime - - -# Configuration - will be set via environment or direct assignment -API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1") -API_KEY = os.environ.get("VLLM_API_KEY", "none") -MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B") - - -def timestamp(): - return datetime.now().strftime("%H:%M:%S.%f")[:-3] - - -def test_streaming_tool_call_with_code(): - """ - Test streaming a tool call with a long string parameter. - - This prompts the model to generate code via a tool call, - which should stream incrementally if the patch works correctly. - """ - - tools = [ - { - "type": "function", - "function": { - "name": "write_file", - "description": "Write content to a file. Use this to save code, text, or other content.", - "parameters": { - "type": "object", - "properties": { - "filename": { - "type": "string", - "description": "Name of the file to write" - }, - "content": { - "type": "string", - "description": "The content to write to the file" - } - }, - "required": ["filename", "content"] - } - } - } - ] - - messages = [ - { - "role": "user", - "content": "Write a Python implementation of a binary search tree with insert, search, and delete methods. Include docstrings and type hints. Save it to bst.py using the write_file tool." - } - ] - - print(f"\n{'='*60}") - print(f"TEST: Streaming tool call with long string parameter") - print(f"API: {API_BASE}") - print(f"Model: {MODEL}") - print(f"{'='*60}\n") - - # Track streaming events - chunks_received = [] - first_chunk_time = None - last_chunk_time = None - tool_call_chunks = [] - accumulated_content = "" - - start_time = time.time() - - with httpx.Client(timeout=120.0) as client: - with client.stream( - "POST", - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "tool_choice": "auto", - "stream": True, - "max_tokens": 4096, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) as response: - print(f"[{timestamp()}] Response status: {response.status_code}") - - for line in response.iter_lines(): - if not line or line == "data: [DONE]": - continue - - if line.startswith("data: "): - chunk_data = line[6:] - try: - chunk = json.loads(chunk_data) - - if first_chunk_time is None: - first_chunk_time = time.time() - print(f"\n[{timestamp()}] FIRST CHUNK RECEIVED ({first_chunk_time - start_time:.3f}s)") - - last_chunk_time = time.time() - chunks_received.append(chunk) - - # Extract delta content - if chunk.get("choices"): - delta = chunk["choices"][0].get("delta", {}) - - # Check for tool calls in delta - if delta.get("tool_calls"): - for tc in delta["tool_calls"]: - tc_index = tc.get("index", 0) - tc_function = tc.get("function", {}) - - if tc_function.get("name"): - print(f"\n[{timestamp()}] Tool call name: {tc_function['name']}") - - if tc_function.get("arguments"): - args_chunk = tc_function["arguments"] - tool_call_chunks.append(args_chunk) - accumulated_content += args_chunk - - # Print progress every ~500 chars - if len(accumulated_content) % 500 < len(args_chunk): - print(f"[{timestamp()}] Accumulated {len(accumulated_content)} chars...") - - # Regular content - if delta.get("content"): - print(f"[{timestamp()}] Content chunk: {delta['content'][:50]}...") - - except json.JSONDecodeError as e: - print(f"[{timestamp()}] JSON decode error: {e}") - - end_time = time.time() - - # Summary - print(f"\n{'='*60}") - print("SUMMARY") - print(f"{'='*60}") - print(f"Total chunks received: {len(chunks_received)}") - print(f"Total time: {end_time - start_time:.3f}s") - - if first_chunk_time: - print(f"Time to first chunk: {first_chunk_time - start_time:.3f}s") - - if tool_call_chunks: - print(f"Tool call chunks: {len(tool_call_chunks)}") - print(f"Total tool call content: {len(accumulated_content)} chars") - - # Try to parse the accumulated arguments - print(f"\nAttempting to parse tool call arguments...") - try: - args = json.loads(accumulated_content) - print(f"Successfully parsed!") - print(f" - filename: {args.get('filename', 'N/A')}") - print(f" - content length: {len(args.get('content', ''))} chars") - except json.JSONDecodeError as e: - print(f"Failed to parse: {e}") - print(f"Raw accumulated content (first 500 chars):\n{accumulated_content[:500]}") - - # Verdict - print(f"\n{'='*60}") - if len(tool_call_chunks) > 1: - print("✓ PASS: Tool call arguments arrived in multiple chunks") - print(f" Chunks: {len(tool_call_chunks)}, indicating incremental streaming") - elif len(tool_call_chunks) == 1 and len(accumulated_content) > 1000: - print("✗ FAIL: Tool call arguments arrived in a single chunk") - print(" This indicates buffering, not true streaming") - else: - print("? INCONCLUSIVE: Not enough data or no tool call occurred") - print(f"{'='*60}\n") - - return { - "chunks_received": len(chunks_received), - "tool_call_chunks": len(tool_call_chunks), - "accumulated_length": len(accumulated_content), - "total_time": end_time - start_time - } - - -def test_streaming_tool_call_with_json(): - """ - Test streaming a tool call that returns structured JSON data. - """ - - tools = [ - { - "type": "function", - "function": { - "name": "save_config", - "description": "Save a configuration object", - "parameters": { - "type": "object", - "properties": { - "config": { - "type": "object", - "description": "Configuration object with many fields" - } - }, - "required": ["config"] - } - } - } - ] - - messages = [ - { - "role": "user", - "content": "Create a detailed configuration for a web server with the following sections: server (host, port, ssl), logging (level, format, outputs), cache (enabled, ttl, max_size), rate_limiting (enabled, requests_per_minute, burst), cors (enabled, origins, methods, headers), security (headers, csp, hsts). Use the save_config tool." - } - ] - - print(f"\n{'='*60}") - print(f"TEST: Streaming tool call with nested JSON") - print(f"{'='*60}\n") - - tool_call_chunks = [] - accumulated_content = "" - start_time = time.time() - - with httpx.Client(timeout=120.0) as client: - with client.stream( - "POST", - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "tool_choice": "auto", - "stream": True, - "max_tokens": 2048, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) as response: - for line in response.iter_lines(): - if not line or line == "data: [DONE]": - continue - - if line.startswith("data: "): - try: - chunk = json.loads(line[6:]) - if chunk.get("choices"): - delta = chunk["choices"][0].get("delta", {}) - if delta.get("tool_calls"): - for tc in delta["tool_calls"]: - if tc.get("function", {}).get("arguments"): - args_chunk = tc["function"]["arguments"] - tool_call_chunks.append(args_chunk) - accumulated_content += args_chunk - print(f"[{timestamp()}] Chunk {len(tool_call_chunks)}: +{len(args_chunk)} chars (total: {len(accumulated_content)})") - except json.JSONDecodeError: - pass - - end_time = time.time() - - print(f"\n{'='*60}") - print(f"Total chunks: {len(tool_call_chunks)}, Total content: {len(accumulated_content)} chars") - print(f"Time: {end_time - start_time:.3f}s") - - if len(tool_call_chunks) > 1: - print("✓ PASS: Arguments streamed in multiple chunks") - elif len(tool_call_chunks) == 1: - print("✗ FAIL: Arguments arrived in single chunk (buffered)") - else: - print("? No tool call occurred") - print(f"{'='*60}\n") - - -def test_non_streaming_tool_call(): - """ - Baseline test: non-streaming tool call for comparison. - """ - - tools = [ - { - "type": "function", - "function": { - "name": "write_file", - "description": "Write content to a file", - "parameters": { - "type": "object", - "properties": { - "filename": {"type": "string"}, - "content": {"type": "string"} - }, - "required": ["filename", "content"] - } - } - } - ] - - messages = [ - { - "role": "user", - "content": "Write a simple Python hello world and save it using the write_file tool." - } - ] - - print(f"\n{'='*60}") - print(f"TEST: Non-streaming tool call (baseline)") - print(f"{'='*60}\n") - - start_time = time.time() - - with httpx.Client(timeout=120.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 1024, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result = response.json() - end_time = time.time() - - print(f"Status: {response.status_code}") - print(f"Time: {end_time - start_time:.3f}s") - - if result.get("choices"): - message = result["choices"][0].get("message", {}) - if message.get("tool_calls"): - for tc in message["tool_calls"]: - print(f"Tool: {tc['function']['name']}") - args = json.loads(tc["function"]["arguments"]) - print(f"Arguments parsed successfully") - print(f" - filename: {args.get('filename')}") - print(f" - content length: {len(args.get('content', ''))}") - else: - print("No tool call in response") - - print(f"{'='*60}\n") - - -def main(): - print("\n" + "="*60) - print("vLLM GLM-5.1 Streaming Tool Call Tests") - print("="*60) - - # Check API connectivity - print(f"\nChecking API at {API_BASE}...") - try: - with httpx.Client(timeout=10.0) as client: - response = client.get(f"{API_BASE.replace('/v1', '')}/health") - print(f"Health check: {response.status_code}") - except Exception as e: - print(f"Warning: Could not reach API - {e}") - - # Run tests - print("\nRunning tests...\n") - - # Test 1: Non-streaming baseline - test_non_streaming_tool_call() - - # Test 2: Streaming with nested JSON - test_streaming_tool_call_with_json() - - # Test 3: Main test - streaming with long code - result = test_streaming_tool_call_with_code() - - print("\nAll tests complete.") - - -if __name__ == "__main__": - main() diff --git a/test_tool_diagnosis.py b/test_tool_diagnosis.py deleted file mode 100644 index 17fe4d0..0000000 --- a/test_tool_diagnosis.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python3 -""" -Focused test to diagnose GLM-5.1 tool response issue. - -The issue: Model sees tool response as blank. -""" - -import httpx -import json - -API_BASE = "http://95.179.247.150/v1" -API_KEY = "whatever" -MODEL = "HuggingFaceTB/SmolLM3-3B" - - -def test_simple_tool_response(): - """ - Minimal test: Send a tool response and see if the model can use it. - """ - - # Simulate a conversation where a tool was called - messages = [ - {"role": "user", "content": "Call the test function"}, - { - "role": "assistant", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "test_func", "arguments": "{}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "SUCCESS: The function returned value 42" - } - ] - - tools = [{ - "type": "function", - "function": { - "name": "test_func", - "description": "A test function", - "parameters": {"type": "object", "properties": {}} - } - }] - - print("=" * 60) - print("Request messages:") - print(json.dumps(messages, indent=2)) - print("=" * 60) - - with httpx.Client(timeout=60.0) as client: - # Non-streaming to get full response - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 256, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result = response.json() - - print("\nFull response:") - print(json.dumps(result, indent=2)) - - if result.get("choices"): - content = result["choices"][0].get("message", {}).get("content", "") - print("\n" + "=" * 60) - print("Model response content:") - print(content) - print("=" * 60) - - # Check if the tool result is referenced - if "42" in content: - print("\n✓ PASS: Model referenced the tool result (42)") - else: - print("\n✗ FAIL: Model did NOT reference the tool result (42)") - - # Check for signs the model didn't see the result - if "don't have" in content.lower() or "cannot access" in content.lower(): - print("✗ Model indicates it cannot see tool result") - - -def test_without_tools_param(): - """ - Test what happens if we don't pass tools in the follow-up request. - Some APIs need tools to be passed on every request. - """ - - messages = [ - {"role": "user", "content": "Call the test function"}, - { - "role": "assistant", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "test_func", "arguments": "{}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "SUCCESS: The function returned value 42" - } - ] - - print("\n" + "=" * 60) - print("Test WITHOUT tools param in follow-up") - print("=" * 60) - - with httpx.Client(timeout=60.0) as client: - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - # No tools param - "stream": False, - "max_tokens": 256, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result = response.json() - - if result.get("choices"): - content = result["choices"][0].get("message", {}).get("content", "") - print("Model response:", content[:200]) - - if "42" in content: - print("✓ Model referenced the tool result") - - -def test_different_content_formats(): - """ - Test if the issue is with how content is formatted. - """ - - # Test 1: String content (standard) - messages_string = [ - {"role": "user", "content": "What is 2+2?"}, - { - "role": "assistant", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "calc", "arguments": "{}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": "The answer is 4" - } - ] - - # Test 2: Content as array (OpenAI format) - messages_array = [ - {"role": "user", "content": "What is 2+2?"}, - { - "role": "assistant", - "tool_calls": [{ - "id": "call_123", - "type": "function", - "function": {"name": "calc", "arguments": "{}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_123", - "content": [{"type": "text", "text": "The answer is 4"}] - } - ] - - tools = [{ - "type": "function", - "function": { - "name": "calc", - "description": "Calculator", - "parameters": {"type": "object", "properties": {}} - } - }] - - print("\n" + "=" * 60) - print("Test: String content vs Array content") - print("=" * 60) - - with httpx.Client(timeout=60.0) as client: - for name, msgs in [("String content", messages_string), ("Array content", messages_array)]: - print(f"\n--- {name} ---") - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": msgs, - "tools": tools, - "stream": False, - "max_tokens": 128, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result = response.json() - if result.get("choices"): - content = result["choices"][0].get("message", {}).get("content", "") - print(f"Response: {content[:150]}") - if "4" in content: - print("✓ Referenced tool result") - else: - print("✗ Did NOT reference tool result") - - -if __name__ == "__main__": - print("GLM-5.1 Tool Response Diagnosis") - print("=" * 60) - - test_simple_tool_response() - test_without_tools_param() - test_different_content_formats() diff --git a/test_tool_response.py b/test_tool_response.py deleted file mode 100644 index b3853ab..0000000 --- a/test_tool_response.py +++ /dev/null @@ -1,463 +0,0 @@ -#!/usr/bin/env python3 -""" -Test for tool call response handling in GLM-5.1. - -Tests the multi-turn flow: -1. Send a prompt that triggers a tool call -2. Send back the tool result -3. Verify the model can see and use the tool response - -This reproduces the issue where tool responses appear blank to the model. -""" - -import os -import json -import httpx -from datetime import datetime - - -API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1") -API_KEY = os.environ.get("VLLM_API_KEY", "none") -MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B") - - -def timestamp(): - return datetime.now().strftime("%H:%M:%S.%f")[:-3] - - -def test_tool_call_response_flow(streaming: bool = True): - """ - Test the full tool call -> response -> follow-up flow. - - This simulates: - 1. User asks for weather - 2. Model calls get_weather tool - 3. We send back the weather data - 4. Model should see and use that data - """ - - tools = [ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "City and state, e.g. 'New York, NY'" - } - }, - "required": ["location"] - } - } - } - ] - - # Initial request that should trigger a tool call - messages = [ - { - "role": "user", - "content": "What's the weather like in Tokyo right now?" - } - ] - - mode = "STREAMING" if streaming else "NON-STREAMING" - print(f"\n{'='*60}") - print(f"TEST: Tool call response flow ({mode})") - print(f"API: {API_BASE}") - print(f"Model: {MODEL}") - print(f"{'='*60}\n") - - with httpx.Client(timeout=120.0) as client: - # Step 1: Send initial request, expect tool call - print(f"[{timestamp()}] Step 1: Sending initial request...") - - if streaming: - tool_calls = [] - tool_call_id = None - tool_call_name = None - accumulated_args = "" - - with client.stream( - "POST", - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "tool_choice": "auto", - "stream": True, - "max_tokens": 512, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) as response: - print(f"[{timestamp()}] Response status: {response.status_code}") - - for line in response.iter_lines(): - if not line or line == "data: [DONE]": - continue - - if line.startswith("data: "): - try: - chunk = json.loads(line[6:]) - if chunk.get("choices"): - delta = chunk["choices"][0].get("delta", {}) - - if delta.get("tool_calls"): - for tc in delta["tool_calls"]: - idx = tc.get("index", 0) - - if tc.get("id"): - tool_call_id = tc["id"] - - if tc.get("function", {}).get("name"): - tool_call_name = tc["function"]["name"] - print(f"[{timestamp()}] Tool call: {tool_call_name}") - - if tc.get("function", {}).get("arguments"): - accumulated_args += tc["function"]["arguments"] - - if delta.get("content"): - print(f"[{timestamp()}] Content: {delta['content'][:100]}") - - except json.JSONDecodeError as e: - print(f"[{timestamp()}] JSON error: {e}") - - if tool_call_name: - tool_calls.append({ - "id": tool_call_id or "call_0", - "type": "function", - "function": { - "name": tool_call_name, - "arguments": accumulated_args - } - }) - else: - # Non-streaming - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 512, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result = response.json() - print(f"[{timestamp()}] Response status: {response.status_code}") - - tool_calls = [] - if result.get("choices"): - message = result["choices"][0].get("message", {}) - if message.get("tool_calls"): - tool_calls = message["tool_calls"] - for tc in tool_calls: - print(f"[{timestamp()}] Tool call: {tc['function']['name']}") - print(f"[{timestamp()}] Args: {tc['function']['arguments']}") - - # Check if we got a tool call - if not tool_calls: - print(f"\n[{timestamp()}] No tool call received - model didn't call the tool") - return {"success": False, "reason": "no_tool_call"} - - # Step 2: Parse tool call and prepare response - tc = tool_calls[0] - tc_id = tc.get("id", "call_0") - tc_name = tc["function"]["name"] - tc_args = json.loads(tc["function"]["arguments"]) - - print(f"\n[{timestamp()}] Step 2: Tool call received") - print(f" Name: {tc_name}") - print(f" Args: {tc_args}") - - # Simulate tool execution - tool_result = { - "location": tc_args.get("location", "Unknown"), - "temperature": "22°C", - "condition": "Partly cloudy", - "humidity": "65%", - "wind": "15 km/h NE" - } - - # Step 3: Send the tool response back - messages.append({ - "role": "assistant", - "tool_calls": tool_calls - }) - messages.append({ - "role": "tool", - "tool_call_id": tc_id, - "content": json.dumps(tool_result) - }) - - print(f"\n[{timestamp()}] Step 3: Sending tool response...") - print(f" Tool call ID: {tc_id}") - print(f" Tool result: {json.dumps(tool_result, indent=2)}") - - # Step 4: Get the model's follow-up response - if streaming: - final_response = "" - print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (streaming)...") - - with client.stream( - "POST", - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": True, - "max_tokens": 512, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) as response: - for line in response.iter_lines(): - if not line or line == "data: [DONE]": - continue - - if line.startswith("data: "): - try: - chunk = json.loads(line[6:]) - if chunk.get("choices"): - delta = chunk["choices"][0].get("delta", {}) - if delta.get("content"): - content = delta["content"] - final_response += content - print(f"[{timestamp()}] Content: {content}", end="", flush=True) - except json.JSONDecodeError: - pass - - print() # newline after streaming output - else: - print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (non-streaming)...") - - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 512, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result = response.json() - final_response = "" - if result.get("choices"): - final_response = result["choices"][0].get("message", {}).get("content", "") - - print(f"\n[{timestamp()}] Final response:\n{final_response}") - - # Check if the model used the tool data - success = True - issues = [] - - # The response should mention the weather data - if "22" not in final_response and "22°C" not in final_response: - issues.append("Temperature (22°C) not mentioned in response") - success = False - - if "cloudy" not in final_response.lower() and "partly cloudy" not in final_response.lower(): - issues.append("Condition (Partly cloudy) not mentioned in response") - success = False - - # Check for signs the model didn't see the data - blank_indicators = [ - "i don't have", - "i cannot access", - "i'm unable to", - "i am unable to", - "don't have access", - "don't have real-time", - "cannot provide real-time" - ] - - for indicator in blank_indicators: - if indicator in final_response.lower(): - issues.append(f"Model seems unaware of tool result (found: '{indicator}')") - success = False - break - - print(f"\n{'='*60}") - if success: - print("✓ PASS: Model correctly used tool response data") - else: - print("✗ FAIL: Model did not use tool response correctly") - for issue in issues: - print(f" - {issue}") - print(f"{'='*60}\n") - - return { - "success": success, - "issues": issues, - "final_response": final_response - } - - -def test_tool_response_with_debug_info(): - """ - Test with detailed logging to capture exactly what the model sees. - """ - - tools = [ - { - "type": "function", - "function": { - "name": "get_time", - "description": "Get the current time", - "parameters": { - "type": "object", - "properties": {}, - "required": [] - } - } - } - ] - - print(f"\n{'='*60}") - print(f"TEST: Tool response with debug info (non-streaming)") - print(f"{'='*60}\n") - - messages = [ - {"role": "user", "content": "What time is it?"} - ] - - with httpx.Client(timeout=120.0) as client: - # Get tool call - print(f"[{timestamp()}] Sending initial request...") - response = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "tool_choice": "auto", - "stream": False, - "max_tokens": 256, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result = response.json() - - if not result.get("choices") or not result["choices"][0].get("message", {}).get("tool_calls"): - print("No tool call - skipping test") - return - - tool_call = result["choices"][0]["message"]["tool_calls"][0] - tc_id = tool_call["id"] - - print(f"[{timestamp()}] Tool call: {tool_call['function']['name']}") - print(f"[{timestamp()}] Tool call ID: {tc_id}") - - # Add tool response - messages.append({ - "role": "assistant", - "tool_calls": [tool_call] - }) - messages.append({ - "role": "tool", - "tool_call_id": tc_id, - "content": "The current time is 3:45 PM on Thursday, April 9, 2026." - }) - - # Debug: print the full messages array we're about to send - print(f"\n[{timestamp()}] Sending follow-up with these messages:") - print(json.dumps(messages, indent=2)) - - # Get follow-up - response2 = client.post( - f"{API_BASE}/chat/completions", - headers={ - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json" - }, - json={ - "model": MODEL, - "messages": messages, - "tools": tools, - "stream": False, - "max_tokens": 256, - "chat_template_kwargs": {"enable_thinking": False}, - "logprobs": True, - "top_logprobs": 5 - } - ) - - result2 = response2.json() - print(f"\n[{timestamp()}] Full response:") - print(json.dumps(result2, indent=2)) - - if result2.get("choices"): - content = result2["choices"][0].get("message", {}).get("content", "") - - print(f"\n[{timestamp()}] Model response content: {content}") - - # Check if time is mentioned - if "3:45" in content or "3:45 PM" in content: - print("\n✓ Model used the tool response (time mentioned)") - else: - print("\n✗ Model may not have seen the tool response (time not mentioned)") - - -def main(): - print("\n" + "="*60) - print("GLM-5.1 Tool Call Response Tests") - print("="*60) - - # Test non-streaming first (simpler to debug) - print("\n--- Test 1: Non-streaming tool response flow ---") - test_tool_call_response_flow(streaming=False) - - # Test streaming - print("\n--- Test 2: Streaming tool response flow ---") - test_tool_call_response_flow(streaming=True) - - # Debug test - print("\n--- Test 3: Debug info test ---") - test_tool_response_with_debug_info() - - print("\nAll tests complete.") - - -if __name__ == "__main__": - main()