consolidate to run_suite.py: single pluggable test suite, all models 84/84

This commit is contained in:
Jinx
2026-04-12 21:59:03 +00:00
parent 2fa811b2e2
commit 1beaa23c58
7 changed files with 826 additions and 1661 deletions

2
.gitignore vendored
View File

@@ -1 +1,3 @@
.env
models.env
__pycache__/

815
run_suite.py Normal file
View File

@@ -0,0 +1,815 @@
#!/usr/bin/env python3
"""
Universal model tool-call test suite.
Tests any OpenAI-compatible endpoint for:
1. Basic chat (non-streaming + streaming)
2. Tool calls (non-streaming + streaming)
3. Multi-turn tool response flow (non-streaming + streaming)
4. Nested/bad tool schema handling (SGLang compatibility)
5. Streaming tool call chunking (are args actually streamed?)
6. Param sweep (what vLLM params does the endpoint accept?)
Handles reasoning models (content in 'reasoning' field, null 'content'),
different finish_reason values, and empty/tool_calls arrays gracefully.
Usage:
TOOLTEST_API_BASE=... TOOLTEST_API_KEY=... TOOLTEST_MODEL=... python3 run_suite.py
python3 run_suite.py --all
python3 run_suite.py --model 1
python3 run_suite.py --filter Devstral
"""
import os
import sys
import json
import time
import httpx
import argparse
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field
# ── Helpers ──────────────────────────────────────────────────
def ts():
    """Return the current local time as HH:MM:SS.mmm (millisecond precision)."""
    stamp = datetime.now()
    return stamp.strftime("%H:%M:%S.%f")[:-3]
def safe_choice(body: dict, index: int = 0) -> dict:
    """Return ``body["choices"][index]``, or ``{}`` when absent/out of range."""
    available = body.get("choices") or []
    return available[index] if index < len(available) else {}
def safe_message(body: dict) -> dict:
    """Return the ``message`` dict of the first choice, or ``{}`` when missing."""
    # Inlined first-choice lookup: tolerate absent/null "choices" and "message".
    first_choices = body.get("choices") or []
    if not first_choices:
        return {}
    return first_choices[0].get("message") or {}
def safe_delta(chunk: dict) -> dict:
    """Return the ``delta`` of a streaming chunk's first choice, or ``{}``."""
    for first in chunk.get("choices") or []:
        # Only the first choice matters; bail out immediately.
        return first.get("delta") or {}
    return {}
def extract_content(msg: dict) -> tuple[str, str]:
    """Extract ``(content, reasoning)`` from a message, coercing nulls to ''."""
    return (msg.get("content") or "", msg.get("reasoning") or "")
# ── Config ───────────────────────────────────────────────────
@dataclass
class ModelConfig:
    """Connection details for one OpenAI-compatible endpoint."""
    api_base: str  # e.g. "http://host/v1"
    api_key: str   # bearer token
    model: str     # full model id, possibly "org/name"

    @property
    def label(self) -> str:
        """Short display name: the final path segment of the model id."""
        return self.model.rsplit("/", 1)[-1]
def load_models_env(path: Path) -> list[ModelConfig]:
    """Load models from the models.env file (pipe-delimited).

    Each non-blank, non-comment line is ``api_base | api_key | model``.
    Lines with fewer than three fields are skipped silently.
    """
    loaded: list[ModelConfig] = []
    for raw in path.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        fields = [piece.strip() for piece in entry.split("|")]
        if len(fields) < 3:
            continue
        loaded.append(ModelConfig(api_base=fields[0], api_key=fields[1], model=fields[2]))
    return loaded
def config_from_env() -> ModelConfig | None:
    """Build a single config from TOOLTEST_* env vars, or None if any is unset."""
    base = os.environ.get("TOOLTEST_API_BASE")
    key = os.environ.get("TOOLTEST_API_KEY")
    model = os.environ.get("TOOLTEST_MODEL")
    if not (base and key and model):
        return None
    return ModelConfig(api_base=base, api_key=key, model=model)
# ── Test result types ────────────────────────────────────────
@dataclass
class TestResult:
    """Outcome of one individual test against one endpoint."""
    name: str                # short test label, e.g. "basic non-stream"
    passed: bool             # True when the test succeeded
    detail: str = ""         # human-readable explanation / response excerpt
    duration_s: float = 0.0  # wall-clock duration of the test in seconds
@dataclass
class SuiteResult:
    """Aggregated test results for one model."""
    model: str  # full model id the suite ran against
    results: list[TestResult] = field(default_factory=list)

    @property
    def passed(self) -> int:
        """Count of passing tests."""
        return len([outcome for outcome in self.results if outcome.passed])

    @property
    def total(self) -> int:
        """Total number of recorded tests."""
        return len(self.results)
def make_client(cfg: ModelConfig) -> httpx.Client:
    """Build an httpx client with bearer auth and a generous 120 s timeout."""
    auth_headers = {
        "Authorization": f"Bearer {cfg.api_key}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=auth_headers)
# ── Shared tool definitions ──────────────────────────────────
# Well-formed single-parameter tool used by the basic tool-call tests.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
            },
            "required": ["location"]
        }
    }
}
# Two-parameter tool with a long string argument — used to check whether
# streaming endpoints actually chunk tool-call arguments.
WRITE_FILE_TOOL = {
    "type": "function",
    "function": {
        "name": "write_file",
        "description": "Write content to a file.",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {"type": "string", "description": "Name of the file"},
                "content": {"type": "string", "description": "The content to write"}
            },
            "required": ["filename", "content"]
        }
    }
}
# Deliberately malformed schema (properties is a list, not an object) —
# exercises endpoint/middleware schema validation.
BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Search the web",
        "parameters": {
            "type": "object",
            "properties": []  # Invalid — should be {}
        }
    }
}
# Same malformation, but nested inside an array item schema.
NESTED_BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "message",
        "description": "Send a message",
        "parameters": {
            "type": "object",
            "properties": {
                "fields": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": []  # Invalid — should be {}
                    }
                }
            }
        }
    }
}
# ── Test functions ───────────────────────────────────────────
def test_basic_nonstream(cfg: ModelConfig) -> TestResult:
    """1. Basic non-streaming chat."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Say hello in one word."}],
                "stream": False,
                "max_tokens": 64,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code != 200:
                return TestResult("basic non-stream", False,
                                  f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", elapsed)
            content, reasoning = extract_content(safe_message(body))
            fr = safe_choice(body).get("finish_reason", "?")
            # Prefer normal content; accept reasoning-only replies; fail on empty.
            if content:
                return TestResult("basic non-stream", True, f"Got: {content[:80]}", elapsed)
            if reasoning:
                return TestResult("basic non-stream", True,
                                  f"Reasoning-only (finish: {fr}): {reasoning[:80]}", elapsed)
            return TestResult("basic non-stream", False, f"Empty response (finish: {fr})", elapsed)
        except Exception as e:
            return TestResult("basic non-stream", False, f"Exception: {e}", time.time() - start)
def test_basic_stream(cfg: ModelConfig) -> TestResult:
    """2. Basic streaming chat.

    Consumes the SSE stream and accumulates both 'content' and 'reasoning'
    deltas; passes when either is non-empty.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Count from 1 to 5."}],
                "stream": True,
                "max_tokens": 64,
            }) as r:
                if r.status_code != 200:
                    # Drain the error body for the failure detail.
                    body = "".join(r.iter_lines())
                    dur = time.time() - start
                    return TestResult("basic stream", False, f"HTTP {r.status_code}: {body[:200]}", dur)
                full_content = ""
                full_reasoning = ""
                for line in r.iter_lines():
                    # Skip keep-alives and the SSE terminator.
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            if delta.get("content"):
                                full_content += delta["content"]
                            if delta.get("reasoning"):
                                full_reasoning += delta["reasoning"]
                        except json.JSONDecodeError:
                            pass  # ignore malformed SSE payloads
                dur = time.time() - start
                if full_content:
                    return TestResult("basic stream", True, f"Got: {full_content[:80]}", dur)
                elif full_reasoning:
                    return TestResult("basic stream", True, f"Reasoning-only: {full_reasoning[:80]}", dur)
                else:
                    return TestResult("basic stream", False, "No content or reasoning received", dur)
        except Exception as e:
            return TestResult("basic stream", False, f"Exception: {e}", time.time() - start)
def test_toolcall_nonstream(cfg: ModelConfig) -> TestResult:
    """3. Tool call — non-streaming."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code != 200:
                return TestResult("tool call non-stream", False,
                                  f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", elapsed)
            msg = safe_message(body)
            calls = msg.get("tool_calls") or []
            if not calls:
                # Model answered in prose (or reasoning) instead of calling the tool.
                content, reasoning = extract_content(msg)
                out = content or reasoning or "(empty)"
                return TestResult("tool call non-stream", False,
                                  f"No tool call. Response: {out[:100]}", elapsed)
            fn = calls[0].get("function", {})
            return TestResult("tool call non-stream", True,
                              f"Tool: {fn.get('name','?')}, args: {fn.get('arguments','')[:60]}", elapsed)
        except Exception as e:
            return TestResult("tool call non-stream", False, f"Exception: {e}", time.time() - start)
def test_toolcall_stream(cfg: ModelConfig) -> TestResult:
    """4. Tool call — streaming.

    Reassembles the tool call from SSE deltas: the name arrives once,
    arguments arrive as string fragments that must be concatenated.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 256,
            }) as r:
                if r.status_code != 200:
                    body = "".join(r.iter_lines())  # drained but not reported below
                    dur = time.time() - start
                    return TestResult("tool call stream", False, f"HTTP {r.status_code}", dur)
                tool_name = None
                accumulated_args = ""
                content_parts = ""
                reasoning_parts = ""
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            tc_list = delta.get("tool_calls") or []
                            for tc in tc_list:
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_parts += delta["content"]
                            if delta.get("reasoning"):
                                reasoning_parts += delta["reasoning"]
                        except json.JSONDecodeError:
                            pass  # ignore malformed SSE payloads
                dur = time.time() - start
                if tool_name:
                    return TestResult("tool call stream", True,
                                      f"Tool: {tool_name}, args: {accumulated_args[:60]}", dur)
                else:
                    out = content_parts or reasoning_parts or "(empty)"
                    return TestResult("tool call stream", False, f"No tool call. Response: {out[:100]}", dur)
        except Exception as e:
            return TestResult("tool call stream", False, f"Exception: {e}", time.time() - start)
def test_tool_response_flow(cfg: ModelConfig, streaming: bool = False) -> TestResult:
    """5/6. Full tool call → response → follow-up flow.

    Step 1 requests a tool call (streamed or not, per *streaming*);
    step 2 appends a fabricated tool result; step 3 asks for the follow-up
    (always non-streaming) and checks the model actually used the result.
    """
    label = "tool response flow (stream)" if streaming else "tool response flow"
    with make_client(cfg) as c:
        start = time.time()
        try:
            messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
            # Step 1: Get tool call
            if not streaming:
                r = c.post(f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 256,
                })
                body = r.json()
                if r.status_code != 200:
                    return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                msg = safe_message(body)
            else:
                tool_name = None
                tool_id = None
                accumulated_args = ""
                with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 256,
                }) as r:
                    if r.status_code != 200:
                        return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                    for line in r.iter_lines():
                        if not line or line == "data: [DONE]":
                            continue
                        if line.startswith("data: "):
                            try:
                                chunk = json.loads(line[6:])
                                delta = safe_delta(chunk)
                                for tc in (delta.get("tool_calls") or []):
                                    if tc.get("id"):
                                        tool_id = tc["id"]
                                    fn = tc.get("function") or {}
                                    if fn.get("name"):
                                        tool_name = fn["name"]
                                    if fn.get("arguments"):
                                        accumulated_args += fn["arguments"]
                            except json.JSONDecodeError:
                                pass  # ignore malformed SSE payloads
                if not tool_name:
                    return TestResult(label, False, "No tool call in step 1", time.time() - start)
                # Rebuild an assistant message from the streamed fragments so
                # step 3 can replay it as conversation history.
                msg = {
                    "role": "assistant",
                    "tool_calls": [{
                        "id": tool_id or "call_0",
                        "type": "function",
                        "function": {"name": tool_name, "arguments": accumulated_args}
                    }]
                }
            tool_calls = msg.get("tool_calls") or []
            if not tool_calls:
                return TestResult(label, False, "No tool call in step 1", time.time() - start)
            tc = tool_calls[0]
            tc_id = tc.get("id", "call_0")
            # Step 2: Send tool response (fabricated weather data)
            messages.append(msg)
            messages.append({
                "role": "tool",
                "tool_call_id": tc_id,
                "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
            })
            # Step 3: Get follow-up (non-streaming even in streaming mode)
            r2 = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": messages,
                "tools": [WEATHER_TOOL],
                "stream": False,
                "max_tokens": 256,
            })
            body2 = r2.json()
            dur = time.time() - start
            if r2.status_code != 200:
                return TestResult(label, False, f"Step 3 HTTP {r2.status_code}", dur)
            final_msg = safe_message(body2)
            final_content, final_reasoning = extract_content(final_msg)
            final = final_content or final_reasoning or ""
            # Check the model actually used the tool data: "22" must appear
            # and no "can't access real-time data" style disclaimer.
            ok = "22" in final
            indicators = ["i don't have", "i cannot access", "don't have access", "cannot provide real-time"]
            for ind in indicators:
                if ind in final.lower():
                    ok = False
                    break
            if not final_content and final_reasoning:
                return TestResult(label, ok, f"Reasoning-only (used data: {'yes' if ok else 'no'}) — {final[:100]}", dur)
            return TestResult(label, ok, f"{'Used' if ok else 'Did NOT use'} tool result — {final[:100]}", dur)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
def test_bad_tool_schema(cfg: ModelConfig) -> TestResult:
    """7. OpenClaw-style tool with properties=[] (tests schema validation/middleware)."""
    label = "bad tool schema (properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Search for cats"}],
                "tools": [BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code == 200:
                # Passing means the endpoint (or its middleware) tolerated
                # the malformed schema instead of rejecting the request.
                return TestResult(label, True, "Endpoint accepted/fixed bad schema", elapsed)
            err = ""
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(label, False, f"HTTP {resp.status_code}: {err}", elapsed)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
def test_nested_bad_schema(cfg: ModelConfig) -> TestResult:
    """8. Nested properties=[] inside items (the Tool 21 bug)."""
    label = "nested bad schema (items.properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Send a message to Bob"}],
                "tools": [NESTED_BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            elapsed = time.time() - start
            if resp.status_code == 200:
                return TestResult(label, True, "Endpoint accepted/fixed nested bad schema", elapsed)
            err = ""
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(label, False, f"HTTP {resp.status_code}: {err}", elapsed)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
def test_streaming_tool_chunks(cfg: ModelConfig) -> TestResult:
    """9. Streaming tool call chunking — are args actually streamed in multiple chunks?

    Uses WRITE_FILE_TOOL (long string argument) so a genuinely streaming
    endpoint has to emit the arguments across several deltas; counting the
    argument fragments distinguishes streaming from server-side buffering.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{
                    "role": "user",
                    "content": "Write a Python hello world and save it using the write_file tool."
                }],
                "tools": [WRITE_FILE_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 1024,
            }) as r:
                if r.status_code != 200:
                    dur = time.time() - start
                    return TestResult("streaming tool chunking", False, f"HTTP {r.status_code}", dur)
                tool_name = None
                arg_chunks = 0          # how many deltas carried argument text
                accumulated_args = ""
                content_chunks = 0
                reasoning_chunks = 0
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            for tc in (delta.get("tool_calls") or []):
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    arg_chunks += 1
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_chunks += 1
                            if delta.get("reasoning"):
                                reasoning_chunks += 1
                        except json.JSONDecodeError:
                            pass  # ignore malformed SSE payloads
                dur = time.time() - start
                if not tool_name:
                    if content_chunks > 0 or reasoning_chunks > 0:
                        return TestResult("streaming tool chunking", False,
                                          f"No tool call — model produced {content_chunks} content + {reasoning_chunks} reasoning chunks", dur)
                    return TestResult("streaming tool chunking", False, "No tool call and no content", dur)
                # Evaluate chunking quality: >1 chunk = streamed; 1 big chunk =
                # buffered; 1 small chunk = inconclusive (args may be too short).
                if arg_chunks > 1:
                    return TestResult("streaming tool chunking", True,
                                      f"Args streamed in {arg_chunks} chunks ({len(accumulated_args)} chars)", dur)
                elif arg_chunks == 1 and len(accumulated_args) > 500:
                    return TestResult("streaming tool chunking", False,
                                      f"Args in 1 chunk but {len(accumulated_args)} chars — buffered, not streamed", dur)
                elif arg_chunks == 1:
                    return TestResult("streaming tool chunking", True,
                                      f"Args in 1 chunk ({len(accumulated_args)} chars — may be too short to stream)", dur)
                else:
                    return TestResult("streaming tool chunking", False, "Tool name only, no arg chunks", dur)
        except Exception as e:
            return TestResult("streaming tool chunking", False, f"Exception: {e}", time.time() - start)
def test_param_sweep(cfg: ModelConfig) -> list[TestResult]:
    """10. Parameter sweep — which vLLM params does the endpoint accept?

    Sends the base request plus one extra parameter at a time and records
    whether the endpoint returns HTTP 200.  Entries whose name contains '+'
    bundle several top-level params (their dict value is spread into the
    request); every other entry is sent under its own key.
    """
    results = []
    base_req = {
        "model": cfg.model,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
    ]
    with make_client(cfg) as c:
        for name, val in extra_params:
            start = time.time()
            try:
                # BUGFIX: previously ANY dict value was spread into the request,
                # so chat_template_kwargs and response_format were sent as bare
                # top-level keys (e.g. {"enable_thinking": false}) and the named
                # parameter itself was never probed.  Only combined '+' entries
                # are meant to be spread.
                if "+" in name and isinstance(val, dict):
                    req = {**base_req, **val}
                else:
                    req = {**base_req, name: val}
                r = c.post(f"{cfg.api_base}/chat/completions", json=req)
                dur = time.time() - start
                ok = r.status_code == 200
                detail = f"HTTP {r.status_code}"
                if not ok:
                    try:
                        detail += f": {r.json().get('error', {}).get('message', '')[:80]}"
                    except Exception:
                        pass  # body not JSON; keep the bare status code
                results.append(TestResult(f"param: {name}", ok, detail, dur))
            except Exception as e:
                results.append(TestResult(f"param: {name}", False, f"Exception: {e}", time.time() - start))
    return results
# ── Suite runner ─────────────────────────────────────────────
def _tool_response_flow_nonstream(cfg: ModelConfig) -> TestResult:
    """5. Full tool call → response → follow-up flow."""
    return test_tool_response_flow(cfg, streaming=False)

def _tool_response_flow_stream(cfg: ModelConfig) -> TestResult:
    """6. Full tool call → response → follow-up flow (stream)."""
    return test_tool_response_flow(cfg, streaming=True)

# Ordered list of single-result tests run by run_suite().  Named wrappers
# replace the previous lambdas: lambdas carry no docstring, so run_suite's
# label lookup printed "<lambda>" for both flow variants.
ALL_TESTS = [
    test_basic_nonstream,
    test_basic_stream,
    test_toolcall_nonstream,
    test_toolcall_stream,
    _tool_response_flow_nonstream,
    _tool_response_flow_stream,
    test_bad_tool_schema,
    test_nested_bad_schema,
    test_streaming_tool_chunks,
]
def run_suite(cfg: ModelConfig, verbose: bool = True) -> SuiteResult:
    """Run the full test suite against one model config.

    Prints progress when *verbose*; returns the aggregated SuiteResult.
    NOTE: the per-result pass/fail marker literals were empty strings
    (glyphs lost to an encoding bug); restored to the suite's ✓/✗ style.
    """
    result = SuiteResult(model=cfg.model)
    print(f"\n{'='*60}")
    print(f"Testing: {cfg.model}")
    print(f"API: {cfg.api_base}")
    print(f"{'='*60}")
    for test_fn in ALL_TESTS:
        # Label the test by the first docstring line, falling back to its name.
        name = (test_fn.__doc__ or "").strip().split("\n")[0] or test_fn.__name__
        if verbose:
            print(f"\n[{ts()}] Running: {name}...")
        tr = test_fn(cfg)
        # Defensive: a test may return one TestResult or a list of them.
        batch = tr if isinstance(tr, list) else [tr]
        result.results.extend(batch)
        if verbose:
            for r in batch:
                mark = "✓" if r.passed else "✗"
                print(f" {mark} {r.name}: {r.detail} ({r.duration_s:.1f}s)")
    # Param sweep runs separately — it returns one result per probed parameter.
    if verbose:
        print(f"\n[{ts()}] Running: parameter sweep...")
    sweep_results = test_param_sweep(cfg)
    result.results.extend(sweep_results)
    if verbose:
        for r in sweep_results:
            mark = "✓" if r.passed else "✗"
            print(f" {mark} {r.name}: {r.detail} ({r.duration_s:.1f}s)")
    return result
def print_summary(results: list[SuiteResult]):
    """Print a final summary across all models.

    Per-model pass counts, failing-test details, then a cross-model table
    for the key (non-sweep) tests.  The status glyphs and rule lines were
    empty strings (characters lost to an encoding bug); restored here with
    ✓/✗ markers and '-' rules.
    """
    print(f"\n\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    for sr in results:
        passed = sr.passed
        total = sr.total
        pct = (passed / total * 100) if total else 0
        label = sr.model.split("/")[-1]
        print(f"\n {label}: {passed}/{total} passed ({pct:.0f}%)")
        for r in sr.results:
            if not r.passed:
                # One line per failing test with a short detail excerpt.
                print(f"   ✗ {r.name}: {r.detail[:80]}")
    # Cross-model comparison for key tests
    print(f"\n{'-'*60}")
    print("CROSS-MODEL COMPARISON")
    print(f"{'-'*60}")
    key_tests = [
        "basic non-stream",
        "basic stream",
        "tool call non-stream",
        "tool call stream",
        "tool response flow",
        "tool response flow (stream)",
        "streaming tool chunking",
        "bad tool schema (properties=[])",
        "nested bad schema (items.properties=[])",
    ]
    # Calculate column width from the (truncated) model labels.
    labels = [sr.model.split("/")[-1][:18] for sr in results]
    col_w = max(len(l) for l in labels) if labels else 16
    col_w = max(col_w, 16)
    header = f"{'Test':<40}"
    for l in labels:
        header += f" {l:>{col_w}}"
    print(header)
    print("-" * len(header))
    for test_name in key_tests:
        row = f"{test_name:<40}"
        for sr in results:
            match = [r for r in sr.results if r.name == test_name]
            if match:
                status = "✓" if match[0].passed else "✗"
            else:
                status = "-"  # test not recorded for this model
            row += f" {status:>{col_w}}"
        print(row)
    print(f"\n{'='*60}")
# ── CLI ──────────────────────────────────────────────────────
def main():
    """CLI entry point: select model configs, run suites, exit 1 on any failure."""
    parser = argparse.ArgumentParser(description="Universal model tool-call test suite")
    parser.add_argument("--all", action="store_true", help="Test all models from models.env")
    parser.add_argument("--model", type=int, help="Test model by 1-based index from models.env")
    parser.add_argument("--filter", type=str, help="Test models matching substring")
    parser.add_argument("--quiet", action="store_true", help="Less output per test")
    args = parser.parse_args()
    models_path = Path(__file__).parent / "models.env"

    def require_models_env() -> list[ModelConfig]:
        # Shared guard: every file-based selection mode needs models.env.
        if not models_path.exists():
            print("ERROR: models.env not found")
            sys.exit(1)
        return load_models_env(models_path)

    configs: list[ModelConfig] = []
    if args.all:
        configs = require_models_env()
    elif args.model is not None:
        # BUGFIX: this was 'elif args.model:', which treated '--model 0' as
        # "not given" and silently fell through to the env-var path instead
        # of reporting the index as out of range.
        all_configs = require_models_env()
        if args.model < 1 or args.model > len(all_configs):
            print(f"ERROR: --model index {args.model} out of range (1-{len(all_configs)})")
            sys.exit(1)
        configs = [all_configs[args.model - 1]]
    elif args.filter:
        all_configs = require_models_env()
        configs = [c for c in all_configs if args.filter.lower() in c.model.lower()]
        if not configs:
            print(f"No models matching '{args.filter}'")
            sys.exit(1)
    else:
        cfg = config_from_env()
        if cfg:
            configs = [cfg]
        else:
            print("No model specified. Use --all, --model N, --filter NAME, or set TOOLTEST_* env vars.")
            if models_path.exists():
                print("\nAvailable models from models.env:")
                for i, c in enumerate(load_models_env(models_path), 1):
                    print(f" {i}. {c.model} @ {c.api_base}")
            sys.exit(1)
    all_results: list[SuiteResult] = []
    for cfg in configs:
        sr = run_suite(cfg, verbose=not args.quiet)
        all_results.append(sr)
    print_summary(all_results)
    # Non-zero exit when any model had failures (CI-friendly).
    if any(sr.passed < sr.total for sr in all_results):
        sys.exit(1)

if __name__ == "__main__":
    main()

View File

@@ -1,19 +1,14 @@
#!/usr/bin/env bash
# Thin wrapper around run_suite.py — forwards all CLI arguments.
# (The diff view had merged the removed script — old shebang, stale VLLM_*
# exports, and a call to the deleted test_streaming_tool_calls.py — with the
# new one; this is the coherent post-change script.)
#
# Usage:
#   ./run_tests.sh                    # Test all models from models.env
#   ./run_tests.sh --model 1          # Test model #1
#   ./run_tests.sh --filter Devstral  # Test matching models
#   ./run_tests.sh --all              # Same as no args
#   ./run_tests.sh --quiet            # Less output
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
python3 -u run_suite.py "$@"

View File

@@ -1,546 +0,0 @@
#!/usr/bin/env python3
"""
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
These tests send EXACTLY what OpenClaw would send to vLLM — including
chat_template_kwargs, logprobs, weird tool schemas, the works.
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
Architecture: this test → middleware (strips bad params) → SGLang
"""
import os
import time
import json
import httpx
from datetime import datetime
from pathlib import Path
# Load .env if present (don't hardcode keys)
_env_file = Path(__file__).parent / ".env"
if _env_file.exists():
    for line in _env_file.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        # setdefault: real environment variables win over .env values.
        os.environ.setdefault(k.strip(), v.strip())
# Endpoint configuration, overridable via DEVSTRAL_* environment variables.
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")
# Accumulated {"name", "pass", "detail"} dicts, one per record() call.
RESULTS = []
def ts():
    """Current wall-clock time as HH:MM:SS.mmm for log prefixes."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
def record(name, ok, detail=""):
status = "✓ PASS" if ok else "✗ FAIL"
print(f"\n{status}: {name}")
if detail:
print(f" {detail}")
RESULTS.append({"name": name, "pass": ok, "detail": detail})
def make_client():
    """Shared httpx client: bearer auth + JSON content type, 120 s timeout."""
    auth_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=auth_headers)
# ── 1. Basic non-streaming chat ──────────────────────────────
def test_basic_nonstream():
    """Basic non-streaming chat: one request, record pass/fail via record()."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
            "max_tokens": 32,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # NOTE(review): assumes choices[0].message.content exists — raises on
        # reasoning-only replies; the replacement suite handles that case.
        content = body["choices"][0]["message"]["content"]
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
# ── 2. Basic streaming chat ──────────────────────────────────
def test_basic_stream():
    """Basic streaming chat: accumulate SSE content deltas and record result."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Count from 1 to 5."}],
            "stream": True,
            "max_tokens": 64,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            full = ""
            for line in r.iter_lines():
                # Skip keep-alives and the SSE terminator.
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("content"):
                            full += delta["content"]
                    except json.JSONDecodeError:
                        pass  # ignore malformed SSE payloads
            print(f"[{ts()}] Reply: {full[:100]}")
            # NOTE(review): records PASS even when no content arrived.
            record("basic stream", True, f"Got: {full[:80]}")
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
def test_toolcall_nonstream():
    """Non-streaming tool call with a well-formed vLLM-style tool schema."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            # Model answered in prose instead of calling the tool.
            content = msg.get("content", "")
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
# ── 4. Tool call — streaming ────────────────────────────────
def test_toolcall_stream():
    """Streaming tool call: reassemble name + argument fragments from deltas."""
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            tool_name = None
            accumulated_args = ""
            content_parts = ""
            for line in r.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("tool_calls"):
                            # Name arrives once; arguments arrive as fragments.
                            for tc in delta["tool_calls"]:
                                if tc.get("function", {}).get("name"):
                                    tool_name = tc["function"]["name"]
                                if tc.get("function", {}).get("arguments"):
                                    accumulated_args += tc["function"]["arguments"]
                        if delta.get("content"):
                            content_parts += delta["content"]
                    except json.JSONDecodeError:
                        pass  # ignore malformed SSE payloads
            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
                record("tool call stream", False, "Model did not call the tool")
# ── 5. Full tool response flow (non-streaming) ──────────────
def test_tool_response_flow():
    """Two-step flow: tool call → fabricated tool result → final answer.

    Passes only when the final answer contains "22" (the fabricated
    temperature), i.e. the model actually used the tool result.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]
    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return
        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")
        # Replay the assistant message, then append the fabricated tool result.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })
        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return
        final = body2["choices"][0]["message"].get("content", "")
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        # NOTE(review): the separator between the yes/no verdict and the
        # excerpt appears to have been lost in extraction.
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'}{final[:100]}")
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")
    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    # Params that OpenClaw/vLLM might send — some SGLang rejects.
    # Each entry is (label, {param: value, ...}) so combined cases like
    # logprobs+top_logprobs are sent as real request fields instead of
    # one bogus "logprobs+top_logprobs" key.
    extra_params = [
        ("chat_template_kwargs", {"chat_template_kwargs": {"enable_thinking": False}}),
        ("guided_json", {"guided_json": None}),
        ("guided_regex", {"guided_regex": None}),
        ("response_format", {"response_format": {"type": "json_object"}}),
        ("n", {"n": 1}),
        ("presence_penalty", {"presence_penalty": 0.0}),
        ("frequency_penalty", {"frequency_penalty": 0.0}),
        ("top_p", {"top_p": 1.0}),
        ("temperature", {"temperature": 0.7}),
        ("seed", {"seed": 42}),
        ("stop", {"stop": ["\n"]}),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
        ("top_logprobs", {"top_logprobs": 5}),
    ]
    with make_client() as c:
        # baseline request with no extras — should always succeed
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")
        for name, params in extra_params:
            req = {**base_req, **params}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={params!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={params!r}: {detail}")
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")
    # Exact shape OpenClaw emits for a tool with no parameters:
    # 'properties' is a list, which the backend rejects with a 400
    # unless the middleware rewrites it to an object.
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            },
        },
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Search for cats"}],
        "tools": [bad_tool],
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code == 200:
            print(f"[{ts()}] Success — middleware fixed the bad schema")
            record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
        else:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
# ── 8. Nested properties=[] in tool schema (Tool 21 bug) ────
def test_nested_bad_properties():
    """
    Reproduce the exact Tool 21 400 error:
        schema['properties']['fields']['items']['properties'] = []
    This happens when a tool has an array-of-objects parameter where
    the items' properties field is [] instead of {}. The middleware
    must recurse into the schema to fix ALL properties fields.
    """
    header = "=" * 60
    print(f"\n{header}")
    print(f"[{ts()}] TEST: Nested properties=[] in tool schema (Tool 21 bug)")
    print(header)
    # The bad [] sits two levels down (inside array items), so a shallow
    # fix of only the top-level parameters.properties would miss it.
    nested_schema = {
        "type": "object",
        "properties": {
            "fields": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": []  # <-- THIS causes the 400
                }
            }
        }
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "message",
            "description": "Send a message",
            "parameters": nested_schema,
        },
    }]
    request_body = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Send a message to Bob"}],
        "tools": tools,
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=request_body)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:500]}")
            record("nested bad properties", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed nested properties=[] to {{}}")
        record("nested bad properties", True, "Middleware fixed nested properties.properties=[] to {}")
# ── 9. OpenClaw full payload (chat_template_kwargs + tools) ─
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")
    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]
    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # safe_message() tolerates an empty 'choices' array; reasoning models
        # can also return content=None, so coerce before slicing to avoid a
        # TypeError on the print below.
        msg = safe_message(body)
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
# ── Main ─────────────────────────────────────────────────────
def main():
    """Run every test in order, then print a pass/fail summary of RESULTS."""
    print(f"\n{'='*60}")
    # Banner kept model-agnostic: this suite runs against any
    # OpenAI-compatible endpoint (see module docstring), not just Devstral.
    print(f"Universal Model Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")
    test_basic_nonstream()
    test_basic_stream()
    test_toolcall_nonstream()
    test_toolcall_stream()
    test_tool_response_flow()
    test_param_sweep()
    test_openclaw_tool_schema()
    test_nested_bad_properties()
    test_openclaw_full_payload()
    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        # visible pass/fail marker per line, matching the per-test output style
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")
# Script entry point: `python3 run_suite.py` (see module docstring for env vars).
if __name__ == "__main__":
    main()

View File

@@ -1,395 +0,0 @@
#!/usr/bin/env python3
"""
Test suite for vLLM GLM-5.1 streaming tool calls.
Reproduces the issue where long string parameters in tool calls
are buffered entirely before being emitted during streaming.
"""
import os
import time
import json
import httpx
from datetime import datetime
# Configuration - will be set via environment or direct assignment
# (override via VLLM_API_BASE / VLLM_API_KEY / VLLM_MODEL; defaults
# point at the dev test endpoint)
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
API_KEY = os.environ.get("VLLM_API_KEY", "none")  # sent verbatim as the Bearer token below
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
def timestamp():
    """Current wall-clock time as HH:MM:SS.mmm (millisecond precision)."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
def test_streaming_tool_call_with_code():
    """
    Test streaming a tool call with a long string parameter.
    This prompts the model to generate code via a tool call,
    which should stream incrementally if the patch works correctly.

    Returns a metrics dict: chunk counts, accumulated argument length,
    and total wall-clock time.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file. Use this to save code, text, or other content.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {
                            "type": "string",
                            "description": "Name of the file to write"
                        },
                        "content": {
                            "type": "string",
                            "description": "The content to write to the file"
                        }
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]
    messages = [
        {
            "role": "user",
            "content": "Write a Python implementation of a binary search tree with insert, search, and delete methods. Include docstrings and type hints. Save it to bst.py using the write_file tool."
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with long string parameter")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")
    # Track streaming events
    chunks_received = []
    first_chunk_time = None
    last_chunk_time = None  # tracked but not reported in the summary
    tool_call_chunks = []
    accumulated_content = ""
    start_time = time.time()
    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 4096,
                # NOTE(review): extra params below mirror what clients send;
                # presumably this endpoint tolerates them — verify.
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            print(f"[{timestamp()}] Response status: {response.status_code}")
            # SSE framing: each event line starts with "data: "; the stream
            # terminates with the literal "data: [DONE]" sentinel.
            for line in response.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    chunk_data = line[6:]
                    try:
                        chunk = json.loads(chunk_data)
                        if first_chunk_time is None:
                            first_chunk_time = time.time()
                            print(f"\n[{timestamp()}] FIRST CHUNK RECEIVED ({first_chunk_time - start_time:.3f}s)")
                        last_chunk_time = time.time()
                        chunks_received.append(chunk)
                        # Extract delta content
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})
                            # Check for tool calls in delta
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    tc_index = tc.get("index", 0)  # index within delta (unused here)
                                    tc_function = tc.get("function", {})
                                    if tc_function.get("name"):
                                        print(f"\n[{timestamp()}] Tool call name: {tc_function['name']}")
                                    if tc_function.get("arguments"):
                                        args_chunk = tc_function["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk
                                        # Print progress every ~500 chars
                                        if len(accumulated_content) % 500 < len(args_chunk):
                                            print(f"[{timestamp()}] Accumulated {len(accumulated_content)} chars...")
                            # Regular content
                            if delta.get("content"):
                                print(f"[{timestamp()}] Content chunk: {delta['content'][:50]}...")
                    except json.JSONDecodeError as e:
                        print(f"[{timestamp()}] JSON decode error: {e}")
    end_time = time.time()
    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total chunks received: {len(chunks_received)}")
    print(f"Total time: {end_time - start_time:.3f}s")
    if first_chunk_time:
        print(f"Time to first chunk: {first_chunk_time - start_time:.3f}s")
    if tool_call_chunks:
        print(f"Tool call chunks: {len(tool_call_chunks)}")
        print(f"Total tool call content: {len(accumulated_content)} chars")
        # Try to parse the accumulated arguments
        print(f"\nAttempting to parse tool call arguments...")
        try:
            args = json.loads(accumulated_content)
            print(f"Successfully parsed!")
            print(f" - filename: {args.get('filename', 'N/A')}")
            print(f" - content length: {len(args.get('content', ''))} chars")
        except json.JSONDecodeError as e:
            print(f"Failed to parse: {e}")
            print(f"Raw accumulated content (first 500 chars):\n{accumulated_content[:500]}")
    # Verdict: >1 argument chunk means true incremental streaming; a single
    # large chunk means the server buffered the whole argument string.
    print(f"\n{'='*60}")
    if len(tool_call_chunks) > 1:
        print("✓ PASS: Tool call arguments arrived in multiple chunks")
        print(f" Chunks: {len(tool_call_chunks)}, indicating incremental streaming")
    elif len(tool_call_chunks) == 1 and len(accumulated_content) > 1000:
        print("✗ FAIL: Tool call arguments arrived in a single chunk")
        print(" This indicates buffering, not true streaming")
    else:
        print("? INCONCLUSIVE: Not enough data or no tool call occurred")
    print(f"{'='*60}\n")
    return {
        "chunks_received": len(chunks_received),
        "tool_call_chunks": len(tool_call_chunks),
        "accumulated_length": len(accumulated_content),
        "total_time": end_time - start_time
    }
def test_streaming_tool_call_with_json():
    """
    Test streaming a tool call that returns structured JSON data.

    Prompts for a large nested config object so the 'config' argument is
    long enough to reveal whether argument deltas stream incrementally.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "save_config",
                "description": "Save a configuration object",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "config": {
                            "type": "object",
                            "description": "Configuration object with many fields"
                        }
                    },
                    "required": ["config"]
                }
            }
        }
    ]
    messages = [
        {
            "role": "user",
            "content": "Create a detailed configuration for a web server with the following sections: server (host, port, ssl), logging (level, format, outputs), cache (enabled, ttl, max_size), rate_limiting (enabled, requests_per_minute, burst), cors (enabled, origins, methods, headers), security (headers, csp, hsts). Use the save_config tool."
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with nested JSON")
    print(f"{'='*60}\n")
    tool_call_chunks = []
    accumulated_content = ""
    start_time = time.time()
    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 2048,
                # NOTE(review): extra client-style params; presumably
                # tolerated by this endpoint — verify.
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            # SSE framing: skip blanks and the "data: [DONE]" sentinel.
            for line in response.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    if tc.get("function", {}).get("arguments"):
                                        args_chunk = tc["function"]["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk
                                        print(f"[{timestamp()}] Chunk {len(tool_call_chunks)}: +{len(args_chunk)} chars (total: {len(accumulated_content)})")
                    except json.JSONDecodeError:
                        pass
    end_time = time.time()
    print(f"\n{'='*60}")
    print(f"Total chunks: {len(tool_call_chunks)}, Total content: {len(accumulated_content)} chars")
    print(f"Time: {end_time - start_time:.3f}s")
    # >1 chunk = incremental streaming; exactly 1 = server-side buffering.
    if len(tool_call_chunks) > 1:
        print("✓ PASS: Arguments streamed in multiple chunks")
    elif len(tool_call_chunks) == 1:
        print("✗ FAIL: Arguments arrived in single chunk (buffered)")
    else:
        print("? No tool call occurred")
    print(f"{'='*60}\n")
def test_non_streaming_tool_call():
    """
    Baseline test: non-streaming tool call for comparison.

    Issues the same kind of write_file tool request without streaming so
    the streaming tests have a latency/integrity reference point.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {"type": "string"},
                        "content": {"type": "string"}
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]
    messages = [
        {
            "role": "user",
            "content": "Write a simple Python hello world and save it using the write_file tool."
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Non-streaming tool call (baseline)")
    print(f"{'='*60}\n")
    start_time = time.time()
    with httpx.Client(timeout=120.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 1024,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
    end_time = time.time()
    print(f"Status: {response.status_code}")
    print(f"Time: {end_time - start_time:.3f}s")
    if result.get("choices"):
        message = result["choices"][0].get("message", {})
        if message.get("tool_calls"):
            for tc in message["tool_calls"]:
                print(f"Tool: {tc['function']['name']}")
                # Malformed arguments must not crash the baseline test —
                # report the parse failure and move on instead.
                try:
                    args = json.loads(tc["function"]["arguments"])
                except json.JSONDecodeError as e:
                    print(f"Arguments FAILED to parse: {e}")
                    continue
                print(f"Arguments parsed successfully")
                print(f" - filename: {args.get('filename')}")
                print(f" - content length: {len(args.get('content', ''))}")
        else:
            print("No tool call in response")
    print(f"{'='*60}\n")
def main():
    """Run the streaming tool-call test battery: baseline, JSON, long code."""
    banner = "=" * 60
    print("\n" + banner)
    print("vLLM GLM-5.1 Streaming Tool Call Tests")
    print(banner)
    # Check API connectivity
    print(f"\nChecking API at {API_BASE}...")
    try:
        with httpx.Client(timeout=10.0) as client:
            reply = client.get(f"{API_BASE.replace('/v1', '')}/health")
            print(f"Health check: {reply.status_code}")
    except Exception as e:
        print(f"Warning: Could not reach API - {e}")
    # Run tests
    print("\nRunning tests...\n")
    # Test 1: Non-streaming baseline
    test_non_streaming_tool_call()
    # Test 2: Streaming with nested JSON
    test_streaming_tool_call_with_json()
    # Test 3: Main test - streaming with long code
    _metrics = test_streaming_tool_call_with_code()
    print("\nAll tests complete.")
# Script entry point
if __name__ == "__main__":
    main()

View File

@@ -1,243 +0,0 @@
#!/usr/bin/env python3
"""
Focused test to diagnose GLM-5.1 tool response issue.
The issue: Model sees tool response as blank.
"""
import httpx
import json
# Hard-coded config: this focused repro script takes no env overrides.
API_BASE = "http://95.179.247.150/v1"
API_KEY = "whatever"  # sent as the Bearer token; presumably not validated — verify
MODEL = "HuggingFaceTB/SmolLM3-3B"
def test_simple_tool_response():
    """
    Minimal test: Send a tool response and see if the model can use it.

    Builds a canned 3-turn conversation (user -> assistant tool_call ->
    tool result) and checks whether the follow-up answer references the
    tool's "42" payload.
    """
    # Simulate a conversation where a tool was called
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]
    tools = [{
        "type": "function",
        "function": {
            "name": "test_func",
            "description": "A test function",
            "parameters": {"type": "object", "properties": {}}
        }
    }]
    print("=" * 60)
    print("Request messages:")
    print(json.dumps(messages, indent=2))
    print("=" * 60)
    with httpx.Client(timeout=60.0) as client:
        # Non-streaming to get full response
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
    print("\nFull response:")
    print(json.dumps(result, indent=2))
    if result.get("choices"):
        # Reasoning models can return content=None — coerce to "" so the
        # substring checks below cannot raise TypeError.
        content = result["choices"][0].get("message", {}).get("content") or ""
        print("\n" + "=" * 60)
        print("Model response content:")
        print(content)
        print("=" * 60)
        # Check if the tool result is referenced
        if "42" in content:
            print("\n✓ PASS: Model referenced the tool result (42)")
        else:
            print("\n✗ FAIL: Model did NOT reference the tool result (42)")
        # Check for signs the model didn't see the result
        if "don't have" in content.lower() or "cannot access" in content.lower():
            print("✗ Model indicates it cannot see tool result")
def test_without_tools_param():
    """
    Test what happens if we don't pass tools in the follow-up request.
    Some APIs need tools to be passed on every request.
    """
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]
    print("\n" + "=" * 60)
    print("Test WITHOUT tools param in follow-up")
    print("=" * 60)
    with httpx.Client(timeout=60.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                # No tools param
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
    if result.get("choices"):
        # content may be None for reasoning models — normalize before
        # slicing/substring-checking to avoid a TypeError.
        content = result["choices"][0].get("message", {}).get("content") or ""
        print("Model response:", content[:200])
        if "42" in content:
            print("✓ Model referenced the tool result")
def test_different_content_formats():
    """
    Test if the issue is with how content is formatted.

    Sends the same tool-result turn twice — once as a plain string and
    once as OpenAI-style content parts — and reports whether the model
    referenced the result in each case.
    """
    # Test 1: String content (standard)
    messages_string = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 4"
        }
    ]
    # Test 2: Content as array (OpenAI format)
    messages_array = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": [{"type": "text", "text": "The answer is 4"}]
        }
    ]
    tools = [{
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculator",
            "parameters": {"type": "object", "properties": {}}
        }
    }]
    print("\n" + "=" * 60)
    print("Test: String content vs Array content")
    print("=" * 60)
    with httpx.Client(timeout=60.0) as client:
        for name, msgs in [("String content", messages_string), ("Array content", messages_array)]:
            print(f"\n--- {name} ---")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": msgs,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 128,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )
            result = response.json()
            if result.get("choices"):
                # content may be None for reasoning models — normalize so
                # the substring check cannot raise TypeError.
                content = result["choices"][0].get("message", {}).get("content") or ""
                print(f"Response: {content[:150]}")
                if "4" in content:
                    print("✓ Referenced tool result")
                else:
                    print("✗ Did NOT reference tool result")
# Script entry point: run all three diagnosis tests in order.
if __name__ == "__main__":
    print("GLM-5.1 Tool Response Diagnosis")
    print("=" * 60)
    test_simple_tool_response()
    test_without_tools_param()
    test_different_content_formats()

View File

@@ -1,463 +0,0 @@
#!/usr/bin/env python3
"""
Test for tool call response handling in GLM-5.1.
Tests the multi-turn flow:
1. Send a prompt that triggers a tool call
2. Send back the tool result
3. Verify the model can see and use the tool response
This reproduces the issue where tool responses appear blank to the model.
"""
import os
import json
import httpx
from datetime import datetime
# Endpoint under test; override via VLLM_API_BASE / VLLM_API_KEY / VLLM_MODEL.
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
API_KEY = os.environ.get("VLLM_API_KEY", "none")  # sent verbatim as the Bearer token
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
def timestamp():
    """Millisecond-resolution HH:MM:SS.mmm string for log lines."""
    return f"{datetime.now():%H:%M:%S.%f}"[:-3]
def test_tool_call_response_flow(streaming: bool = True):
    """
    Test the full tool call -> response -> follow-up flow.
    This simulates:
    1. User asks for weather
    2. Model calls get_weather tool
    3. We send back the weather data
    4. Model should see and use that data

    Returns a dict: {"success": bool, ...} with either a "reason" (no tool
    call) or "issues" + "final_response" diagnostics.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g. 'New York, NY'"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]
    # Initial request that should trigger a tool call
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Tokyo right now?"
        }
    ]
    mode = "STREAMING" if streaming else "NON-STREAMING"
    print(f"\n{'='*60}")
    print(f"TEST: Tool call response flow ({mode})")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")
    with httpx.Client(timeout=120.0) as client:
        # Step 1: Send initial request, expect tool call
        print(f"[{timestamp()}] Step 1: Sending initial request...")
        if streaming:
            tool_calls = []
            tool_call_id = None
            tool_call_name = None
            accumulated_args = ""
            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            ) as response:
                print(f"[{timestamp()}] Response status: {response.status_code}")
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})
                                if delta.get("tool_calls"):
                                    # Reassemble the call from deltas: id and
                                    # name arrive once, arguments in pieces.
                                    for tc in delta["tool_calls"]:
                                        if tc.get("id"):
                                            tool_call_id = tc["id"]
                                        if tc.get("function", {}).get("name"):
                                            tool_call_name = tc["function"]["name"]
                                            print(f"[{timestamp()}] Tool call: {tool_call_name}")
                                        if tc.get("function", {}).get("arguments"):
                                            accumulated_args += tc["function"]["arguments"]
                                if delta.get("content"):
                                    print(f"[{timestamp()}] Content: {delta['content'][:100]}")
                        except json.JSONDecodeError as e:
                            print(f"[{timestamp()}] JSON error: {e}")
            if tool_call_name:
                tool_calls.append({
                    "id": tool_call_id or "call_0",
                    "type": "function",
                    "function": {
                        "name": tool_call_name,
                        "arguments": accumulated_args
                    }
                })
        else:
            # Non-streaming
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )
            result = response.json()
            print(f"[{timestamp()}] Response status: {response.status_code}")
            tool_calls = []
            if result.get("choices"):
                message = result["choices"][0].get("message", {})
                if message.get("tool_calls"):
                    tool_calls = message["tool_calls"]
                    for tc in tool_calls:
                        print(f"[{timestamp()}] Tool call: {tc['function']['name']}")
                        print(f"[{timestamp()}] Args: {tc['function']['arguments']}")
        # Check if we got a tool call
        if not tool_calls:
            print(f"\n[{timestamp()}] No tool call received - model didn't call the tool")
            return {"success": False, "reason": "no_tool_call"}
        # Step 2: Parse tool call and prepare response
        tc = tool_calls[0]
        tc_id = tc.get("id", "call_0")
        tc_name = tc["function"]["name"]
        # Streamed args can be empty or malformed — don't crash the test,
        # fall back to an empty arg dict and let the flow continue.
        try:
            tc_args = json.loads(tc["function"]["arguments"] or "{}")
        except json.JSONDecodeError as e:
            print(f"[{timestamp()}] WARNING: could not parse tool args: {e}")
            tc_args = {}
        print(f"\n[{timestamp()}] Step 2: Tool call received")
        print(f" Name: {tc_name}")
        print(f" Args: {tc_args}")
        # Simulate tool execution
        tool_result = {
            "location": tc_args.get("location", "Unknown"),
            "temperature": "22°C",
            "condition": "Partly cloudy",
            "humidity": "65%",
            "wind": "15 km/h NE"
        }
        # Step 3: Send the tool response back
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps(tool_result)
        })
        print(f"\n[{timestamp()}] Step 3: Sending tool response...")
        print(f" Tool call ID: {tc_id}")
        print(f" Tool result: {json.dumps(tool_result, indent=2)}")
        # Step 4: Get the model's follow-up response
        if streaming:
            final_response = ""
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (streaming)...")
            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": True,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            ) as response:
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            if chunk.get("choices"):
                                delta = chunk["choices"][0].get("delta", {})
                                if delta.get("content"):
                                    content = delta["content"]
                                    final_response += content
                                    print(f"[{timestamp()}] Content: {content}", end="", flush=True)
                        except json.JSONDecodeError:
                            pass
            print()  # newline after streaming output
        else:
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (non-streaming)...")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": messages,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 512,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )
            result = response.json()
            final_response = ""
            if result.get("choices"):
                # Reasoning models may return content=None — coerce to ""
                # so the substring checks below cannot raise TypeError.
                final_response = result["choices"][0].get("message", {}).get("content") or ""
        print(f"\n[{timestamp()}] Final response:\n{final_response}")
        # Check if the model used the tool data
        success = True
        issues = []
        # The response should mention the weather data
        if "22" not in final_response and "22°C" not in final_response:
            issues.append("Temperature (22°C) not mentioned in response")
            success = False
        if "cloudy" not in final_response.lower() and "partly cloudy" not in final_response.lower():
            issues.append("Condition (Partly cloudy) not mentioned in response")
            success = False
        # Check for signs the model didn't see the data
        blank_indicators = [
            "i don't have",
            "i cannot access",
            "i'm unable to",
            "i am unable to",
            "don't have access",
            "don't have real-time",
            "cannot provide real-time"
        ]
        for indicator in blank_indicators:
            if indicator in final_response.lower():
                issues.append(f"Model seems unaware of tool result (found: '{indicator}')")
                success = False
                break
        print(f"\n{'='*60}")
        if success:
            print("✓ PASS: Model correctly used tool response data")
        else:
            print("✗ FAIL: Model did not use tool response correctly")
            for issue in issues:
                print(f" - {issue}")
        print(f"{'='*60}\n")
        return {
            "success": success,
            "issues": issues,
            "final_response": final_response
        }
def test_tool_response_with_debug_info():
    """
    Test with detailed logging to capture exactly what the model sees.

    Dumps the full messages array and both raw responses so a transcript
    of the request/response pair is available for debugging.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_time",
                "description": "Get the current time",
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": []
                }
            }
        }
    ]
    print(f"\n{'='*60}")
    print(f"TEST: Tool response with debug info (non-streaming)")
    print(f"{'='*60}\n")
    messages = [
        {"role": "user", "content": "What time is it?"}
    ]
    with httpx.Client(timeout=120.0) as client:
        # Get tool call
        print(f"[{timestamp()}] Sending initial request...")
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result = response.json()
        if not result.get("choices") or not result["choices"][0].get("message", {}).get("tool_calls"):
            print("No tool call - skipping test")
            return
        tool_call = result["choices"][0]["message"]["tool_calls"][0]
        tc_id = tool_call["id"]
        print(f"[{timestamp()}] Tool call: {tool_call['function']['name']}")
        print(f"[{timestamp()}] Tool call ID: {tc_id}")
        # Add tool response
        messages.append({
            "role": "assistant",
            "tool_calls": [tool_call]
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": "The current time is 3:45 PM on Thursday, April 9, 2026."
        })
        # Debug: print the full messages array we're about to send
        print(f"\n[{timestamp()}] Sending follow-up with these messages:")
        print(json.dumps(messages, indent=2))
        # Get follow-up
        response2 = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )
        result2 = response2.json()
        print(f"\n[{timestamp()}] Full response:")
        print(json.dumps(result2, indent=2))
        if result2.get("choices"):
            # content may be None for reasoning models — normalize before
            # the substring check so it cannot raise TypeError.
            content = result2["choices"][0].get("message", {}).get("content") or ""
            print(f"\n[{timestamp()}] Model response content: {content}")
            # Check if time is mentioned
            if "3:45" in content or "3:45 PM" in content:
                print("\n✓ Model used the tool response (time mentioned)")
            else:
                print("\n✗ Model may not have seen the tool response (time not mentioned)")
def main():
    """Run the tool-response test battery: non-streaming, streaming, debug."""
    line = "=" * 60
    print("\n" + line)
    print("GLM-5.1 Tool Call Response Tests")
    print(line)
    # Non-streaming first (simpler to debug)
    print("\n--- Test 1: Non-streaming tool response flow ---")
    test_tool_call_response_flow(streaming=False)
    # Then the streaming variant of the same flow
    print("\n--- Test 2: Streaming tool response flow ---")
    test_tool_call_response_flow(streaming=True)
    # Finally the verbose transcript-dumping variant
    print("\n--- Test 3: Debug info test ---")
    test_tool_response_with_debug_info()
    print("\nAll tests complete.")
# Script entry point
if __name__ == "__main__":
    main()