consolidate to run_suite.py: single pluggable test suite, all models 84/84
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1 +1,3 @@
|
|||||||
.env
|
.env
|
||||||
|
models.env
|
||||||
|
__pycache__/
|
||||||
|
|||||||
815
run_suite.py
Normal file
815
run_suite.py
Normal file
@@ -0,0 +1,815 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Universal model tool-call test suite.
|
||||||
|
|
||||||
|
Tests any OpenAI-compatible endpoint for:
|
||||||
|
1. Basic chat (non-streaming + streaming)
|
||||||
|
2. Tool calls (non-streaming + streaming)
|
||||||
|
3. Multi-turn tool response flow (non-streaming + streaming)
|
||||||
|
4. Nested/bad tool schema handling (SGLang compatibility)
|
||||||
|
5. Streaming tool call chunking (are args actually streamed?)
|
||||||
|
6. Param sweep (what vLLM params does the endpoint accept?)
|
||||||
|
|
||||||
|
Handles reasoning models (content in 'reasoning' field, null 'content'),
|
||||||
|
different finish_reason values, and empty/tool_calls arrays gracefully.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
TOOLTEST_API_BASE=... TOOLTEST_API_KEY=... TOOLTEST_MODEL=... python3 run_suite.py
|
||||||
|
python3 run_suite.py --all
|
||||||
|
python3 run_suite.py --model 1
|
||||||
|
python3 run_suite.py --filter Devstral
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import httpx
|
||||||
|
import argparse
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def ts():
    """Wall-clock timestamp for log lines, formatted HH:MM:SS.mmm."""
    now = datetime.now()
    # %f gives microseconds; drop the last three digits to keep milliseconds.
    return now.strftime("%H:%M:%S.%f")[:-3]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_choice(body: dict, index: int = 0) -> dict:
    """Safely get a choice from a response body.

    Returns ``{}`` when ``choices`` is missing, null, or shorter than *index*.
    """
    choices = body.get("choices") or []
    return choices[index] if index < len(choices) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def safe_message(body: dict) -> dict:
    """Safely get the message dict from the first choice (``{}`` on any gap)."""
    choices = body.get("choices") or []
    first = choices[0] if choices else {}
    return first.get("message") or {}
|
||||||
|
|
||||||
|
|
||||||
|
def safe_delta(chunk: dict) -> dict:
    """Safely get the delta from the first choice of a streaming chunk."""
    for choice in chunk.get("choices") or []:
        # Only the first choice matters; a null delta becomes {}.
        return choice.get("delta") or {}
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_content(msg: dict) -> tuple[str, str]:
    """Normalize a message into ``(content, reasoning)``, mapping null/missing to ''.

    Reasoning models may put their text in a ``reasoning`` field and leave
    ``content`` null; both fields are always returned as strings.
    """
    return (msg.get("content") or "", msg.get("reasoning") or "")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Config ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
class ModelConfig:
    """One OpenAI-compatible endpoint + model to run the suite against."""

    api_base: str  # base URL, e.g. "https://host/v1"
    api_key: str   # bearer token for the endpoint
    model: str     # full model id, possibly "org/name"

    @property
    def label(self):
        """Short display name: everything after the final '/'."""
        return self.model.rsplit("/", 1)[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def load_models_env(path: Path) -> list[ModelConfig]:
    """Load models from the models.env file (pipe-delimited).

    Each usable line has the form ``api_base | api_key | model``; blank
    lines, ``#`` comments, and lines with fewer than three fields are skipped.
    """
    configs: list[ModelConfig] = []
    for raw in path.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue  # blank or comment line
        fields = [f.strip() for f in entry.split("|")]
        if len(fields) < 3:
            continue  # malformed line — not enough fields
        configs.append(ModelConfig(api_base=fields[0], api_key=fields[1], model=fields[2]))
    return configs
|
||||||
|
|
||||||
|
|
||||||
|
def config_from_env() -> ModelConfig | None:
    """Build a single config from TOOLTEST_* env vars, or None if any is unset."""
    base = os.environ.get("TOOLTEST_API_BASE")
    key = os.environ.get("TOOLTEST_API_KEY")
    model = os.environ.get("TOOLTEST_MODEL")
    if not (base and key and model):
        return None
    return ModelConfig(api_base=base, api_key=key, model=model)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Test result types ────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
class TestResult:
    """Outcome of a single test run against one endpoint."""

    name: str                # short test identifier used in summaries
    passed: bool             # True iff the test succeeded
    detail: str = ""         # human-readable pass/fail explanation
    duration_s: float = 0.0  # wall-clock runtime of the test, in seconds
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SuiteResult:
    """All per-test results collected for a single model."""

    model: str  # full model id the suite ran against
    results: list[TestResult] = field(default_factory=list)

    @property
    def passed(self):
        """Count of tests that passed."""
        return sum(r.passed for r in self.results)

    @property
    def total(self):
        """Total number of recorded tests."""
        return len(self.results)
|
||||||
|
|
||||||
|
|
||||||
|
def make_client(cfg: ModelConfig) -> httpx.Client:
    """Build an httpx client preconfigured with bearer auth + JSON headers.

    The generous 120 s timeout accommodates slow model endpoints.
    """
    headers = {
        "Authorization": f"Bearer {cfg.api_key}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=headers)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Shared tool definitions ──────────────────────────────────
|
||||||
|
|
||||||
|
# Well-formed tool: one required string parameter. Baseline for tool-call tests.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
            },
            "required": ["location"]
        }
    }
}

# Well-formed tool with two required string parameters; the long "content"
# argument is used to check whether streaming endpoints actually chunk the
# tool-call arguments or buffer them into one piece.
WRITE_FILE_TOOL = {
    "type": "function",
    "function": {
        "name": "write_file",
        "description": "Write content to a file.",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {"type": "string", "description": "Name of the file"},
                "content": {"type": "string", "description": "The content to write"}
            },
            "required": ["filename", "content"]
        }
    }
}

# Deliberately malformed: JSON Schema requires "properties" to be an object,
# but some clients emit an empty array. Exercises endpoint-side schema
# validation / middleware fixup.
BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Search the web",
        "parameters": {
            "type": "object",
            "properties": []  # Invalid — should be {}
        }
    }
}

# Same malformation, but buried inside an array item schema — harder for
# naive fixups to catch.
NESTED_BAD_SCHEMA_TOOL = {
    "type": "function",
    "function": {
        "name": "message",
        "description": "Send a message",
        "parameters": {
            "type": "object",
            "properties": {
                "fields": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": []  # Invalid — should be {}
                    }
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Test functions ───────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_basic_nonstream(cfg: ModelConfig) -> TestResult:
    """1. Basic non-streaming chat."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Say hello in one word."}],
                "stream": False,
                "max_tokens": 64,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code != 200:
                return TestResult("basic non-stream", False, f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", dur)
            content, reasoning = extract_content(safe_message(body))
            fr = safe_choice(body).get("finish_reason", "?")
            # Visible content is a pass; reasoning-only output also counts
            # (reasoning models may leave 'content' null).
            if content:
                return TestResult("basic non-stream", True, f"Got: {content[:80]}", dur)
            if reasoning:
                return TestResult("basic non-stream", True, f"Reasoning-only (finish: {fr}): {reasoning[:80]}", dur)
            return TestResult("basic non-stream", False, f"Empty response (finish: {fr})", dur)
        except Exception as e:
            return TestResult("basic non-stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_basic_stream(cfg: ModelConfig) -> TestResult:
    """2. Basic streaming chat."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            with client.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Count from 1 to 5."}],
                "stream": True,
                "max_tokens": 64,
            }) as resp:
                if resp.status_code != 200:
                    raw = "".join(resp.iter_lines())
                    dur = time.time() - start
                    return TestResult("basic stream", False, f"HTTP {resp.status_code}: {raw[:200]}", dur)
                text = ""
                thought = ""
                # Accumulate SSE deltas; malformed chunks are skipped.
                for line in resp.iter_lines():
                    if not line or line == "data: [DONE]" or not line.startswith("data: "):
                        continue
                    try:
                        delta = safe_delta(json.loads(line[6:]))
                    except json.JSONDecodeError:
                        continue
                    text += delta.get("content") or ""
                    thought += delta.get("reasoning") or ""
                dur = time.time() - start
                if text:
                    return TestResult("basic stream", True, f"Got: {text[:80]}", dur)
                if thought:
                    return TestResult("basic stream", True, f"Reasoning-only: {thought[:80]}", dur)
                return TestResult("basic stream", False, "No content or reasoning received", dur)
        except Exception as e:
            return TestResult("basic stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_toolcall_nonstream(cfg: ModelConfig) -> TestResult:
    """3. Tool call — non-streaming."""
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code != 200:
                return TestResult("tool call non-stream", False, f"HTTP {resp.status_code}: {json.dumps(body)[:200]}", dur)
            msg = safe_message(body)
            calls = msg.get("tool_calls") or []
            if not calls:
                # Model answered in prose instead of calling the tool.
                content, reasoning = extract_content(msg)
                shown = content or reasoning or "(empty)"
                return TestResult("tool call non-stream", False, f"No tool call. Response: {shown[:100]}", dur)
            fn = calls[0].get("function", {})
            return TestResult("tool call non-stream", True,
                              f"Tool: {fn.get('name','?')}, args: {fn.get('arguments','')[:60]}", dur)
        except Exception as e:
            return TestResult("tool call non-stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_toolcall_stream(cfg: ModelConfig) -> TestResult:
    """4. Tool call — streaming.

    Streams the response, reassembling the tool name and arguments from
    the per-chunk deltas; passes iff a tool name was ever seen.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
                "tools": [WEATHER_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 256,
            }) as r:
                if r.status_code != 200:
                    body = "".join(r.iter_lines())
                    dur = time.time() - start
                    # FIX: the error body was read but never reported; include a
                    # snippet in the detail, consistent with test_basic_stream.
                    return TestResult("tool call stream", False, f"HTTP {r.status_code}: {body[:200]}", dur)
                tool_name = None
                accumulated_args = ""
                content_parts = ""
                reasoning_parts = ""
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            # Tool name may arrive once; arguments arrive in pieces.
                            tc_list = delta.get("tool_calls") or []
                            for tc in tc_list:
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_parts += delta["content"]
                            if delta.get("reasoning"):
                                reasoning_parts += delta["reasoning"]
                        except json.JSONDecodeError:
                            pass
                dur = time.time() - start
                if tool_name:
                    return TestResult("tool call stream", True,
                                      f"Tool: {tool_name}, args: {accumulated_args[:60]}", dur)
                else:
                    out = content_parts or reasoning_parts or "(empty)"
                    return TestResult("tool call stream", False, f"No tool call. Response: {out[:100]}", dur)
        except Exception as e:
            return TestResult("tool call stream", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tool_response_flow(cfg: ModelConfig, streaming: bool = False) -> TestResult:
    """5/6. Full tool call → response → follow-up flow.

    Three steps: (1) elicit a tool call (streamed or not, per *streaming*);
    (2) append a fabricated tool result (Tokyo, 22°C) to the conversation;
    (3) ask for a follow-up and check the model actually used the tool data
    (the string "22" appears and no "I can't access..." disclaimer does).
    """
    label = "tool response flow (stream)" if streaming else "tool response flow"
    with make_client(cfg) as c:
        start = time.time()
        try:
            messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

            # Step 1: Get tool call
            if not streaming:
                r = c.post(f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": False,
                    "max_tokens": 256,
                })
                body = r.json()
                if r.status_code != 200:
                    return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                msg = safe_message(body)
            else:
                # Streamed variant: reassemble the assistant tool-call message
                # from the deltas so step 2 can replay it verbatim.
                tool_name = None
                tool_id = None
                accumulated_args = ""
                with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                    "model": cfg.model,
                    "messages": messages,
                    "tools": [WEATHER_TOOL],
                    "tool_choice": "auto",
                    "stream": True,
                    "max_tokens": 256,
                }) as r:
                    if r.status_code != 200:
                        return TestResult(label, False, f"Step 1 HTTP {r.status_code}", time.time() - start)
                    for line in r.iter_lines():
                        if not line or line == "data: [DONE]":
                            continue
                        if line.startswith("data: "):
                            try:
                                chunk = json.loads(line[6:])
                                delta = safe_delta(chunk)
                                for tc in (delta.get("tool_calls") or []):
                                    if tc.get("id"):
                                        tool_id = tc["id"]
                                    fn = tc.get("function") or {}
                                    if fn.get("name"):
                                        tool_name = fn["name"]
                                    if fn.get("arguments"):
                                        accumulated_args += fn["arguments"]
                            except json.JSONDecodeError:
                                pass
                if not tool_name:
                    return TestResult(label, False, "No tool call in step 1", time.time() - start)
                # Synthesize the assistant message the non-streaming path
                # would have received.
                msg = {
                    "role": "assistant",
                    "tool_calls": [{
                        "id": tool_id or "call_0",
                        "type": "function",
                        "function": {"name": tool_name, "arguments": accumulated_args}
                    }]
                }

            tool_calls = msg.get("tool_calls") or []
            if not tool_calls:
                return TestResult(label, False, "No tool call in step 1", time.time() - start)

            tc = tool_calls[0]
            tc_id = tc.get("id", "call_0")

            # Step 2: Send tool response
            messages.append(msg)
            messages.append({
                "role": "tool",
                "tool_call_id": tc_id,
                "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
            })

            # Step 3: Get follow-up
            r2 = c.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": messages,
                "tools": [WEATHER_TOOL],
                "stream": False,
                "max_tokens": 256,
            })
            body2 = r2.json()
            dur = time.time() - start
            if r2.status_code != 200:
                return TestResult(label, False, f"Step 3 HTTP {r2.status_code}", dur)

            final_msg = safe_message(body2)
            final_content, final_reasoning = extract_content(final_msg)
            final = final_content or final_reasoning or ""

            # Check the model actually used the tool data: "22" must appear,
            # and canned "no access to real-time data" disclaimers fail it.
            ok = "22" in final
            indicators = ["i don't have", "i cannot access", "don't have access", "cannot provide real-time"]
            for ind in indicators:
                if ind in final.lower():
                    ok = False
                    break
            if not final_content and final_reasoning:
                return TestResult(label, ok, f"Reasoning-only (used data: {'yes' if ok else 'no'}) — {final[:100]}", dur)
            return TestResult(label, ok, f"{'Used' if ok else 'Did NOT use'} tool result — {final[:100]}", dur)
        except Exception as e:
            return TestResult(label, False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bad_tool_schema(cfg: ModelConfig) -> TestResult:
    """7. OpenClaw-style tool with properties=[] (tests schema validation/middleware)."""
    title = "bad tool schema (properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Search for cats"}],
                "tools": [BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code == 200:
                # Pass: endpoint tolerated (or silently repaired) the schema.
                return TestResult(title, True, "Endpoint accepted/fixed bad schema", dur)
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(title, False, f"HTTP {resp.status_code}: {err}", dur)
        except Exception as e:
            return TestResult(title, False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_nested_bad_schema(cfg: ModelConfig) -> TestResult:
    """8. Nested properties=[] inside items (the Tool 21 bug)."""
    title = "nested bad schema (items.properties=[])"
    with make_client(cfg) as client:
        start = time.time()
        try:
            resp = client.post(f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{"role": "user", "content": "Send a message to Bob"}],
                "tools": [NESTED_BAD_SCHEMA_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 128,
            })
            body = resp.json()
            dur = time.time() - start
            if resp.status_code == 200:
                # Pass: endpoint tolerated (or silently repaired) the nesting.
                return TestResult(title, True, "Endpoint accepted/fixed nested bad schema", dur)
            try:
                err = body.get("error", {}).get("message", "")[:150]
            except Exception:
                err = json.dumps(body)[:150]
            return TestResult(title, False, f"HTTP {resp.status_code}: {err}", dur)
        except Exception as e:
            return TestResult(title, False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_streaming_tool_chunks(cfg: ModelConfig) -> TestResult:
    """9. Streaming tool call chunking — are args actually streamed in multiple chunks?

    Asks for a write_file call (whose 'content' argument tends to be long)
    and counts how many delta chunks carry argument fragments. Multiple
    chunks = genuinely streamed; a single large chunk = buffered server-side.
    """
    with make_client(cfg) as c:
        start = time.time()
        try:
            with c.stream("POST", f"{cfg.api_base}/chat/completions", json={
                "model": cfg.model,
                "messages": [{
                    "role": "user",
                    "content": "Write a Python hello world and save it using the write_file tool."
                }],
                "tools": [WRITE_FILE_TOOL],
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 1024,
            }) as r:
                if r.status_code != 200:
                    dur = time.time() - start
                    return TestResult("streaming tool chunking", False, f"HTTP {r.status_code}", dur)

                tool_name = None
                arg_chunks = 0          # number of deltas carrying argument text
                accumulated_args = ""
                content_chunks = 0      # plain-text deltas (model ignored the tool)
                reasoning_chunks = 0    # reasoning deltas (reasoning models)
                for line in r.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if line.startswith("data: "):
                        try:
                            chunk = json.loads(line[6:])
                            delta = safe_delta(chunk)
                            for tc in (delta.get("tool_calls") or []):
                                fn = tc.get("function") or {}
                                if fn.get("name"):
                                    tool_name = fn["name"]
                                if fn.get("arguments"):
                                    arg_chunks += 1
                                    accumulated_args += fn["arguments"]
                            if delta.get("content"):
                                content_chunks += 1
                            if delta.get("reasoning"):
                                reasoning_chunks += 1
                        except json.JSONDecodeError:
                            pass

                dur = time.time() - start
                if not tool_name:
                    if content_chunks > 0 or reasoning_chunks > 0:
                        return TestResult("streaming tool chunking", False,
                            f"No tool call — model produced {content_chunks} content + {reasoning_chunks} reasoning chunks", dur)
                    return TestResult("streaming tool chunking", False, "No tool call and no content", dur)

                # Evaluate chunking quality
                if arg_chunks > 1:
                    # Genuinely streamed.
                    return TestResult("streaming tool chunking", True,
                        f"Args streamed in {arg_chunks} chunks ({len(accumulated_args)} chars)", dur)
                elif arg_chunks == 1 and len(accumulated_args) > 500:
                    # One big chunk for a long payload: server buffered the call.
                    return TestResult("streaming tool chunking", False,
                        f"Args in 1 chunk but {len(accumulated_args)} chars — buffered, not streamed", dur)
                elif arg_chunks == 1:
                    # Short args can legitimately fit a single chunk.
                    return TestResult("streaming tool chunking", True,
                        f"Args in 1 chunk ({len(accumulated_args)} chars — may be too short to stream)", dur)
                else:
                    return TestResult("streaming tool chunking", False, "Tool name only, no arg chunks", dur)
        except Exception as e:
            return TestResult("streaming tool chunking", False, f"Exception: {e}", time.time() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def test_param_sweep(cfg: ModelConfig) -> list[TestResult]:
    """10. Parameter sweep — which vLLM params does the endpoint accept?

    Sends one minimal chat request per extra parameter and records whether
    the endpoint returns HTTP 200. Entries whose name contains '+' are a
    bundle of several request fields merged together (they must be sent in
    the same request to be meaningful, e.g. logprobs + top_logprobs).
    """
    results = []
    base_req = {
        "model": cfg.model,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
    ]

    with make_client(cfg) as c:
        for name, val in extra_params:
            start = time.time()
            try:
                # BUG FIX: previously ANY dict value was spread into the request
                # (req = {**base_req, **val}), so "chat_template_kwargs" and
                # "response_format" were never actually sent under their own
                # names — their contents leaked in as top-level fields instead.
                # Only "+"-named bundles are meant to be merged.
                if "+" in name and isinstance(val, dict):
                    req = {**base_req, **val}
                else:
                    req = {**base_req, name: val}
                r = c.post(f"{cfg.api_base}/chat/completions", json=req)
                dur = time.time() - start
                ok = r.status_code == 200
                detail = f"HTTP {r.status_code}"
                if not ok:
                    try:
                        detail += f": {r.json().get('error', {}).get('message', '')[:80]}"
                    except Exception:
                        pass  # non-JSON error body — keep the bare status
                results.append(TestResult(f"param: {name}", ok, detail, dur))
            except Exception as e:
                results.append(TestResult(f"param: {name}", False, f"Exception: {e}", time.time() - start))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
# ── Suite runner ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _tool_flow_nonstream(cfg):
    """5. Full tool call → response → follow-up flow (non-streaming)."""
    return test_tool_response_flow(cfg, streaming=False)


def _tool_flow_stream(cfg):
    """6. Full tool call → response → follow-up flow (streaming)."""
    return test_tool_response_flow(cfg, streaming=True)


# Ordered list of test callables run by run_suite(). run_suite labels each
# test by its docstring (falling back to __name__), so the parameterized
# flow variants use named, documented wrappers instead of lambdas — the
# previous lambdas had no docstring and printed as "<lambda>".
ALL_TESTS = [
    test_basic_nonstream,
    test_basic_stream,
    test_toolcall_nonstream,
    test_toolcall_stream,
    _tool_flow_nonstream,
    _tool_flow_stream,
    test_bad_tool_schema,
    test_nested_bad_schema,
    test_streaming_tool_chunks,
]
|
||||||
|
|
||||||
|
|
||||||
|
def run_suite(cfg: ModelConfig, verbose: bool = True) -> SuiteResult:
    """Run the full test suite against one model config.

    Executes every callable in ALL_TESTS in order, then the parameter
    sweep, collecting everything into one SuiteResult. With *verbose*,
    prints a per-test progress line and pass/fail detail.
    """
    result = SuiteResult(model=cfg.model)

    print(f"\n{'='*60}")
    print(f"Testing: {cfg.model}")
    print(f"API: {cfg.api_base}")
    print(f"{'='*60}")

    for test_fn in ALL_TESTS:
        # Label comes from the first docstring line; falls back to __name__
        # (lambdas in ALL_TESTS therefore display as "<lambda>").
        name = (test_fn.__doc__ or "").strip().split("\n")[0] or test_fn.__name__
        if verbose:
            print(f"\n[{ts()}] Running: {name}...")

        # A test may return a single TestResult or a list of them.
        tr = test_fn(cfg)
        if isinstance(tr, list):
            result.results.extend(tr)
        else:
            result.results.append(tr)

        if verbose:
            if isinstance(tr, list):
                for r in tr:
                    s = "✓" if r.passed else "✗"
                    print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)")
            else:
                s = "✓" if tr.passed else "✗"
                print(f" {s} {tr.name}: {tr.detail} ({tr.duration_s:.1f}s)")

    # Param sweep
    if verbose:
        print(f"\n[{ts()}] Running: parameter sweep...")
    sweep_results = test_param_sweep(cfg)
    result.results.extend(sweep_results)
    if verbose:
        for r in sweep_results:
            s = "✓" if r.passed else "✗"
            print(f" {s} {r.name}: {r.detail} ({r.duration_s:.1f}s)")

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(results: list[SuiteResult]):
    """Print a final summary across all models.

    Two sections: a per-model pass count (with each failure listed), then a
    cross-model ✓/✗ comparison table over the key named tests.
    """
    print(f"\n\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")

    for sr in results:
        passed = sr.passed
        total = sr.total
        pct = (passed / total * 100) if total else 0
        label = sr.model.split("/")[-1]
        print(f"\n {label}: {passed}/{total} passed ({pct:.0f}%)")

        # List only the failures — passes are implied by the count.
        for r in sr.results:
            if not r.passed:
                print(f" ✗ {r.name}: {r.detail[:80]}")

    # Cross-model comparison for key tests
    print(f"\n{'─'*60}")
    print("CROSS-MODEL COMPARISON")
    print(f"{'─'*60}")
    # These names must match the TestResult.name strings the tests emit.
    key_tests = [
        "basic non-stream",
        "basic stream",
        "tool call non-stream",
        "tool call stream",
        "tool response flow",
        "tool response flow (stream)",
        "streaming tool chunking",
        "bad tool schema (properties=[])",
        "nested bad schema (items.properties=[])",
    ]

    # Calculate column width
    labels = [sr.model.split("/")[-1][:18] for sr in results]
    col_w = max(len(l) for l in labels) if labels else 16
    col_w = max(col_w, 16)  # floor so the table stays readable

    header = f"{'Test':<40}"
    for l in labels:
        header += f" {l:>{col_w}}"
    print(header)
    print("─" * len(header))

    for test_name in key_tests:
        row = f"{test_name:<40}"
        for sr in results:
            # First result with a matching name; "—" when the model has none.
            match = [r for r in sr.results if r.name == test_name]
            if match:
                status = "✓" if match[0].passed else "✗"
                row += f" {status:>{col_w}}"
            else:
                row += f" {'—':>{col_w}}"
        print(row)

    print(f"\n{'='*60}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── CLI ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: pick model configs, run the suite, exit non-zero on failure.

    Selection precedence: --all > --model N > --filter NAME > TOOLTEST_*
    environment variables. The first three read models.env next to this file.
    """
    parser = argparse.ArgumentParser(description="Universal model tool-call test suite")
    parser.add_argument("--all", action="store_true", help="Test all models from models.env")
    parser.add_argument("--model", type=int, help="Test model by 1-based index from models.env")
    parser.add_argument("--filter", type=str, help="Test models matching substring")
    parser.add_argument("--quiet", action="store_true", help="Less output per test")
    args = parser.parse_args()

    models_path = Path(__file__).parent / "models.env"

    def require_models_env():
        # Shared guard for every mode that needs models.env.
        if not models_path.exists():
            print("ERROR: models.env not found")
            sys.exit(1)

    configs: list[ModelConfig] = []

    if args.all:
        require_models_env()
        configs = load_models_env(models_path)
    elif args.model is not None:
        # BUG FIX: was `elif args.model:` — `--model 0` is falsy, so it fell
        # through to the env-var path instead of reporting an out-of-range index.
        require_models_env()
        all_configs = load_models_env(models_path)
        if args.model < 1 or args.model > len(all_configs):
            print(f"ERROR: --model index {args.model} out of range (1-{len(all_configs)})")
            sys.exit(1)
        configs = [all_configs[args.model - 1]]
    elif args.filter:
        require_models_env()
        all_configs = load_models_env(models_path)
        configs = [c for c in all_configs if args.filter.lower() in c.model.lower()]
        if not configs:
            print(f"No models matching '{args.filter}'")
            sys.exit(1)
    else:
        # No CLI selection — fall back to TOOLTEST_* environment variables.
        cfg = config_from_env()
        if cfg:
            configs = [cfg]
        else:
            print("No model specified. Use --all, --model N, --filter NAME, or set TOOLTEST_* env vars.")
            if models_path.exists():
                print("\nAvailable models from models.env:")
                for i, c in enumerate(load_models_env(models_path), 1):
                    print(f" {i}. {c.model} @ {c.api_base}")
            sys.exit(1)

    all_results: list[SuiteResult] = []
    for cfg in configs:
        sr = run_suite(cfg, verbose=not args.quiet)
        all_results.append(sr)

    print_summary(all_results)

    # Non-zero exit for CI when any test failed anywhere.
    if any(sr.passed < sr.total for sr in all_results):
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point — run the CLI when executed directly.
if __name__ == "__main__":
    main()
|
||||||
23
run_tests.sh
23
run_tests.sh
@@ -1,19 +1,14 @@
|
|||||||
#!/bin/bash
|
#!/usr/bin/env bash
|
||||||
# Run the streaming tool call tests
|
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
# Default values
|
# Usage:
|
||||||
export VLLM_API_BASE="${VLLM_API_BASE:-http://95.179.247.150/v1}"
|
# ./run_tests.sh # Test all models from models.env
|
||||||
export VLLM_API_KEY="${VLLM_API_KEY:-none}"
|
# ./run_tests.sh --model 1 # Test model #1
|
||||||
export VLLM_MODEL="${VLLM_MODEL:-HuggingFaceTB/SmolLM3-3B}"
|
# ./run_tests.sh --filter Devstral # Test matching models
|
||||||
|
# ./run_tests.sh --all # Same as no args
|
||||||
|
# ./run_tests.sh --quiet # Less output
|
||||||
|
|
||||||
echo "Configuration:"
|
cd "$SCRIPT_DIR"
|
||||||
echo " API_BASE: $VLLM_API_BASE"
|
python3 -u run_suite.py "$@"
|
||||||
echo " MODEL: $VLLM_MODEL"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Run the test
|
|
||||||
python3 "$SCRIPT_DIR/test_streaming_tool_calls.py"
|
|
||||||
|
|||||||
546
test_devstral.py
546
test_devstral.py
@@ -1,546 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test suite for mistralai/Devstral-2-123B-Instruct-2512 via SGLang middleware.
|
|
||||||
|
|
||||||
These tests send EXACTLY what OpenClaw would send to vLLM — including
|
|
||||||
chat_template_kwargs, logprobs, weird tool schemas, the works.
|
|
||||||
The middleware's job is to strip/fix all of it so SGLang doesn't choke.
|
|
||||||
|
|
||||||
Architecture: this test → middleware (strips bad params) → SGLang
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import httpx
|
|
||||||
from datetime import datetime
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Load .env if present (don't hardcode keys)
|
|
||||||
_env_file = Path(__file__).parent / ".env"
|
|
||||||
if _env_file.exists():
|
|
||||||
for line in _env_file.read_text().splitlines():
|
|
||||||
line = line.strip()
|
|
||||||
if not line or line.startswith("#") or "=" not in line:
|
|
||||||
continue
|
|
||||||
k, v = line.split("=", 1)
|
|
||||||
os.environ.setdefault(k.strip(), v.strip())
|
|
||||||
|
|
||||||
API_BASE = os.environ.get("DEVSTRAL_API_BASE", "http://127.0.0.1:8002/v1")
|
|
||||||
API_KEY = os.environ.get("DEVSTRAL_API_KEY", "whatever")
|
|
||||||
MODEL = os.environ.get("DEVSTRAL_MODEL", "mistralai/Devstral-2-123B-Instruct-2512")
|
|
||||||
|
|
||||||
RESULTS = []
|
|
||||||
|
|
||||||
|
|
||||||
def ts():
    """Return the current wall-clock time as ``HH:MM:SS.mmm`` (millisecond precision)."""
    now = datetime.now()
    return now.strftime("%H:%M:%S.%f")[:-3]
|
|
||||||
|
|
||||||
|
|
||||||
def record(name, ok, detail=""):
    """Print a PASS/FAIL line for *name* and append the outcome to the global RESULTS list."""
    marker = "✓ PASS" if ok else "✗ FAIL"
    print(f"\n{marker}: {name}")
    if detail:
        print(f" {detail}")
    RESULTS.append({"name": name, "pass": ok, "detail": detail})
|
|
||||||
|
|
||||||
|
|
||||||
def make_client():
    """Build an ``httpx.Client`` preconfigured with bearer auth, JSON content type, and a 120s timeout."""
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    return httpx.Client(timeout=120.0, headers=headers)
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1. Basic non-streaming chat ──────────────────────────────
|
|
||||||
|
|
||||||
def test_basic_nonstream():
    """Basic non-streaming chat: expect HTTP 200 with some assistant text.

    Fix: reasoning models can return ``"content": null`` (text lives in a
    separate 'reasoning' field), and some error paths return empty choices.
    The original indexed ``body["choices"][0]["message"]["content"]`` and
    sliced it directly, which raises IndexError/TypeError in those cases.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic non-streaming chat")
    print(f"{'='*60}")

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
            "max_tokens": 32,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("basic non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        choices = body.get("choices") or []
        if not choices:
            record("basic non-stream", False, "Response had no choices")
            return
        # 'content' may be JSON null — coalesce before slicing.
        content = choices[0].get("message", {}).get("content") or ""
        print(f"[{ts()}] Reply: {content[:100]}")
        record("basic non-stream", True, f"Got: {content[:80]}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 2. Basic streaming chat ──────────────────────────────────
|
|
||||||
|
|
||||||
def test_basic_stream():
    """Basic streaming chat over SSE: concatenate content deltas, expect some text.

    Passes whenever the stream returns HTTP 200 — the accumulated text is
    only reported, not asserted on.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Basic streaming chat")
    print(f"{'='*60}")

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Count from 1 to 5."}],
            "stream": True,
            "max_tokens": 64,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                # On error the body is not SSE — drain it as plain lines for the log.
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("basic stream", False, f"HTTP {r.status_code}")
                return
            full = ""
            for line in r.iter_lines():
                # Skip keep-alive blanks and the SSE terminator sentinel.
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        # Some servers emit chunks with no choices (e.g. usage-only frames).
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("content"):
                            full += delta["content"]
                    except json.JSONDecodeError:
                        # Tolerate malformed frames rather than aborting the stream.
                        pass
            print(f"[{ts()}] Reply: {full[:100]}")
            record("basic stream", True, f"Got: {full[:80]}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 3. Tool call — non-streaming (vLLM-style tool schema) ───
|
|
||||||
|
|
||||||
def test_toolcall_nonstream():
    """Non-streaming tool call: the model should invoke get_weather for Tokyo.

    Fix: ``msg.get("content", "")`` still returns None when the response
    carries an explicit JSON null (common for reasoning models), so the
    subsequent ``content[:200]`` raised TypeError. Coalesce with ``or ""``.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call non-streaming (vLLM-style)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)}")
            record("tool call non-stream", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool: {tc['function']['name']}, args: {tc['function']['arguments']}")
            record("tool call non-stream", True, f"Got tool call: {tc['function']['name']}")
        else:
            # 'content' may be JSON null — coalesce so slicing is safe.
            content = msg.get("content") or ""
            print(f"[{ts()}] No tool call. Content: {content[:200]}")
            record("tool call non-stream", False, "Model did not call the tool")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 4. Tool call — streaming ────────────────────────────────
|
|
||||||
|
|
||||||
def test_toolcall_stream():
    """Streaming tool call: accumulate name/argument deltas; pass iff a tool name was seen.

    The tool-call name typically arrives in the first delta and the JSON
    arguments arrive as string fragments across subsequent deltas.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Tool call streaming")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    with make_client() as c:
        with c.stream("POST", f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": True,
            "max_tokens": 256,
        }) as r:
            print(f"[{ts()}] Status: {r.status_code}")
            if r.status_code != 200:
                # Error bodies are plain JSON, not SSE — drain for the log.
                body = "".join(r.iter_lines())
                print(f"[{ts()}] Error: {body[:300]}")
                record("tool call stream", False, f"HTTP {r.status_code}")
                return
            tool_name = None                 # last tool-call name seen in any delta
            accumulated_args = ""            # concatenated argument fragments
            content_parts = ""               # plain-text fallback if no tool call happens
            for line in r.iter_lines():
                if not line or line == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if not chunk.get("choices"): continue
                        delta = chunk["choices"][0].get("delta", {})
                        if delta.get("tool_calls"):
                            for tc in delta["tool_calls"]:
                                if tc.get("function", {}).get("name"):
                                    tool_name = tc["function"]["name"]
                                if tc.get("function", {}).get("arguments"):
                                    accumulated_args += tc["function"]["arguments"]
                        if delta.get("content"):
                            content_parts += delta["content"]
                    except json.JSONDecodeError:
                        # Skip malformed SSE frames instead of aborting.
                        pass

            if tool_name:
                print(f"[{ts()}] Tool: {tool_name}, args: {accumulated_args}")
                record("tool call stream", True, f"Got tool call: {tool_name}")
            else:
                print(f"[{ts()}] No tool call. Content: {content_parts[:200]}")
                record("tool call stream", False, "Model did not call the tool")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 5. Full tool response flow (non-streaming) ──────────────
|
|
||||||
|
|
||||||
def test_tool_response_flow():
    """Two-turn tool flow: model calls get_weather, we feed back a result, model must use it.

    Step 1 asks for Tokyo weather; step 2 appends the assistant tool_call
    message plus a synthetic tool result (22°C) and checks "22" appears in
    the final reply.

    Fix: ``.get("content", "")`` on the final message can still return None
    when the server sends an explicit JSON null (reasoning models), making
    ``final[:200]`` and ``"22" in final`` raise TypeError. Coalesce with
    ``or ""``.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Full tool response flow (non-streaming)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City, e.g. 'Tokyo'"}
                },
                "required": ["location"]
            }
        }
    }]

    messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
        })
        body = r.json()
        if r.status_code != 200:
            record("tool response flow", False, f"Step 1 failed: HTTP {r.status_code}")
            return
        msg = body["choices"][0]["message"]
        if not msg.get("tool_calls"):
            record("tool response flow", False, "No tool call in step 1")
            return

        tc = msg["tool_calls"][0]
        tc_id = tc["id"]
        print(f"[{ts()}] Tool call: {tc['function']['name']} (id={tc_id})")

        # Echo the assistant message back verbatim, then attach our tool result.
        messages.append(msg)
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps({"location": "Tokyo", "temperature": "22°C", "condition": "Partly cloudy"}),
        })

        r2 = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": messages,
            "tools": tools,
            "stream": False,
            "max_tokens": 256,
        })
        body2 = r2.json()
        if r2.status_code != 200:
            print(f"[{ts()}] Step 2 error: {json.dumps(body2, indent=2)}")
            record("tool response flow", False, f"Step 2 failed: HTTP {r2.status_code}")
            return

        # 'content' may be JSON null — coalesce before slicing / substring test.
        final = body2["choices"][0]["message"].get("content") or ""
        print(f"[{ts()}] Final: {final[:200]}")
        ok = "22" in final
        record("tool response flow", ok, f"Model used tool result: {'yes' if ok else 'no'} — {final[:100]}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 6. Param sweep — everything OpenClaw/vLLM sends ─────────
|
|
||||||
|
|
||||||
def test_param_sweep():
    """
    Sends EVERY param that OpenClaw or vLLM might include.
    The middleware must strip/fix the ones SGLang rejects.

    Note: only failures are recorded — an accepted param produces a log
    line but no RESULTS entry, so this test can only subtract from the
    pass count, never add to it.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Parameter sweep (vLLM-compat, middleware must fix)")
    print(f"{'='*60}")

    base_req = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "stream": False,
        "max_tokens": 32,
    }

    # Params that OpenClaw/vLLM might send — some SGLang rejects
    extra_params = [
        ("chat_template_kwargs", {"enable_thinking": False}),
        ("guided_json", None),
        ("guided_regex", None),
        ("response_format", {"type": "json_object"}),
        ("n", 1),
        ("presence_penalty", 0.0),
        ("frequency_penalty", 0.0),
        ("top_p", 1.0),
        ("temperature", 0.7),
        ("seed", 42),
        ("stop", ["\n"]),
        # Composite entry: the dict value is merged in as two real params below
        # only if the endpoint treats the literal key as unknown — NOTE(review):
        # as written this sends the key "logprobs+top_logprobs" verbatim; verify
        # that is intentional.
        ("logprobs+top_logprobs", {"logprobs": True, "top_logprobs": 5}),
        ("top_logprobs", 5),
    ]

    with make_client() as c:
        # baseline
        r = c.post(f"{API_BASE}/chat/completions", json=base_req)
        print(f"[{ts()}] Baseline: {r.status_code}")

        for name, val in extra_params:
            # Each request carries the base payload plus exactly one extra param.
            req = {**base_req, name: val}
            r = c.post(f"{API_BASE}/chat/completions", json=req)
            status = "✓" if r.status_code == 200 else "✗"
            detail = ""
            if r.status_code != 200:
                try:
                    detail = r.json().get("error", {}).get("message", "")[:100]
                except Exception:
                    detail = r.text[:100]
            print(f"[{ts()}] {status} {name}={val!r} → HTTP {r.status_code} {detail}")
            if r.status_code != 200:
                record(f"param sweep: {name}", False, f"HTTP {r.status_code} with {name}={val!r}: {detail}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 7. OpenClaw-style tool schema (the one that caused 400) ─
|
|
||||||
|
|
||||||
def test_openclaw_tool_schema():
    """
    Reproduce the exact tool schema that OpenClaw sends which has
    parameters.properties = [] instead of {}. Middleware must fix it.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw-style tool schema (bad properties)")
    print(f"{'='*60}")

    # Exact shape OpenClaw emits for a tool that takes no parameters.
    bad_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web",
            "parameters": {
                "type": "object",
                "properties": []  # <-- THIS is what causes the 400
            }
        }
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Search for cats"}],
        "tools": [bad_tool],
        "tool_choice": "auto",
        "stream": False,
        "max_tokens": 128,
    }

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json=payload)
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw tool schema", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        print(f"[{ts()}] Success — middleware fixed the bad schema")
        record("openclaw tool schema", True, "Middleware fixed parameters.properties=[] → {}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 8. Nested properties=[] in tool schema (Tool 21 bug) ────
|
|
||||||
|
|
||||||
def test_nested_bad_properties():
    """
    Reproduce the exact Tool 21 400 error:
        schema['properties']['fields']['items']['properties'] = []

    This happens when a tool has an array-of-objects parameter where
    the items' properties field is [] instead of {}. The middleware
    must recurse into the schema to fix ALL properties fields.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: Nested properties=[] in tool schema (Tool 21 bug)")
    print(f"{'='*60}")

    # This is the exact shape that causes: "Tool 21 function has invalid 'parameters' schema:
    # [] is not of type 'object' ... On schema['properties']['fields']['items']['properties']"
    tools = [{
        "type": "function",
        "function": {
            "name": "message",
            "description": "Send a message",
            "parameters": {
                "type": "object",
                "properties": {
                    "fields": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": []  # <-- THIS causes the 400
                        }
                    }
                }
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Send a message to Bob"}],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 128,
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:500]}")
            record("nested bad properties", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        # Any 200 means the middleware sanitized the nested schema before SGLang saw it.
        print(f"[{ts()}] Success — middleware fixed nested properties=[] to {{}}")
        record("nested bad properties", True, "Middleware fixed nested properties.properties=[] to {}")
|
|
||||||
|
|
||||||
|
|
||||||
# ── 9. OpenClaw full payload (chat_template_kwargs + tools) ─
|
|
||||||
|
|
||||||
def test_openclaw_full_payload():
    """
    The kitchen sink: chat_template_kwargs + logprobs + tools with bad schemas.
    Exactly what OpenClaw sends through the pipe.

    Fix: ``msg.get('content', '')`` can still yield None when the server
    sends an explicit JSON null (reasoning models), so slicing it crashed.
    Coalesce with ``or ''`` before slicing.
    """
    print(f"\n{'='*60}")
    print(f"[{ts()}] TEST: OpenClaw full payload (kitchen sink)")
    print(f"{'='*60}")

    tools = [{
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web using DuckDuckGo.",
            "parameters": {
                "type": "object",
                "properties": []  # Bad — middleware must fix
            }
        }
    }]

    with make_client() as c:
        r = c.post(f"{API_BASE}/chat/completions", json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Search for the weather in NYC"},
            ],
            "tools": tools,
            "tool_choice": "auto",
            "stream": False,
            "max_tokens": 256,
            "chat_template_kwargs": {"enable_thinking": False},  # Bad — middleware must strip
            "logprobs": True,  # Bad — middleware must strip
            "top_logprobs": 5,  # Bad — middleware must strip
        })
        print(f"[{ts()}] Status: {r.status_code}")
        body = r.json()
        if r.status_code != 200:
            print(f"[{ts()}] Error: {json.dumps(body, indent=2)[:300]}")
            record("openclaw full payload", False, f"HTTP {r.status_code}: {json.dumps(body)[:200]}")
            return
        msg = body["choices"][0]["message"]
        print(f"[{ts()}] Success — middleware cleaned everything")
        if msg.get("tool_calls"):
            tc = msg["tool_calls"][0]
            print(f"[{ts()}] Tool call: {tc['function']['name']}")
        else:
            # 'content' may be JSON null — coalesce before slicing.
            print(f"[{ts()}] No tool call, content: {(msg.get('content') or '')[:100]}")
        record("openclaw full payload", True, "Full OpenClaw payload survived the middleware")
|
|
||||||
|
|
||||||
|
|
||||||
# ── Main ─────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def main():
    """Run every test in order, then print an aggregate pass/fail report."""
    print(f"\n{'='*60}")
    print(f"Devstral-2-123B Test Suite (vLLM-compat, via middleware)")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}")

    # Fixed execution order; each test appends its outcome to RESULTS.
    suite = (
        test_basic_nonstream,
        test_basic_stream,
        test_toolcall_nonstream,
        test_toolcall_stream,
        test_tool_response_flow,
        test_param_sweep,
        test_openclaw_tool_schema,
        test_nested_bad_properties,
        test_openclaw_full_payload,
    )
    for test_fn in suite:
        test_fn()

    print(f"\n\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    for r in RESULTS:
        s = "✓" if r["pass"] else "✗"
        print(f" {s} {r['name']}: {r['detail']}")
    passed = sum(1 for r in RESULTS if r["pass"])
    print(f"\n {passed}/{len(RESULTS)} passed")
    print(f"{'='*60}")
|
|
||||||
@@ -1,395 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test suite for vLLM GLM-5.1 streaming tool calls.
|
|
||||||
|
|
||||||
Reproduces the issue where long string parameters in tool calls
|
|
||||||
are buffered entirely before being emitted during streaming.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import httpx
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
# Configuration - will be set via environment or direct assignment
|
|
||||||
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
|
|
||||||
API_KEY = os.environ.get("VLLM_API_KEY", "none")
|
|
||||||
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
|
|
||||||
|
|
||||||
|
|
||||||
def timestamp():
    """Return the current local time formatted ``HH:MM:SS.mmm``."""
    full = datetime.now().strftime("%H:%M:%S.%f")
    return full[:-3]
|
|
||||||
|
|
||||||
|
|
||||||
def test_streaming_tool_call_with_code():
    """
    Test streaming a tool call with a long string parameter.

    This prompts the model to generate code via a tool call,
    which should stream incrementally if the patch works correctly.

    Verdict logic: >1 argument chunk → PASS (incremental streaming);
    exactly 1 chunk over 1000 chars → FAIL (server buffered the whole
    argument string); anything else → inconclusive.
    Returns a summary dict with chunk counts and timing.
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file. Use this to save code, text, or other content.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {
                            "type": "string",
                            "description": "Name of the file to write"
                        },
                        "content": {
                            "type": "string",
                            "description": "The content to write to the file"
                        }
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Write a Python implementation of a binary search tree with insert, search, and delete methods. Include docstrings and type hints. Save it to bst.py using the write_file tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with long string parameter")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")

    # Track streaming events
    chunks_received = []       # every parsed SSE chunk, for the final count
    first_chunk_time = None    # wall time of first chunk (latency metric)
    last_chunk_time = None
    tool_call_chunks = []      # raw argument fragments, in arrival order
    accumulated_content = ""   # concatenation of all argument fragments

    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 4096,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            print(f"[{timestamp()}] Response status: {response.status_code}")

            for line in response.iter_lines():
                # Skip SSE keep-alives and the terminator sentinel.
                if not line or line == "data: [DONE]":
                    continue

                if line.startswith("data: "):
                    chunk_data = line[6:]
                    try:
                        chunk = json.loads(chunk_data)

                        if first_chunk_time is None:
                            first_chunk_time = time.time()
                            print(f"\n[{timestamp()}] FIRST CHUNK RECEIVED ({first_chunk_time - start_time:.3f}s)")

                        last_chunk_time = time.time()
                        chunks_received.append(chunk)

                        # Extract delta content
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})

                            # Check for tool calls in delta
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    tc_index = tc.get("index", 0)
                                    tc_function = tc.get("function", {})

                                    if tc_function.get("name"):
                                        print(f"\n[{timestamp()}] Tool call name: {tc_function['name']}")

                                    if tc_function.get("arguments"):
                                        args_chunk = tc_function["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk

                                        # Print progress every ~500 chars
                                        # (true exactly when the running total just crossed a 500-char boundary)
                                        if len(accumulated_content) % 500 < len(args_chunk):
                                            print(f"[{timestamp()}] Accumulated {len(accumulated_content)} chars...")

                            # Regular content
                            if delta.get("content"):
                                print(f"[{timestamp()}] Content chunk: {delta['content'][:50]}...")

                    except json.JSONDecodeError as e:
                        print(f"[{timestamp()}] JSON decode error: {e}")

    end_time = time.time()

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total chunks received: {len(chunks_received)}")
    print(f"Total time: {end_time - start_time:.3f}s")

    if first_chunk_time:
        print(f"Time to first chunk: {first_chunk_time - start_time:.3f}s")

    if tool_call_chunks:
        print(f"Tool call chunks: {len(tool_call_chunks)}")
        print(f"Total tool call content: {len(accumulated_content)} chars")

        # Try to parse the accumulated arguments
        print(f"\nAttempting to parse tool call arguments...")
        try:
            args = json.loads(accumulated_content)
            print(f"Successfully parsed!")
            print(f" - filename: {args.get('filename', 'N/A')}")
            print(f" - content length: {len(args.get('content', ''))} chars")
        except json.JSONDecodeError as e:
            print(f"Failed to parse: {e}")
            print(f"Raw accumulated content (first 500 chars):\n{accumulated_content[:500]}")

    # Verdict
    print(f"\n{'='*60}")
    if len(tool_call_chunks) > 1:
        print("✓ PASS: Tool call arguments arrived in multiple chunks")
        print(f" Chunks: {len(tool_call_chunks)}, indicating incremental streaming")
    elif len(tool_call_chunks) == 1 and len(accumulated_content) > 1000:
        print("✗ FAIL: Tool call arguments arrived in a single chunk")
        print(" This indicates buffering, not true streaming")
    else:
        print("? INCONCLUSIVE: Not enough data or no tool call occurred")
    print(f"{'='*60}\n")

    return {
        "chunks_received": len(chunks_received),
        "tool_call_chunks": len(tool_call_chunks),
        "accumulated_length": len(accumulated_content),
        "total_time": end_time - start_time
    }
|
|
||||||
|
|
||||||
|
|
||||||
def test_streaming_tool_call_with_json():
    """
    Test streaming a tool call that returns structured JSON data.

    Same buffering check as the code-generation variant, but the long
    argument is a deeply nested JSON config object instead of source code.
    Verdict: >1 argument chunk → PASS; exactly 1 → FAIL (buffered).
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "save_config",
                "description": "Save a configuration object",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "config": {
                            "type": "object",
                            "description": "Configuration object with many fields"
                        }
                    },
                    "required": ["config"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Create a detailed configuration for a web server with the following sections: server (host, port, ssl), logging (level, format, outputs), cache (enabled, ttl, max_size), rate_limiting (enabled, requests_per_minute, burst), cors (enabled, origins, methods, headers), security (headers, csp, hsts). Use the save_config tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Streaming tool call with nested JSON")
    print(f"{'='*60}\n")

    tool_call_chunks = []      # argument fragments in arrival order
    accumulated_content = ""   # full argument string, rebuilt from fragments
    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        with client.stream(
            "POST",
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": True,
                "max_tokens": 2048,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        ) as response:
            for line in response.iter_lines():
                # Skip keep-alive blanks and the SSE terminator.
                if not line or line == "data: [DONE]":
                    continue

                if line.startswith("data: "):
                    try:
                        chunk = json.loads(line[6:])
                        if chunk.get("choices"):
                            delta = chunk["choices"][0].get("delta", {})
                            if delta.get("tool_calls"):
                                for tc in delta["tool_calls"]:
                                    if tc.get("function", {}).get("arguments"):
                                        args_chunk = tc["function"]["arguments"]
                                        tool_call_chunks.append(args_chunk)
                                        accumulated_content += args_chunk
                                        print(f"[{timestamp()}] Chunk {len(tool_call_chunks)}: +{len(args_chunk)} chars (total: {len(accumulated_content)})")
                    except json.JSONDecodeError:
                        # Tolerate malformed frames rather than aborting the stream.
                        pass

    end_time = time.time()

    print(f"\n{'='*60}")
    print(f"Total chunks: {len(tool_call_chunks)}, Total content: {len(accumulated_content)} chars")
    print(f"Time: {end_time - start_time:.3f}s")

    if len(tool_call_chunks) > 1:
        print("✓ PASS: Arguments streamed in multiple chunks")
    elif len(tool_call_chunks) == 1:
        print("✗ FAIL: Arguments arrived in single chunk (buffered)")
    else:
        print("? No tool call occurred")
    print(f"{'='*60}\n")
|
|
||||||
|
|
||||||
|
|
||||||
def test_non_streaming_tool_call():
    """
    Baseline test: non-streaming tool call for comparison.

    Sends a single write_file request with stream=False and reports whether
    the model produced a parseable tool call. Network endpoint, key, and
    model come from the module-level API_BASE / API_KEY / MODEL constants.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "write_file",
                "description": "Write content to a file",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "filename": {"type": "string"},
                        "content": {"type": "string"}
                    },
                    "required": ["filename", "content"]
                }
            }
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "Write a simple Python hello world and save it using the write_file tool."
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Non-streaming tool call (baseline)")
    print(f"{'='*60}\n")

    start_time = time.time()

    with httpx.Client(timeout=120.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 1024,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()
        end_time = time.time()

        print(f"Status: {response.status_code}")
        print(f"Time: {end_time - start_time:.3f}s")

        if result.get("choices"):
            message = result["choices"][0].get("message", {})
            if message.get("tool_calls"):
                for tc in message["tool_calls"]:
                    print(f"Tool: {tc['function']['name']}")
                    # BUG FIX: malformed arguments from the model previously
                    # raised json.JSONDecodeError and aborted the whole suite;
                    # report the failure for this call and keep going instead.
                    try:
                        args = json.loads(tc["function"]["arguments"])
                    except json.JSONDecodeError as e:
                        print(f"Arguments failed to parse: {e}")
                        continue
                    print(f"Arguments parsed successfully")
                    print(f" - filename: {args.get('filename')}")
                    print(f" - content length: {len(args.get('content', ''))}")
            else:
                print("No tool call in response")

        print(f"{'='*60}\n")
|
|
||||||
|
|
||||||
def main():
    """
    Entry point: check API connectivity (best-effort), then run the
    streaming tool-call tests in order.
    """
    print("\n" + "="*60)
    print("vLLM GLM-5.1 Streaming Tool Call Tests")
    print("="*60)

    # Check API connectivity. Failure is only a warning — the tests below
    # still run so their own errors can be observed.
    print(f"\nChecking API at {API_BASE}...")
    try:
        with httpx.Client(timeout=10.0) as client:
            response = client.get(f"{API_BASE.replace('/v1', '')}/health")
            print(f"Health check: {response.status_code}")
    except Exception as e:
        print(f"Warning: Could not reach API - {e}")

    # Run tests
    print("\nRunning tests...\n")

    # Test 1: Non-streaming baseline
    test_non_streaming_tool_call()

    # Test 2: Streaming with nested JSON
    test_streaming_tool_call_with_json()

    # Test 3: Main test - streaming with long code.
    # (The return value was previously bound to an unused local; dropped.)
    test_streaming_tool_call_with_code()

    print("\nAll tests complete.")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point.
if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,243 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Focused test to diagnose GLM-5.1 tool response issue.
|
|
||||||
|
|
||||||
The issue: Model sees tool response as blank.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
import json
|
|
||||||
|
|
||||||
API_BASE = "http://95.179.247.150/v1"
|
|
||||||
API_KEY = "whatever"
|
|
||||||
MODEL = "HuggingFaceTB/SmolLM3-3B"
|
|
||||||
|
|
||||||
|
|
||||||
def test_simple_tool_response():
    """
    Minimal test: send a pre-built tool response and see if the model can
    use it.

    A fake assistant tool_call plus its tool result ("value 42") are placed
    in the history; the model's reply should reference 42 if it actually
    saw the tool message.
    """
    # Simulate a conversation where a tool was called
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "test_func",
            "description": "A test function",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("=" * 60)
    print("Request messages:")
    print(json.dumps(messages, indent=2))
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        # Non-streaming to get full response
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        print("\nFull response:")
        print(json.dumps(result, indent=2))

        if result.get("choices"):
            # BUG FIX: reasoning models may return content as null (the key
            # exists, so .get's default doesn't apply), which made the
            # substring checks below raise TypeError. Coerce None to "".
            content = result["choices"][0].get("message", {}).get("content", "") or ""
            print("\n" + "=" * 60)
            print("Model response content:")
            print(content)
            print("=" * 60)

            # Check if the tool result is referenced
            if "42" in content:
                print("\n✓ PASS: Model referenced the tool result (42)")
            else:
                print("\n✗ FAIL: Model did NOT reference the tool result (42)")

            # Check for signs the model didn't see the result
            if "don't have" in content.lower() or "cannot access" in content.lower():
                print("✗ Model indicates it cannot see tool result")
|
|
||||||
|
|
||||||
|
|
||||||
def test_without_tools_param():
    """
    Test what happens if we don't pass tools in the follow-up request.

    Some APIs need tools to be passed on every request; this checks whether
    the model can still read the tool result without the tools param.
    """
    messages = [
        {"role": "user", "content": "Call the test function"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "test_func", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "SUCCESS: The function returned value 42"
        }
    ]

    print("\n" + "=" * 60)
    print("Test WITHOUT tools param in follow-up")
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                # No tools param
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        if result.get("choices"):
            # BUG FIX: content may be null for reasoning models (key present
            # with None value), which crashed the slice/substring checks.
            content = result["choices"][0].get("message", {}).get("content", "") or ""
            print("Model response:", content[:200])

            if "42" in content:
                print("✓ Model referenced the tool result")
|
|
||||||
|
|
||||||
|
|
||||||
def test_different_content_formats():
    """
    Test if the issue is with how the tool message content is formatted:
    plain string content vs. OpenAI-style array-of-parts content.
    """
    # Test 1: String content (standard)
    messages_string = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "The answer is 4"
        }
    ]

    # Test 2: Content as array (OpenAI format)
    messages_array = [
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "tool_calls": [{
                "id": "call_123",
                "type": "function",
                "function": {"name": "calc", "arguments": "{}"}
            }]
        },
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": [{"type": "text", "text": "The answer is 4"}]
        }
    ]

    tools = [{
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculator",
            "parameters": {"type": "object", "properties": {}}
        }
    }]

    print("\n" + "=" * 60)
    print("Test: String content vs Array content")
    print("=" * 60)

    with httpx.Client(timeout=60.0) as client:
        for name, msgs in [("String content", messages_string), ("Array content", messages_array)]:
            print(f"\n--- {name} ---")
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers={
                    "Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL,
                    "messages": msgs,
                    "tools": tools,
                    "stream": False,
                    "max_tokens": 128,
                    "chat_template_kwargs": {"enable_thinking": False},
                    "logprobs": True,
                    "top_logprobs": 5
                }
            )

            result = response.json()
            if result.get("choices"):
                # BUG FIX: content can be null (reasoning models); coerce to
                # "" so the slice and substring checks below cannot raise.
                content = result["choices"][0].get("message", {}).get("content", "") or ""
                print(f"Response: {content[:150]}")
                if "4" in content:
                    print("✓ Referenced tool result")
                else:
                    print("✗ Did NOT reference tool result")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the three diagnosis tests in order.
if __name__ == "__main__":
    print("GLM-5.1 Tool Response Diagnosis")
    print("=" * 60)

    test_simple_tool_response()
    test_without_tools_param()
    test_different_content_formats()
|
|
||||||
@@ -1,463 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test for tool call response handling in GLM-5.1.
|
|
||||||
|
|
||||||
Tests the multi-turn flow:
|
|
||||||
1. Send a prompt that triggers a tool call
|
|
||||||
2. Send back the tool result
|
|
||||||
3. Verify the model can see and use the tool response
|
|
||||||
|
|
||||||
This reproduces the issue where tool responses appear blank to the model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import httpx
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
API_BASE = os.environ.get("VLLM_API_BASE", "http://95.179.247.150/v1")
|
|
||||||
API_KEY = os.environ.get("VLLM_API_KEY", "none")
|
|
||||||
MODEL = os.environ.get("VLLM_MODEL", "HuggingFaceTB/SmolLM3-3B")
|
|
||||||
|
|
||||||
|
|
||||||
def timestamp():
    """Return the current wall-clock time formatted as HH:MM:SS.mmm."""
    now = datetime.now()
    # strftime yields microseconds; drop the last three digits for millis.
    return now.strftime("%H:%M:%S.%f")[:-3]
|
|
||||||
|
|
||||||
|
|
||||||
def test_tool_call_response_flow(streaming: bool = True):
    """
    Test the full tool call -> response -> follow-up flow.

    This simulates:
    1. User asks for weather
    2. Model calls get_weather tool
    3. We send back the weather data
    4. Model should see and use that data

    Args:
        streaming: run both requests with stream=True when True, else
            plain POSTs.

    Returns:
        dict with "success" plus either "reason" (early exit) or
        "issues"/"final_response" (full run).
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g. 'New York, NY'"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    # Initial request that should trigger a tool call
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Tokyo right now?"
        }
    ]

    mode = "STREAMING" if streaming else "NON-STREAMING"
    print(f"\n{'='*60}")
    print(f"TEST: Tool call response flow ({mode})")
    print(f"API: {API_BASE}")
    print(f"Model: {MODEL}")
    print(f"{'='*60}\n")

    # Hoisted out of the four request sites below (previously duplicated).
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    base_payload = {
        "model": MODEL,
        "tools": tools,
        "max_tokens": 512,
        "chat_template_kwargs": {"enable_thinking": False},
        "logprobs": True,
        "top_logprobs": 5
    }

    with httpx.Client(timeout=120.0) as client:
        # Step 1: Send initial request, expect tool call
        print(f"[{timestamp()}] Step 1: Sending initial request...")

        if streaming:
            tool_calls = []
            tool_call_id = None
            tool_call_name = None
            accumulated_args = ""

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages,
                      "tool_choice": "auto", "stream": True}
            ) as response:
                print(f"[{timestamp()}] Response status: {response.status_code}")

                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if not line.startswith("data: "):
                        continue
                    try:
                        chunk = json.loads(line[6:])
                    except json.JSONDecodeError as e:
                        print(f"[{timestamp()}] JSON error: {e}")
                        continue
                    if not chunk.get("choices"):
                        continue
                    delta = chunk["choices"][0].get("delta", {})

                    # Tool-call fragments: id/name arrive once, arguments
                    # arrive as incremental text chunks.
                    for tc in delta.get("tool_calls") or []:
                        if tc.get("id"):
                            tool_call_id = tc["id"]
                        fn = tc.get("function", {})
                        if fn.get("name"):
                            tool_call_name = fn["name"]
                            print(f"[{timestamp()}] Tool call: {tool_call_name}")
                        if fn.get("arguments"):
                            accumulated_args += fn["arguments"]

                    if delta.get("content"):
                        print(f"[{timestamp()}] Content: {delta['content'][:100]}")

            if tool_call_name:
                tool_calls.append({
                    "id": tool_call_id or "call_0",
                    "type": "function",
                    "function": {
                        "name": tool_call_name,
                        "arguments": accumulated_args
                    }
                })
        else:
            # Non-streaming
            response = client.post(
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages,
                      "tool_choice": "auto", "stream": False}
            )

            result = response.json()
            print(f"[{timestamp()}] Response status: {response.status_code}")

            tool_calls = []
            if result.get("choices"):
                message = result["choices"][0].get("message", {})
                if message.get("tool_calls"):
                    tool_calls = message["tool_calls"]
                    for tc in tool_calls:
                        print(f"[{timestamp()}] Tool call: {tc['function']['name']}")
                        print(f"[{timestamp()}] Args: {tc['function']['arguments']}")

        # Check if we got a tool call
        if not tool_calls:
            print(f"\n[{timestamp()}] No tool call received - model didn't call the tool")
            return {"success": False, "reason": "no_tool_call"}

        # Step 2: Parse tool call and prepare response
        tc = tool_calls[0]
        tc_id = tc.get("id", "call_0")
        tc_name = tc["function"]["name"]
        # BUG FIX: malformed argument JSON previously raised and killed the
        # suite; fail this test gracefully instead.
        try:
            tc_args = json.loads(tc["function"]["arguments"])
        except json.JSONDecodeError as e:
            print(f"\n[{timestamp()}] Tool arguments failed to parse: {e}")
            return {"success": False, "reason": "bad_arguments"}

        print(f"\n[{timestamp()}] Step 2: Tool call received")
        print(f" Name: {tc_name}")
        print(f" Args: {tc_args}")

        # Simulate tool execution
        tool_result = {
            "location": tc_args.get("location", "Unknown"),
            "temperature": "22°C",
            "condition": "Partly cloudy",
            "humidity": "65%",
            "wind": "15 km/h NE"
        }

        # Step 3: Send the tool response back
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": json.dumps(tool_result)
        })

        print(f"\n[{timestamp()}] Step 3: Sending tool response...")
        print(f" Tool call ID: {tc_id}")
        print(f" Tool result: {json.dumps(tool_result, indent=2)}")

        # Step 4: Get the model's follow-up response
        if streaming:
            final_response = ""
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (streaming)...")

            with client.stream(
                "POST",
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages, "stream": True}
            ) as response:
                for line in response.iter_lines():
                    if not line or line == "data: [DONE]":
                        continue
                    if not line.startswith("data: "):
                        continue
                    try:
                        chunk = json.loads(line[6:])
                    except json.JSONDecodeError:
                        continue
                    if not chunk.get("choices"):
                        continue
                    delta = chunk["choices"][0].get("delta", {})
                    if delta.get("content"):
                        content = delta["content"]
                        final_response += content
                        print(f"[{timestamp()}] Content: {content}", end="", flush=True)

            print()  # newline after streaming output
        else:
            print(f"\n[{timestamp()}] Step 4: Receiving model's follow-up (non-streaming)...")

            response = client.post(
                f"{API_BASE}/chat/completions",
                headers=headers,
                json={**base_payload, "messages": messages, "stream": False}
            )

            result = response.json()
            final_response = ""
            if result.get("choices"):
                # BUG FIX: content may be null (reasoning models); coerce to
                # "" so the .lower()/substring checks below cannot raise.
                final_response = result["choices"][0].get("message", {}).get("content", "") or ""

        print(f"\n[{timestamp()}] Final response:\n{final_response}")

        # Check if the model used the tool data
        success = True
        issues = []

        # The response should mention the weather data. ("22°C" contains
        # "22", so one substring check per datum suffices — the old second
        # clause was redundant.)
        if "22" not in final_response:
            issues.append("Temperature (22°C) not mentioned in response")
            success = False

        if "cloudy" not in final_response.lower():
            issues.append("Condition (Partly cloudy) not mentioned in response")
            success = False

        # Check for signs the model didn't see the data
        blank_indicators = [
            "i don't have",
            "i cannot access",
            "i'm unable to",
            "i am unable to",
            "don't have access",
            "don't have real-time",
            "cannot provide real-time"
        ]

        for indicator in blank_indicators:
            if indicator in final_response.lower():
                issues.append(f"Model seems unaware of tool result (found: '{indicator}')")
                success = False
                break

        print(f"\n{'='*60}")
        if success:
            print("✓ PASS: Model correctly used tool response data")
        else:
            print("✗ FAIL: Model did not use tool response correctly")
            for issue in issues:
                print(f" - {issue}")
        print(f"{'='*60}\n")

        return {
            "success": success,
            "issues": issues,
            "final_response": final_response
        }
|
|
||||||
|
|
||||||
|
|
||||||
def test_tool_response_with_debug_info():
    """
    Test with detailed logging to capture exactly what the model sees.

    Triggers a get_time tool call, feeds back a canned time string, dumps
    the full request/response JSON, and checks the follow-up mentions it.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_time",
                "description": "Get the current time",
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": []
                }
            }
        }
    ]

    print(f"\n{'='*60}")
    print(f"TEST: Tool response with debug info (non-streaming)")
    print(f"{'='*60}\n")

    messages = [
        {"role": "user", "content": "What time is it?"}
    ]

    with httpx.Client(timeout=120.0) as client:
        # Get tool call
        print(f"[{timestamp()}] Sending initial request...")
        response = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result = response.json()

        if not result.get("choices") or not result["choices"][0].get("message", {}).get("tool_calls"):
            print("No tool call - skipping test")
            return

        tool_call = result["choices"][0]["message"]["tool_calls"][0]
        tc_id = tool_call["id"]

        print(f"[{timestamp()}] Tool call: {tool_call['function']['name']}")
        print(f"[{timestamp()}] Tool call ID: {tc_id}")

        # Add tool response
        messages.append({
            "role": "assistant",
            "tool_calls": [tool_call]
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc_id,
            "content": "The current time is 3:45 PM on Thursday, April 9, 2026."
        })

        # Debug: print the full messages array we're about to send
        print(f"\n[{timestamp()}] Sending follow-up with these messages:")
        print(json.dumps(messages, indent=2))

        # Get follow-up
        response2 = client.post(
            f"{API_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": MODEL,
                "messages": messages,
                "tools": tools,
                "stream": False,
                "max_tokens": 256,
                "chat_template_kwargs": {"enable_thinking": False},
                "logprobs": True,
                "top_logprobs": 5
            }
        )

        result2 = response2.json()
        print(f"\n[{timestamp()}] Full response:")
        print(json.dumps(result2, indent=2))

        if result2.get("choices"):
            # BUG FIX: content may be null (reasoning models put text in a
            # 'reasoning' field); coerce None to "" so the substring check
            # below cannot raise TypeError.
            content = result2["choices"][0].get("message", {}).get("content", "") or ""

            print(f"\n[{timestamp()}] Model response content: {content}")

            # Check if time is mentioned. (The old extra '3:45 PM' clause
            # was redundant — it contains '3:45'.)
            if "3:45" in content:
                print("\n✓ Model used the tool response (time mentioned)")
            else:
                print("\n✗ Model may not have seen the tool response (time not mentioned)")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the three GLM-5.1 tool-response tests in sequence."""
    print("\n" + "=" * 60)
    print("GLM-5.1 Tool Call Response Tests")
    print("=" * 60)

    # Non-streaming first (simpler to debug), then streaming, then the
    # verbose debug variant.
    suites = [
        ("Test 1: Non-streaming tool response flow",
         lambda: test_tool_call_response_flow(streaming=False)),
        ("Test 2: Streaming tool response flow",
         lambda: test_tool_call_response_flow(streaming=True)),
        ("Test 3: Debug info test",
         test_tool_response_with_debug_info),
    ]
    for label, run in suites:
        print(f"\n--- {label} ---")
        run()

    print("\nAll tests complete.")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point.
if __name__ == "__main__":
    main()
|
|
||||||
Reference in New Issue
Block a user