Initial chat template debugger - vLLM raw token inspector

2026-04-10 15:28:41 +00:00
commit c981416dde
6 changed files with 184 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+models/
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,11 @@
+FROM vllm/vllm-openai:v0.19.0
+
+WORKDIR /workspace
+
+COPY scripts/ /workspace/scripts/
+COPY prompts/ /workspace/prompts/
+
+# models/ is expected to be volume-mounted or populated by stage0
+RUN mkdir -p /workspace/models
+# Base image ENTRYPOINT starts the vLLM server; replace it to get a shell.
+ENTRYPOINT ["/bin/bash"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,69 @@
+# Chat Template Debugger
+
+Isolate whether tool-call failures are a **model problem** or a **parser/template problem**.
+
+Runs vLLM inside Docker, bypasses all OpenClaw middlewares, and captures raw token output from the model directly.
+
+## The Problem
+
+90% of models break on streaming tool calls. Is it the model generating garbage, or is something in the middleware stack mangling the output? This debugger lets us answer that definitively.
+
+## Plan of Attack
+
+### 1. Build & Run the Container
+
+```bash
+docker build -t ct-debug .
+docker run --gpus all --entrypoint /bin/bash -v "$(pwd)/scripts:/workspace/scripts" -v "$(pwd)/models:/workspace/models" -it ct-debug
+```
+
+### 2. Stage 0 — Download Weights (if not mounted)
+
+```bash
+# Inside the container:
+python /workspace/scripts/stage0_download.py
+```
+
+This downloads `HuggingFaceTB/SmolLM3-3B` to `/workspace/models/SmolLM3-3B` if it doesn't already exist.
+
+### 3. Stage 1 — Run the Debugger
+
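+Set the `MODEL_PATH` and `PROMPT_FILE` environment variables (or edit their defaults in `scripts/stage1_debug.py`) to point at the model and your test prompt; `MAX_TOKENS` and `TEMPERATURE` can be overridden the same way. Then: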
+
+```bash
+# Inside the container:
+python /workspace/scripts/stage1_debug.py
+```
+
+This runs the model with a raw prompt (no chat template applied by vLLM's serving layer — you control the prompt string directly). It dumps:
+
+- The raw generated text
+- The actual token IDs
+- A per-token decode so you can see exactly what the model emitted
+
+### 4. Analyze
+
+- If the model emits correct tool-call tokens → **parser/template problem**
+- If the model emits garbage or broken tokens → **model problem**, go fix the LoRA/chat template
+
+## Directory Layout
+
+```
+chat-template-debugger/
+├── Dockerfile
+├── README.md
+├── models/              # Downloaded weights (gitignored)
+├── scripts/
+│   ├── stage0_download.py
+│   └── stage1_debug.py
+└── prompts/
+    └── smol_tool_call.txt
+```
+
+## Swapping Models
+
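+Set `MODEL_ID` and `MODEL_DIR` for `stage0_download.py` and `MODEL_PATH` for `stage1_debug.py`, either as environment variables or by editing the defaults in the scripts. `MODEL_DIR` and `MODEL_PATH` must point at the same directory; if you change only `MODEL_ID`, stage 0 still targets the old directory and skips the download when a `config.json` is already there. Works with any HF model that vLLM can load.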
+
+## Swapping Prompts
+
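+Drop a `.txt` file in `prompts/` and point `PROMPT_FILE` at it (environment variable, or edit the default in `stage1_debug.py`). `prompts/` is copied into the image at build time, so rebuild the image or add `-v "$(pwd)/prompts:/workspace/prompts"` to `docker run` to pick up new files. The prompt is passed as a raw string, with leading and trailing whitespace stripped — no chat template is applied by vLLM. You control the full context.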
--- a/prompts/smol_tool_call.txt
+++ b/prompts/smol_tool_call.txt
@@ -0,0 +1,8 @@
+You are a helpful assistant with access to tools.
+
+Available tools:
+- write_file: Write content to a file. Args: {"path": "string", "content": "string"}
+
+User: Write "hello world" to /tmp/test.txt
+
+Assistant:
--- a/scripts/stage0_download.py
+++ b/scripts/stage0_download.py
@@ -0,0 +1,25 @@
+"""
+Stage 0: Download model weights if they don't already exist.
+"""
+
+import os
+from huggingface_hub import snapshot_download
+
+MODEL_ID = os.environ.get("MODEL_ID", "HuggingFaceTB/SmolLM3-3B")
+MODEL_DIR = os.environ.get("MODEL_DIR", "/workspace/models/SmolLM3-3B")
+
+def main():
+    if os.path.exists(os.path.join(MODEL_DIR, "config.json")):
+        print(f"[stage0] Weights already exist at {MODEL_DIR}, skipping download.")
+        return
+
+    print(f"[stage0] Downloading {MODEL_ID} → {MODEL_DIR} ...")
+    snapshot_download(
+        repo_id=MODEL_ID,
+        local_dir=MODEL_DIR,
+    )
+    print(f"[stage0] Done. Weights saved to {MODEL_DIR}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/stage1_debug.py
+++ b/scripts/stage1_debug.py
@@ -0,0 +1,70 @@
+"""
+Stage 1: Chat Template Debugger
+
+Runs a raw prompt through vLLM's generate() API — no chat template, no serving
+layer, no middleware. Captures the exact tokens the model emits so you can
+determine whether tool-call failures are a model problem or a parser problem.
+"""
+
+import os
+import json
+from vllm import LLM, SamplingParams
+
+MODEL_PATH = os.environ.get("MODEL_PATH", "/workspace/models/SmolLM3-3B")
+PROMPT_FILE = os.environ.get("PROMPT_FILE", "/workspace/prompts/smol_tool_call.txt")
+MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "512"))
+TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.0"))
+
+# ── Load prompt ──────────────────────────────────────────────────────────────
+
+if os.path.exists(PROMPT_FILE):
+    with open(PROMPT_FILE, "r") as f:
+        prompt = f.read().strip()
+else:
+    print(f"[stage1] Prompt file not found: {PROMPT_FILE}")
+    print("[stage1] Using inline fallback prompt.")
+    prompt = """You are a helpful assistant with access to tools.
+
+Available tools:
+- write_file: Write content to a file. Args: {"path": "string", "content": "string"}
+
+User: Write "hello world" to /tmp/test.txt
+
+Assistant:"""
+
+print(f"[stage1] Prompt ({len(prompt)} chars):\n{'─' * 60}")
+print(prompt)
+print(f"{'─' * 60}\n")
+
+# ── Run model ────────────────────────────────────────────────────────────────
+
+print(f"[stage1] Loading model from {MODEL_PATH} ...")
+llm = LLM(model=MODEL_PATH, trust_remote_code=True)
+
+params = SamplingParams(
+    temperature=TEMPERATURE,
+    max_tokens=MAX_TOKENS,
+)
+
+print(f"[stage1] Generating (temp={TEMPERATURE}, max_tokens={MAX_TOKENS}) ...\n")
+outputs = llm.generate([prompt], params)
+
+# ── Dump results ─────────────────────────────────────────────────────────────
+
+for output in outputs:
+    generated = output.outputs[0]
+    text = generated.text
+    token_ids = list(generated.token_ids)
+
+    print(f"{'═' * 60}")
+    print(f"RAW TEXT:\n{text}")
+    print(f"{'─' * 60}")
+    print(f"TOKEN IDS ({len(token_ids)} tokens):")
+    print(json.dumps(token_ids))
+    print(f"{'─' * 60}")
+    print("PER-TOKEN DECODE:")
+    tokenizer = llm.get_tokenizer()
+    for i, tid in enumerate(token_ids):
+        decoded = tokenizer.decode([tid])
+        print(f"  [{i:4d}] id={tid:>8d}  →  {json.dumps(decoded)}")
+    print(f"{'═' * 60}")