commit c981416dde459028d2c5622cf57d61f7e5869ae9 Author: biondizzle Date: Fri Apr 10 15:28:41 2026 +0000 Initial chat template debugger - vLLM raw token inspector diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2bcdfd9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +models/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3304a5c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM vllm/vllm-openai:v0.19.0 + +WORKDIR /workspace + +COPY scripts/ /workspace/scripts/ +COPY prompts/ /workspace/prompts/ + +# models/ is expected to be volume-mounted or populated by stage0 +RUN mkdir -p /workspace/models + +CMD ["/bin/bash"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..acf10cf --- /dev/null +++ b/README.md @@ -0,0 +1,69 @@ +# Chat Template Debugger + +Isolate whether tool-call failures are a **model problem** or a **parser/template problem**. + +Runs vLLM inside Docker, bypasses all OpenClaw middlewares, and captures raw token output from the model directly. + +## The Problem + +90% of models break on streaming tool calls. Is it the model generating garbage, or is something in the middleware stack mangling the output? This debugger lets us answer that definitively. + +## Plan of Attack + +### 1. Build & Run the Container + +```bash +docker build -t ct-debug . +docker run --gpus all -v $(pwd)/scripts:/workspace/scripts -v $(pwd)/models:/workspace/models -it ct-debug +``` + +### 2. Stage 0 — Download Weights (if not mounted) + +```bash +# Inside the container: +python /workspace/scripts/stage0_download.py +``` + +This downloads `HuggingFaceTB/SmolLM3-3B` to `/workspace/models/SmolLM3-3B` if it doesn't already exist. + +### 3. Stage 1 — Run the Debugger + +Edit `scripts/stage1_debug.py` to point at the model path and your test prompt. 
Then: + +```bash +# Inside the container: +python /workspace/scripts/stage1_debug.py +``` + +This runs the model with a raw prompt (no chat template applied by vLLM's serving layer — you control the prompt string directly). It dumps: + +- The raw generated text +- The actual token IDs +- A per-token decode so you can see exactly what the model emitted + +### 4. Analyze + +- If the model emits correct tool-call tokens → **parser/template problem** +- If the model emits garbage or broken tokens → **model problem**, go fix the LoRA/chat template + +## Directory Layout + +``` +chat-template-debugger/ +├── Dockerfile +├── README.md +├── models/ # Downloaded weights (gitignored) +├── scripts/ +│ ├── stage0_download.py +│ └── stage1_debug.py +└── prompts/ + └── smol_tool_call.txt +``` + +## Swapping Models + +Change `MODEL_ID` in `stage0_download.py` and `MODEL_PATH` in `stage1_debug.py`. Works with any HF model. + +## Swapping Prompts + +Drop a `.txt` file in `prompts/` and update the path in `stage1_debug.py`. The prompt is passed as a raw string — no chat template is applied by vLLM. You control the full context. diff --git a/prompts/smol_tool_call.txt b/prompts/smol_tool_call.txt new file mode 100644 index 0000000..f19e753 --- /dev/null +++ b/prompts/smol_tool_call.txt @@ -0,0 +1,8 @@ +You are a helpful assistant with access to tools. + +Available tools: +- write_file: Write content to a file. Args: {"path": "string", "content": "string"} + +User: Write "hello world" to /tmp/test.txt + +Assistant: \ No newline at end of file diff --git a/scripts/stage0_download.py b/scripts/stage0_download.py new file mode 100644 index 0000000..1efc171 --- /dev/null +++ b/scripts/stage0_download.py @@ -0,0 +1,25 @@ +""" +Stage 0: Download model weights if they don't already exist. 
+"""
+
+import os
+from huggingface_hub import snapshot_download
+
+MODEL_ID = os.environ.get("MODEL_ID", "HuggingFaceTB/SmolLM3-3B")
+MODEL_DIR = os.environ.get("MODEL_DIR", "/workspace/models/SmolLM3-3B")
+
+def main():
+    if os.path.exists(os.path.join(MODEL_DIR, "config.json")):
+        print(f"[stage0] Weights already exist at {MODEL_DIR}, skipping download.")
+        return
+
+    print(f"[stage0] Downloading {MODEL_ID} → {MODEL_DIR} ...")
+    snapshot_download(
+        repo_id=MODEL_ID,
+        local_dir=MODEL_DIR,
+    )
+    print(f"[stage0] Done. Weights saved to {MODEL_DIR}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/stage1_debug.py b/scripts/stage1_debug.py
new file mode 100644
index 0000000..9055a32
--- /dev/null
+++ b/scripts/stage1_debug.py
@@ -0,0 +1,75 @@
+"""
+Stage 1: Chat Template Debugger
+
+Runs a raw prompt through vLLM's generate() API — no chat template, no serving
+layer, no middleware. Captures the exact tokens the model emits so you can
+determine whether tool-call failures are a model problem or a parser problem.
+"""
+
+import os
+import json
+from vllm import LLM, SamplingParams
+
+MODEL_PATH = os.environ.get("MODEL_PATH", "/workspace/models/SmolLM3-3B")
+PROMPT_FILE = os.environ.get("PROMPT_FILE", "/workspace/prompts/smol_tool_call.txt")
+MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "512"))
+TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.0"))
+
+# ── Load prompt ──────────────────────────────────────────────────────────────
+
+if os.path.exists(PROMPT_FILE):
+    # Encoding pinned: this tool inspects exact tokens, so the prompt bytes
+    # must not vary with the host locale's default encoding.
+    with open(PROMPT_FILE, "r", encoding="utf-8") as f:
+        prompt = f.read().strip()
+else:
+    print(f"[stage1] Prompt file not found: {PROMPT_FILE}")
+    print("[stage1] Using inline fallback prompt.")
+    prompt = """You are a helpful assistant with access to tools.
+
+Available tools:
+- write_file: Write content to a file. Args: {"path": "string", "content": "string"}
+
+User: Write "hello world" to /tmp/test.txt
+
+Assistant:"""
+
+print(f"[stage1] Prompt ({len(prompt)} chars):\n{'─' * 60}")
+print(prompt)
+print(f"{'─' * 60}\n")
+
+# ── Run model ────────────────────────────────────────────────────────────────
+
+print(f"[stage1] Loading model from {MODEL_PATH} ...")
+llm = LLM(model=MODEL_PATH, trust_remote_code=True)
+
+params = SamplingParams(
+    temperature=TEMPERATURE,
+    max_tokens=MAX_TOKENS,
+)
+
+print(f"[stage1] Generating (temp={TEMPERATURE}, max_tokens={MAX_TOKENS}) ...\n")
+outputs = llm.generate([prompt], params)
+
+# ── Dump results ─────────────────────────────────────────────────────────────
+
+# The tokenizer is loop-invariant — fetch it once here instead of re-fetching
+# it inside the per-output loop as the original did.
+tokenizer = llm.get_tokenizer()
+
+for output in outputs:
+    generated = output.outputs[0]
+    text = generated.text
+    token_ids = list(generated.token_ids)
+
+    print(f"{'═' * 60}")
+    print(f"RAW TEXT:\n{text}")
+    print(f"{'─' * 60}")
+    print(f"TOKEN IDS ({len(token_ids)} tokens):")
+    print(json.dumps(token_ids))
+    print(f"{'─' * 60}")
+    print("PER-TOKEN DECODE:")
+    for i, tid in enumerate(token_ids):
+        decoded = tokenizer.decode([tid])
+        print(f"  [{i:4d}] id={tid:>8d} → {json.dumps(decoded)}")
+    print(f"{'═' * 60}")