diff --git a/train_lora.py b/train_lora.py
index 326f280..190f495 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -154,6 +154,7 @@ def main():
         target_modules=[
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj",
+            "embed_tokens",  # Critical: lets LoRA adjust tool-call token embeddings
         ],
         bias="none",
     )
@@ -166,6 +167,21 @@ def main():
     val_data = load_jsonl(data_dir / "val.jsonl")
     print(f"Train samples: {len(train_data)}, Val samples: {len(val_data)}")
 
+    # ── Verify tool-call tokens are in the training data ─────────────────
+    print("Verifying tool-call token IDs in training data ...")
+    verification_sample = train_data[0]
+    v_text = tokenizer.apply_chat_template(verification_sample["messages"], tokenize=False)
+    v_ids = tokenizer.encode(v_text)
+    tc_start_found = 128015 in v_ids
+    tc_end_found = 128016 in v_ids
+    if tc_start_found and tc_end_found:
+        print(f"  ✓ Tool-call tokens verified in sample data (128015={tc_start_found}, 128016={tc_end_found})")
+    else:
+        print(f"  ✗ WARNING: Tool-call tokens missing! (128015={tc_start_found}, 128016={tc_end_found})")
+        print(f"  ✗ Training may NOT teach the model to emit tool-call tokens.")
+        print(f"  ✗ Check prepare_data.py and the tokenizer chat template.")
+        # Don't abort — let the user decide, but warn loudly
+
     print("Tokenizing training data ...")
     train_dataset = Dataset.from_list(train_data).map(
         lambda x: tokenize_for_training(x, tokenizer, args.max_length),