Fix prompt format: use DeepSeek V4 chat tokens

The model was trained with DeepSeek-specific chat tokens: <｜User｜> (128803), <｜Assistant｜> (128804), and <|EOT|> (128805). Thinking tokens: ﬁ (128821) and ﬂ (128822). The previous manual assembly just concatenated raw text without these tokens, so the model could not recognize user/assistant boundaries. New format: <BOS><｜User｜>system prompt\n\nuser prompt<｜Assistant｜>
2026-05-31 10:33:41 +00:00
parent f86742ef8e
commit d891ae7e96
1 changed files with 21 additions and 23 deletions
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -783,29 +783,27 @@ def main():

    # ==== Phase 3: Inference ====
    print(f"\n{'='*70}\nPhase 3: Inference\n{'='*70}")
-    # Apply chat template with system prompt
-    messages = [
-        {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": PROMPT},
-    ]
-    if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template is not None:
-        input_ids = tokenizer.apply_chat_template(
-            messages, return_tensors="pt", add_generation_prompt=True
-        ).cuda()
-        # Find where the user prompt starts for display
-        user_only_ids = tokenizer.encode(PROMPT, return_tensors="pt")
-        print(f"Chat template applied. Input: {input_ids.shape[1]} tokens")
-    else:
-        # Fallback: prepend system prompt manually
-        sys_ids = tokenizer.encode(SYSTEM_PROMPT, return_tensors="pt")[0]
-        user_ids = tokenizer.encode(PROMPT, return_tensors="pt")[0]
-        # Add BOS + system + newline + user
-        all_ids = [tokenizer.bos_token_id] if tokenizer.bos_token_id else []
-        all_ids += sys_ids.tolist() + user_ids.tolist()
-        input_ids = torch.tensor([all_ids], dtype=torch.long).cuda()
-        print(f"Manual prompt assembly. Input: {input_ids.shape[1]} tokens")
-    print(f"Prompt: '{PROMPT}' → {input_ids.tolist()[:20]}...")
-    print(f"Decoded: '{tokenizer.decode(input_ids[0][:50])}'")
+    # DeepSeek V4 chat format: <｜begin▁of▁sentence｜><｜User｜>prompt<｜Assistant｜>
+    # For reasoning models: <｜User｜>prompt<｜Assistant｜>ﬁthinking...ﬂanswer
+    # Special token IDs: <｜User｜>=128803, <｜Assistant｜>=128804, <|EOT|>=128805
+    # Thinking tokens: ﬁ=128821, ﬂ=128822
+    USER_TOKEN = 128803
+    ASSISTANT_TOKEN = 128804
+    EOT_TOKEN = 128805
+    THINK_START = 128821  # ﬁ
+    THINK_END = 128822    # ﬂ
+
+    # Build input with proper DeepSeek chat format
+    bos_id = tokenizer.bos_token_id or 0
+    # <BOS> <｜User｜> System prompt \n\n User prompt <｜Assistant｜>
+    input_ids_list = [bos_id, USER_TOKEN]
+    input_ids_list += tokenizer.encode(SYSTEM_PROMPT, add_special_tokens=False)
+    input_ids_list += tokenizer.encode('\n\n' + PROMPT, add_special_tokens=False)
+    input_ids_list.append(ASSISTANT_TOKEN)
+    input_ids = torch.tensor([input_ids_list], dtype=torch.long).cuda()
+    print(f"DeepSeek chat format. Input: {input_ids.shape[1]} tokens", flush=True)
+    print(f"Decoded start: '{tokenizer.decode(input_ids[0][:20])}...'", flush=True)
+    print(f"Decoded end: '...{tokenizer.decode(input_ids[0][-5:])}'", flush=True)

    generated = input_ids[0].tolist()