diff --git a/single_shot_inference.py b/single_shot_inference.py
index 5a68ebac..5d0b87c7 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -783,29 +783,27 @@ def main():
 
     # ==== Phase 3: Inference ====
     print(f"\n{'='*70}\nPhase 3: Inference\n{'='*70}")
-    # Apply chat template with system prompt
-    messages = [
-        {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": PROMPT},
-    ]
-    if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template is not None:
-        input_ids = tokenizer.apply_chat_template(
-            messages, return_tensors="pt", add_generation_prompt=True
-        ).cuda()
-        # Find where the user prompt starts for display
-        user_only_ids = tokenizer.encode(PROMPT, return_tensors="pt")
-        print(f"Chat template applied. Input: {input_ids.shape[1]} tokens")
-    else:
-        # Fallback: prepend system prompt manually
-        sys_ids = tokenizer.encode(SYSTEM_PROMPT, return_tensors="pt")[0]
-        user_ids = tokenizer.encode(PROMPT, return_tensors="pt")[0]
-        # Add BOS + system + newline + user
-        all_ids = [tokenizer.bos_token_id] if tokenizer.bos_token_id else []
-        all_ids += sys_ids.tolist() + user_ids.tolist()
-        input_ids = torch.tensor([all_ids], dtype=torch.long).cuda()
-        print(f"Manual prompt assembly. Input: {input_ids.shape[1]} tokens")
-    print(f"Prompt: '{PROMPT}' → {input_ids.tolist()[:20]}...")
-    print(f"Decoded: '{tokenizer.decode(input_ids[0][:50])}'")
+    # DeepSeek V4 chat format: <｜begin▁of▁sentence｜><｜User｜>prompt<｜Assistant｜>
+    # For reasoning models: <｜User｜>prompt<｜Assistant｜>ﬁthinking...ﬂanswer
+    # Special token IDs: <｜User｜>=128803, <｜Assistant｜>=128804, <|EOT|>=128805
+    # Thinking tokens: ﬁ=128821, ﬂ=128822
+    USER_TOKEN = 128803
+    ASSISTANT_TOKEN = 128804
+    EOT_TOKEN = 128805
+    THINK_START = 128821  # ﬁ
+    THINK_END = 128822    # ﬂ
+
+    # Build input with proper DeepSeek chat format
+    bos_id = tokenizer.bos_token_id or 0
+    # Layout: <BOS> <｜User｜> SYSTEM_PROMPT "\n\n" PROMPT <｜Assistant｜> (system text is folded into the user turn; no separate system-role token)
+    input_ids_list = [bos_id, USER_TOKEN]
+    input_ids_list += tokenizer.encode(SYSTEM_PROMPT, add_special_tokens=False)
+    input_ids_list += tokenizer.encode('\n\n' + PROMPT, add_special_tokens=False)
+    input_ids_list.append(ASSISTANT_TOKEN)
+    input_ids = torch.tensor([input_ids_list], dtype=torch.long).cuda()
+    print(f"DeepSeek chat format. Input: {input_ids.shape[1]} tokens", flush=True)
+    print(f"Decoded start: '{tokenizer.decode(input_ids[0][:20])}...'", flush=True)
+    print(f"Decoded end: '...{tokenizer.decode(input_ids[0][-5:])}'", flush=True)
 
     generated = input_ids[0].tolist()