diff --git a/single_shot_inference.py b/single_shot_inference.py index 5a68ebac..5d0b87c7 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -783,29 +783,27 @@ def main(): # ==== Phase 3: Inference ==== print(f"\n{'='*70}\nPhase 3: Inference\n{'='*70}") - # Apply chat template with system prompt - messages = [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": PROMPT}, - ] - if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template is not None: - input_ids = tokenizer.apply_chat_template( - messages, return_tensors="pt", add_generation_prompt=True - ).cuda() - # Find where the user prompt starts for display - user_only_ids = tokenizer.encode(PROMPT, return_tensors="pt") - print(f"Chat template applied. Input: {input_ids.shape[1]} tokens") - else: - # Fallback: prepend system prompt manually - sys_ids = tokenizer.encode(SYSTEM_PROMPT, return_tensors="pt")[0] - user_ids = tokenizer.encode(PROMPT, return_tensors="pt")[0] - # Add BOS + system + newline + user - all_ids = [tokenizer.bos_token_id] if tokenizer.bos_token_id else [] - all_ids += sys_ids.tolist() + user_ids.tolist() - input_ids = torch.tensor([all_ids], dtype=torch.long).cuda() - print(f"Manual prompt assembly. Input: {input_ids.shape[1]} tokens") - print(f"Prompt: '{PROMPT}' → {input_ids.tolist()[:20]}...") - print(f"Decoded: '{tokenizer.decode(input_ids[0][:50])}'") + # DeepSeek V4 chat format: <|begin▁of▁sentence|><|User|>prompt<|Assistant|> + # For reasoning models: <|User|>prompt<|Assistant|>fithinking...flanswer + # Special token IDs: <|User|>=128803, <|Assistant|>=128804, <|EOT|>=128805 + # Thinking tokens: fi=128821, fl=128822 + USER_TOKEN = 128803 + ASSISTANT_TOKEN = 128804 + EOT_TOKEN = 128805 + THINK_START = 128821 # fi + THINK_END = 128822 # fl + + # Build input with proper DeepSeek chat format + bos_id = tokenizer.bos_token_id or 0 + # <|User|> System prompt \n\n User prompt <|Assistant|> + input_ids_list = [bos_id, USER_TOKEN] + input_ids_list += tokenizer.encode(SYSTEM_PROMPT, add_special_tokens=False) + input_ids_list += tokenizer.encode('\n\n' + PROMPT, add_special_tokens=False) + input_ids_list.append(ASSISTANT_TOKEN) + input_ids = torch.tensor([input_ids_list], dtype=torch.long).cuda() + print(f"DeepSeek chat format. Input: {input_ids.shape[1]} tokens", flush=True) + print(f"Decoded start: '{tokenizer.decode(input_ids[0][:20])}...'", flush=True) + print(f"Decoded end: '...{tokenizer.decode(input_ids[0][-5:])}'", flush=True) generated = input_ids[0].tolist()