#!/usr/bin/env python3
"""Print HuggingFace Transformers reference outputs for single_shot_inference.

This script loads the DeepSeek V4 checkpoint through the HuggingFace
``AutoModelForCausalLM`` API, feeds it one fixed prompt and prints:

* the greedy continuation (10 new tokens),
* the top-5 logits at the last prompt position,
* the logit and rank of the "thinking" token (id 128821).

These printed values are the reference to hold our single_shot_inference.py
output against when hunting discrepancies. The script does not itself run or
diff single_shot_inference.py.

Usage (on B200):
    source /root/dsv4-nvfp4-workspace/venv/bin/activate
    cd /root/dsv4-nvfp4-workspace/kernel
    python tests/compare_hf_reference.py
"""

import json
import math
import os
import sys
from pathlib import Path

import torch

CHECKPOINT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"

# Fixed prompt; the script's purpose is to process "the same input" as
# single_shot_inference.py.
PROMPT = "The capital of France is"

# Hard-coded special-token ids. Presumably the chat-template role markers and
# the "thinking" marker of this checkpoint's tokenizer -- TODO confirm against
# the tokenizer config.
USER_TOKEN = 128803
ASSISTANT_TOKEN = 128804
THINK_TOKEN = 128821


def _build_input_ids(tokenizer, prompt: str) -> list:
    """Return the token ids ``[BOS, USER, *prompt, ASSISTANT]`` as a list."""
    bos_id = tokenizer.bos_token_id
    if bos_id is None:
        # Tokenizer defines no BOS token; fall back to id 0.
        bos_id = 0
    ids = [bos_id, USER_TOKEN]
    ids += tokenizer.encode(prompt, add_special_tokens=False)
    ids.append(ASSISTANT_TOKEN)
    return ids


def _report_last_position(tokenizer, last_logits, top_k: int = 5,
                          think_token: int = THINK_TOKEN) -> None:
    """Print the top-k logits and the rank of ``think_token``.

    ``last_logits`` is the 1-D logit vector of the final prompt position
    (the caller passes ``logits[0, -1]``).
    """
    vocab_size = last_logits.shape[0]

    # Clamp so torch.topk cannot be asked for more entries than exist.
    k = min(top_k, vocab_size)
    values, indices = torch.topk(last_logits, k)
    print(f"\nTop-{k} at last position:")
    for value, token_id in zip(values.tolist(), indices.tolist()):
        print(f"  {tokenizer.decode([token_id])} ({token_id}, {value:.3f})")

    # Guard the hard-coded id: indexing past the vocabulary would otherwise
    # abort the report with a bare IndexError.
    if think_token >= vocab_size:
        print(f"Thinking token ({think_token}) is outside the vocabulary "
              f"(size {vocab_size}); skipping.")
        return

    think_logit = last_logits[think_token].item()
    print(f"Thinking token ({think_token}) logit: {think_logit:.3f}")
    # Rank here is the number of tokens with a strictly larger logit (0 = best).
    n_better = (last_logits > think_logit).sum().item()
    print(f"Thinking token rank: {n_better} / {vocab_size}")


def main() -> int:
    """Load the reference model, run the fixed prompt, and print the results.

    Returns:
        0 on success, 1 if the model could not be loaded (in which case only
        the checkpoint's config is printed for diagnosis).
    """
    print("Loading HuggingFace reference model...")
    # Imported lazily so the module can be imported without transformers.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

    # Load in BF16 and let device_map="auto" decide weight placement.
    # Loading may fail if the model class or the NVFP4 quantization is not
    # supported by the installed transformers version.
    try:
        model = AutoModelForCausalLM.from_pretrained(
            CHECKPOINT_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
    except Exception as e:  # deliberately broad: report any load failure
        print(f"Failed to load model: {e}")
        # No retry is attempted; dump the config so the failure can be
        # diagnosed (model type / architecture names).
        print("Loading only the model config for diagnostics...")
        from transformers import AutoConfig
        config = AutoConfig.from_pretrained(CHECKPOINT_DIR, trust_remote_code=True)
        print(f"Config loaded: {config.model_type}")
        print(f"Architectures: {config.architectures}")
        return 1
    print("Model loaded successfully!")

    # Create the inputs on the device the model reports instead of forcing
    # the default CUDA device; with device_map="auto" this is presumably
    # where the first weights live -- TODO confirm for offloaded setups.
    input_ids = torch.tensor(
        [_build_input_ids(tokenizer, PROMPT)],
        dtype=torch.long,
        device=model.device,
    )
    prompt_len = input_ids.shape[1]
    print(f"Input: {prompt_len} tokens")
    print(f"Decoded: {tokenizer.decode(input_ids[0][:30])}")

    # Greedy generation of a short continuation.
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=10,
            do_sample=False,
            temperature=1.0,
        )
    generated = output[0, prompt_len:]  # drop the echoed prompt tokens
    print(f"Generated: {tokenizer.decode(generated)}")

    # Plain forward pass to inspect the logits at the last prompt position.
    with torch.no_grad():
        logits = model(input_ids).logits
    _report_last_position(tokenizer, logits[0, -1])
    return 0


if __name__ == "__main__":
    sys.exit(main())