Files
nvfp4-megamoe-kernel/tests/compare_hf_reference.py

93 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""Compare single_shot_inference output against HuggingFace Transformers reference.
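This script loads the DeepSeek V4 checkpoint through HuggingFace Transformers
(AutoModelForCausalLM with trust_remote_code=True), runs a fixed prompt, and
prints the generated text plus the top-5 logits at the last prompt position.
The printed values serve as a reference to compare by hand against the output
of our single_shot_inference.py when looking for discrepancies.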
Usage (on B200):
source /root/dsv4-nvfp4-workspace/venv/bin/activate
cd /root/dsv4-nvfp4-workspace/kernel
python tests/compare_hf_reference.py
"""
import os, sys, json, math
import torch
from pathlib import Path
# Local checkpoint directory; main() passes it to every from_pretrained call
# (tokenizer, model, and the config fallback).
CHECKPOINT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
def main():
    """Run the HuggingFace reference model on a fixed prompt and print its outputs.

    Loads the tokenizer and model from CHECKPOINT_DIR, generates up to ten new
    tokens with sampling disabled, then runs one plain forward pass and prints
    the top-5 logits at the last prompt position together with the logit and
    rank of token id 128821. If the model cannot be loaded, only the config's
    model type and architectures are printed and the function returns early.
    """
    print("Loading HuggingFace reference model...")
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

    # Request BF16 weights; device_map="auto" leaves device placement to the
    # loader. Loading may fail if the model class isn't in transformers yet.
    load_options = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    try:
        model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_DIR, **load_options)
        print("Model loaded successfully!")
    except Exception as load_error:
        print(f"Failed to load model: {load_error}")
        print("Trying with trust_remote_code=True and quantization_config bypass...")
        # The NVFP4 quantization might not be supported by HF yet, so fall back
        # to reading just the config and reporting what it declares.
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained(CHECKPOINT_DIR, trust_remote_code=True)
        print(f"Config loaded: {config.model_type}")
        print(f"Architectures: {config.architectures}")
        return

    # Frame the prompt as: BOS, user marker, prompt tokens, assistant marker.
    prompt = "The capital of France is"
    user_token, assistant_token = 128803, 128804
    bos_id = tokenizer.bos_token_id or 0  # falls back to 0 when no BOS id is set
    prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
    token_ids = [bos_id, user_token, *prompt_ids, assistant_token]
    input_ids = torch.tensor([token_ids], dtype=torch.long).cuda()

    prompt_len = input_ids.shape[1]
    print(f"Input: {prompt_len} tokens")
    print(f"Decoded: {tokenizer.decode(input_ids[0][:30])}")

    with torch.no_grad():
        # Generation with sampling disabled; keep only the newly produced tokens.
        output = model.generate(
            input_ids,
            max_new_tokens=10,
            do_sample=False,
            temperature=1.0,
        )
        generated = output[0, prompt_len:]
        print(f"Generated: {tokenizer.decode(generated)}")

        # Separate forward pass over the prompt to obtain the raw logits.
        logits = model(input_ids).logits

    # Logits at the last prompt position (i.e. for the first generated token).
    last_logits = logits[0, -1]
    top5 = last_logits.topk(5)
    print("\nTop-5 at last position:")
    for score, token_id in zip(top5.values.tolist(), top5.indices.tolist()):
        print(f" {tokenizer.decode([token_id])} ({token_id}, {score:.3f})")

    # Report the thinking token; its rank is the number of vocabulary entries
    # with a strictly larger logit, so 0 means it is the top candidate.
    think_logit = last_logits[128821].item()
    num_higher = (last_logits > think_logit).sum().item()
    print(f"Thinking token (128821) logit: {think_logit:.3f}")
    print(f"Thinking token rank: {num_higher} / {last_logits.shape[0]}")
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()