93 lines
3.2 KiB
Python
93 lines
3.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Compare single_shot_inference output against HuggingFace Transformers reference.
|
|
|
|
This script loads the DeepSeek V4 model using the official HuggingFace
|
|
implementation and processes the same input, comparing intermediate values
|
|
at each step to identify discrepancies in our single_shot_inference.py.
|
|
|
|
Usage (on B200):
|
|
source /root/dsv4-nvfp4-workspace/venv/bin/activate
|
|
cd /root/dsv4-nvfp4-workspace/kernel
|
|
python tests/compare_hf_reference.py
|
|
"""
|
|
import os, sys, json, math
|
|
import torch
|
|
from pathlib import Path
|
|
|
|
# Filesystem path of the checkpoint directory; main() passes it to every
# from_pretrained() call (tokenizer, model, and fallback config).
CHECKPOINT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
|
|
|
|
def main():
    """Run the HuggingFace reference model on a fixed prompt and print its outputs.

    Steps:
      1. Load the tokenizer and the causal-LM from ``CHECKPOINT_DIR``.
         If the model cannot be loaded, print the error plus the checkpoint's
         ``model_type`` / ``architectures`` from its config and return early.
      2. Build the prompt as ``[bos, USER_TOKEN, <prompt tokens>, ASSISTANT_TOKEN]``.
      3. Greedily generate 10 new tokens and print the decoded continuation.
      4. Run one forward pass over the prompt and print the top-5 logits at
         the last position, plus the logit and rank of the thinking token.

    Returns:
        None. All results are written to stdout.
    """
    print("Loading HuggingFace reference model...")
    # Imported inside main(), so transformers is only needed when the script runs.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

    # Load the weights as BF16; device_map="auto" leaves device placement to
    # the loader. This may fail if the model class isn't in transformers yet.
    try:
        model = AutoModelForCausalLM.from_pretrained(
            CHECKPOINT_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("Model loaded successfully!")
    except Exception as e:
        # Deliberate best-effort diagnostic: report the failure and dump what
        # the checkpoint config declares, then stop. No second load is tried.
        print(f"Failed to load model: {e}")
        print("Trying with trust_remote_code=True and quantization_config bypass...")
        # The NVFP4 quantization might not be supported by HF yet, so only the
        # config is inspected here.
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained(CHECKPOINT_DIR, trust_remote_code=True)
        print(f"Config loaded: {config.model_type}")
        print(f"Architectures: {config.architectures}")
        return

    # Process the same input
    prompt = "The capital of France is"
    # Hard-coded special-token ids; presumably the chat user/assistant/thinking
    # markers of this checkpoint's tokenizer -- TODO confirm against the vocab.
    USER_TOKEN = 128803
    ASSISTANT_TOKEN = 128804
    THINK_TOKEN = 128821

    # Build input: falls back to id 0 when the tokenizer defines no BOS token.
    bos_id = tokenizer.bos_token_id or 0
    input_ids_list = [bos_id, USER_TOKEN]
    input_ids_list += tokenizer.encode(prompt, add_special_tokens=False)
    input_ids_list.append(ASSISTANT_TOKEN)
    # Place the prompt on the model's own device rather than a hard-coded
    # .cuda(): with device_map="auto" the loader chooses the placement.
    input_ids = torch.tensor(
        [input_ids_list], dtype=torch.long, device=model.device
    )

    prompt_len = input_ids.shape[1]
    print(f"Input: {prompt_len} tokens")
    print(f"Decoded: {tokenizer.decode(input_ids[0][:30])}")

    # Greedy generation (do_sample=False) of 10 new tokens.
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=10,
            do_sample=False,
            temperature=1.0,
        )

    # Strip the prompt so only the newly generated tokens are decoded.
    generated = output[0, prompt_len:]
    print(f"Generated: {tokenizer.decode(generated)}")

    # Plain forward pass over the prompt to inspect the raw logits.
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits

    # Top-5 at the last position (the logits that predict the next token).
    last_logits = logits[0, -1]
    top5v, top5i = torch.topk(last_logits, 5)
    print("\nTop-5 at last position:")
    for value, token_id in zip(top5v.tolist(), top5i.tolist()):
        print(f" {tokenizer.decode([token_id])} ({token_id}, {value:.3f})")

    # Check thinking token. The rank is the number of vocabulary entries with
    # a strictly larger logit, so 0 means it is the top prediction.
    think_logit = last_logits[THINK_TOKEN].item()
    print(f"Thinking token ({THINK_TOKEN}) logit: {think_logit:.3f}")
    print(f"Thinking token rank: {(last_logits > think_logit).sum().item()} / {last_logits.shape[0]}")
|
|
|
|
|
|
# Script entry point: run the comparison only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|