nvfp4-megamoe-kernel/tests/compare_hf_reference.py

#!/usr/bin/env python3
"""Compare single_shot_inference output against HuggingFace Transformers reference.

This script loads the DeepSeek V4 model using the official HuggingFace
implementation and processes the same input, printing the greedy continuation
and the last-position logits so they can be checked against the output of our
single_shot_inference.py to identify discrepancies.

Usage (on B200):
    source /root/dsv4-nvfp4-workspace/venv/bin/activate
    cd /root/dsv4-nvfp4-workspace/kernel
    python tests/compare_hf_reference.py
"""
import os, sys, json, math
import torch
from pathlib import Path

# Checkpoint directory passed to the tokenizer, model, and config
# from_pretrained() calls in main().
CHECKPOINT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"

def _build_input_ids(tokenizer, prompt, user_token, assistant_token):
    """Return the prompt as ``[bos, user_token, *prompt_tokens, assistant_token]``.

    Args:
        tokenizer: Tokenizer exposing ``bos_token_id`` and ``encode``.
        prompt: User text, encoded without special tokens.
        user_token: Token id inserted right after BOS.
        assistant_token: Token id appended after the prompt tokens.

    Returns:
        A flat list of token ids.
    """
    bos_id = tokenizer.bos_token_id
    if bos_id is None:
        # Tokenizer defines no BOS token; fall back to id 0.
        bos_id = 0
    return [
        bos_id,
        user_token,
        *tokenizer.encode(prompt, add_special_tokens=False),
        assistant_token,
    ]


def _print_last_position_summary(tokenizer, last_logits, think_token_id):
    """Print the top-5 tokens and the thinking-token logit/rank for one position.

    Args:
        tokenizer: Tokenizer used to decode the top-5 token ids.
        last_logits: 1-D tensor of logits over the vocabulary.
        think_token_id: Vocabulary index whose logit and rank are reported.
    """
    top_values, top_indices = torch.topk(last_logits, 5)
    print("\nTop-5 at last position:")
    for value, index in zip(top_values, top_indices):
        token_id = index.item()
        print(f"  {tokenizer.decode([token_id])} ({token_id}, {value.item():.3f})")

    vocab_size = last_logits.shape[0]
    if think_token_id >= vocab_size:
        # Indexing would raise IndexError; report the mismatch instead.
        print(
            f"Thinking token ({think_token_id}) is outside the logits "
            f"vocabulary (size {vocab_size}); skipping rank check"
        )
        return

    think_logit = last_logits[think_token_id].item()
    print(f"Thinking token ({think_token_id}) logit: {think_logit:.3f}")
    # Rank is zero-based: the number of tokens with a strictly larger logit.
    rank = (last_logits > think_logit).sum().item()
    print(f"Thinking token rank: {rank} / {vocab_size}")


def main():
    """Run the HuggingFace reference model on a fixed prompt and print results.

    Loads the tokenizer and model from ``CHECKPOINT_DIR``, greedily generates
    10 new tokens for a hand-built prompt, then runs one forward pass and
    prints the top-5 logits plus the thinking-token logit and rank at the last
    prompt position.

    If the model cannot be loaded, the error is printed, the checkpoint config
    is loaded to report its model type and architectures, and the function
    returns without running inference.
    """
    print("Loading HuggingFace reference model...")
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

    # Load in BF16 and let device_map="auto" decide weight placement.
    # This may fail if the model class isn't in transformers yet.
    try:
        model = AutoModelForCausalLM.from_pretrained(
            CHECKPOINT_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
    except Exception as e:
        print(f"Failed to load model: {e}")
        # The NVFP4 quantization might not be supported by HF yet. No second
        # load attempt is made; only the config is read for diagnostics.
        print("Falling back to loading only the config for diagnostics...")
        from transformers import AutoConfig
        config = AutoConfig.from_pretrained(CHECKPOINT_DIR, trust_remote_code=True)
        print(f"Config loaded: {config.model_type}")
        print(f"Architectures: {config.architectures}")
        return
    else:
        print("Model loaded successfully!")

    # Process the same input
    prompt = "The capital of France is"
    user_token = 128803
    assistant_token = 128804
    think_token = 128821

    # Place inputs on the model's own device: with device_map="auto" the
    # current CUDA device is not guaranteed to be where the model expects them.
    input_ids = torch.tensor(
        [_build_input_ids(tokenizer, prompt, user_token, assistant_token)],
        dtype=torch.long,
        device=model.device,
    )
    # Single unpadded sequence: every position is a real token. Passing the
    # mask explicitly stops generate() from inferring it from the pad token.
    attention_mask = torch.ones_like(input_ids)

    print(f"Input: {input_ids.shape[1]} tokens")
    print(f"Decoded: {tokenizer.decode(input_ids[0][:30])}")

    # Greedy generation
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=10,
            do_sample=False,
            temperature=1.0,
        )

    generated = output[0, input_ids.shape[1]:]
    print(f"Generated: {tokenizer.decode(generated)}")

    # Separate forward pass to get the raw logits over the prompt
    with torch.no_grad():
        logits = model(input_ids).logits

    # Logits at the last prompt position (distribution of the first new token)
    _print_last_position_summary(tokenizer, logits[0, -1], think_token)


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()