Files
nvfp4-megamoe-kernel/hf_reference_test.py

29 lines
1.2 KiB
Python

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Reference run of the Hugging Face implementation: load the checkpoint,
# print the top-10 next-token logits for a fixed chat prompt, then greedily
# generate 10 tokens. The printed values are meant to be compared by eye
# against another implementation's output.
model_name = '/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4'

print('Loading tokenizer...', flush=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

print('Loading model...', flush=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
model.eval()
print('Model loaded!', flush=True)

msg = [{'role': 'user', 'content': 'The capital of France is'}]
# return_dict=True pins the return type to a BatchEncoding regardless of the
# library's default, and also yields the attention mask used by generate().
# The inputs follow the model's own device instead of a hard-coded cuda:0,
# because device_map='auto' decides the placement.
inputs = tokenizer.apply_chat_template(
    msg,
    add_generation_prompt=True,
    return_tensors='pt',
    return_dict=True,
).to(model.device)
ids = inputs['input_ids']
print(f'Input: {ids.shape} tokens: {repr(tokenizer.decode(ids[0]))}', flush=True)

# Single forward pass; keep only the logits at the last prompt position.
with torch.no_grad():
    logits = model(ids).logits[0, -1]

top10 = torch.topk(logits, 10)
print('HF Top-10:', flush=True)
# tolist() converts ids/values to Python scalars once instead of per use.
top_ids = top10.indices.tolist()
top_vals = top10.values.tolist()
for rank, (tid, val) in enumerate(zip(top_ids, top_vals), start=1):
    print(f' {rank}. {repr(tokenizer.decode([tid]))} (id={tid}, logit={val:.3f})', flush=True)

# Generate 10 tokens (greedy); the decoded text includes the prompt.
out = model.generate(**inputs, max_new_tokens=10, do_sample=False)
print(f'Generated: {repr(tokenizer.decode(out[0]))}', flush=True)