# File-viewer header of the original listing: Python, 29 lines, 1.2 KiB
"""Sanity-check a local causal-LM checkpoint with Hugging Face Transformers.

Loads the tokenizer and model found at ``model_name``, runs a single forward
pass on a short chat prompt, prints the ten highest-scoring next-token
candidates, and finally greedily generates ten new tokens.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local checkpoint directory; tokenizer and model are both read from here.
model_name = '/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4'

print('Loading tokenizer...', flush=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

print('Loading model...', flush=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
model.eval()
print('Model loaded!', flush=True)

msg = [{'role': 'user', 'content': 'The capital of France is'}]
# Ask explicitly for a plain tensor of token ids: the code below indexes it
# and reads ``.shape`` directly.
ids = tokenizer.apply_chat_template(
    msg,
    add_generation_prompt=True,
    return_tensors='pt',
    return_dict=False,
)
# The model is loaded with device_map='auto', so move the prompt to whatever
# device the model reports instead of hard-coding the default CUDA device.
ids = ids.to(model.device)
print(f'Input: {ids.shape} tokens: {repr(tokenizer.decode(ids[0]))}', flush=True)

with torch.no_grad():
    # Scores for the last position of the first (and only) sequence.
    logits = model(ids).logits[0, -1]
    top10 = torch.topk(logits, 10)
    print('HF Top-10:', flush=True)
    # Convert once to Python scalars rather than calling .item() per element.
    ranked = zip(top10.indices.tolist(), top10.values.tolist())
    for rank, (tid, val) in enumerate(ranked, start=1):
        print(f' {rank}. {repr(tokenizer.decode([tid]))} (id={tid}, logit={val:.3f})', flush=True)

    # Generate 10 tokens
    out = model.generate(ids, max_new_tokens=10, do_sample=False)
    print(f'Generated: {repr(tokenizer.decode(out[0]))}', flush=True)