"""Sanity-check a local causal-LM checkpoint with Hugging Face Transformers.

Loads the tokenizer and model from ``model_name``, runs one forward pass on a
short chat-formatted prompt, prints the ten highest next-token logits, and then
greedily generates ten new tokens.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = '/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4'

print('Loading tokenizer...', flush=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

print('Loading model...', flush=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
model.eval()
print('Model loaded!', flush=True)

msg = [{'role': 'user', 'content': 'The capital of France is'}]

# Ask for a dict explicitly so the return type does not depend on the
# library's default, then move it to wherever the model was placed instead of
# assuming a CUDA device is present.
encoded = tokenizer.apply_chat_template(
    msg,
    add_generation_prompt=True,
    return_tensors='pt',
    return_dict=True,
).to(model.device)
ids = encoded['input_ids']
# May be None if the tokenizer does not emit a mask; both calls accept None.
attention_mask = encoded.get('attention_mask')

print(f'Input: {ids.shape} tokens: {repr(tokenizer.decode(ids[0]))}', flush=True)

# Forward pass only: keep the logits of the last position of the single
# sequence in the batch, i.e. the scores for the next token.
with torch.no_grad():
    logits = model(input_ids=ids, attention_mask=attention_mask).logits[0, -1]

top10 = torch.topk(logits, 10)
print('HF Top-10:', flush=True)
for i, (tid, val) in enumerate(zip(top10.indices, top10.values)):
    token_id = tid.item()
    print(
        f' {i+1}. {repr(tokenizer.decode([token_id]))} '
        f'(id={token_id}, logit={val.item():.3f})',
        flush=True,
    )

# Generate 10 tokens (greedy decoding: do_sample=False).
out = model.generate(
    ids,
    attention_mask=attention_mask,
    max_new_tokens=10,
    do_sample=False,
)
print(f'Generated: {repr(tokenizer.decode(out[0]))}', flush=True)