#!/usr/bin/env python3
"""NVIDIA Model Optimizer PTQ for DeepSeek V4 Pro → NVFP4.

Uses nvidia-modelopt's official PTQ pipeline with NVFP4Experts-Only config,
which quantizes only MoE expert layers while keeping attention QKV in higher
precision — the recommended approach for DeepSeek MoE models.

Output is a Unified HuggingFace checkpoint deployable on TRT-LLM / vLLM / SGLang.

Usage:
    python quantize_modelopt.py \\
        --model /root/nvidia-meeting/DeepSeek-V4-Pro \\
        --export_dir /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4-modelopt \\
        --qformat nvfp4_experts_only \\
        --tp 8 \\
        --calib_size 256

For the FP8 source variant, just change --model path. modelopt handles
dequantization internally.
"""

import argparse
import os
import random
import time

import numpy as np
import torch

import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.utils.dataset_utils import create_forward_loop
from transformers import AutoModelForCausalLM, AutoTokenizer

# Module-level side effect kept from the original script: turn on modelopt's
# HuggingFace checkpointing integration before any model is loaded.
mto.enable_huggingface_checkpointing()

# Maps the --qformat CLI choice to the modelopt quantization config it selects.
QUANT_CONFIGS = {
    "nvfp4": mtq.NVFP4_DEFAULT_CFG,
    "nvfp4_experts_only": mtq.NVFP4_EXPERTS_ONLY_CFG,
    "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG,
    "nvfp4_omlp_only": mtq.NVFP4_OMLP_ONLY_CFG,
    "fp8": mtq.FP8_DEFAULT_CFG,
}

# Fixed seed applied to `random`, NumPy and torch before calibration.
_SEED = 1234


def _build_arg_parser():
    """Return the argument parser describing the script's command line."""
    ap = argparse.ArgumentParser(description="Model Optimizer PTQ for DeepSeek V4 Pro")
    ap.add_argument("--model", required=True, help="Path to HF model (BF16 or FP8)")
    ap.add_argument("--export_dir", required=True,
                    help="Output directory for quantized checkpoint")
    ap.add_argument("--qformat", default="nvfp4_experts_only",
                    choices=list(QUANT_CONFIGS.keys()),
                    help="Quantization format (default: nvfp4_experts_only for MoE)")
    # NOTE(review): --kv_cache_qformat, --tp and --low_memory_mode are parsed
    # (and partly echoed in the banner) but nothing below applies them to the
    # quantization or export step. Wire them up or drop them.
    ap.add_argument("--kv_cache_qformat", default="fp8_cast",
                    help="KV cache quantization (default: fp8_cast, fast no-calib)")
    ap.add_argument("--tp", type=int, default=8, help="Tensor parallelism for export")
    # NOTE(review): several values are accepted, but only the first one is
    # used because a single (default) calibration dataset is requested.
    ap.add_argument("--calib_size", type=int, nargs="+", default=[256],
                    help="Calibration dataset size (per dataset)")
    ap.add_argument("--batch_size", type=int, default=1, help="Calibration batch size")
    ap.add_argument("--calib_seq", type=int, default=4096,
                    help="Max calibration sequence length")
    # The original used action="store_true" with default=True, which made the
    # option impossible to disable. BooleanOptionalAction keeps
    # `--trust_remote_code` working and adds `--no-trust_remote_code`.
    ap.add_argument("--trust_remote_code", action=argparse.BooleanOptionalAction,
                    default=True, help="Trust remote code (required for V4)")
    ap.add_argument("--use_seq_device_map", action="store_true",
                    help="Use sequential device map for low-memory calibration")
    ap.add_argument("--low_memory_mode", action="store_true",
                    help="Compress weights before calibration (FP8/NVFP4 only)")
    return ap


def _load_model(args):
    """Load the causal LM named by ``args.model`` in bfloat16 and place it on GPU."""
    model_kwargs = {
        "trust_remote_code": args.trust_remote_code,
        "torch_dtype": torch.bfloat16,
    }
    if args.use_seq_device_map:
        # NOTE(review): the flag name and help text say "sequential", but the
        # value passed has always been "auto". Left unchanged on purpose;
        # confirm which placement strategy is actually wanted.
        model_kwargs["device_map"] = "auto"

    model = AutoModelForCausalLM.from_pretrained(args.model, **model_kwargs)
    if not args.use_seq_device_map:
        # Without a device map the weights are loaded on the host, so move
        # the whole model to the current CUDA device.
        model = model.cuda()
    return model


def main():
    """Run PTQ end to end: parse args, load, calibrate, quantize, export."""
    args = _build_arg_parser().parse_args()

    print("=== Model Optimizer PTQ ===")
    print(f" Model: {args.model}")
    print(f" QFormat: {args.qformat}")
    print(f" KV Cache: {args.kv_cache_qformat}")
    print(f" TP: {args.tp}")
    print(f" Calib: {args.calib_size} samples, seq_len={args.calib_seq}")
    print()

    # Seed everything
    random.seed(_SEED)
    np.random.seed(_SEED)
    torch.manual_seed(_SEED)

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        trust_remote_code=args.trust_remote_code,
        padding_side="left",
    )
    if tokenizer.pad_token is None:
        # Batched calibration needs a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading model...")
    model = _load_model(args)

    # Build calibration forward loop
    print("Building calibration dataset...")
    forward_loop = create_forward_loop(
        model,
        dataloader=get_dataloader(
            tokenizer=tokenizer,
            calib_size=args.calib_size,
            batch_size=args.batch_size,
            calib_seq=args.calib_seq,
            # Put calibration batches on the same device as the model input.
            device=model.device,
        ),
    )

    # Quantize
    quant_cfg = QUANT_CONFIGS[args.qformat]
    print(f"Running PTQ with {args.qformat}...")
    t0 = time.time()
    model = mtq.quantize(model, quant_cfg, forward_loop)
    elapsed = time.time() - t0
    print(f"Quantization complete in {elapsed/60:.1f} min")

    # Export
    print(f"Exporting to {args.export_dir} ...")
    with torch.inference_mode():
        # The export directory must be passed by keyword: the second
        # positional parameter of export_hf_checkpoint is the dtype. The
        # function takes no tokenizer / TRT-LLM plugin arguments.
        export_hf_checkpoint(model, export_dir=args.export_dir)
    # The tokenizer is saved separately so the output directory is a
    # self-contained HuggingFace checkpoint.
    tokenizer.save_pretrained(args.export_dir)
    print(f"Done. Output at {args.export_dir}")


def get_dataloader(tokenizer, calib_size, batch_size, calib_seq, device=None):
    """Create calibration dataloader using modelopt's built-in dataset utils.

    Args:
        tokenizer: Tokenizer used to encode the calibration samples.
        calib_size: Number of calibration samples, either an int or a
            sequence of ints. Only the first entry of a sequence is used,
            because a single (default) dataset is requested.
        batch_size: Calibration batch size.
        calib_seq: Maximum calibration sequence length in tokens.
        device: Optional device for the produced batches. ``None`` leaves
            placement to modelopt's default.

    Returns:
        The dataloader built by ``get_dataset_dataloader``.
    """
    from modelopt.torch.utils.dataset_utils import get_dataset_dataloader

    if isinstance(calib_size, (list, tuple)):
        num_samples = calib_size[0]
    else:
        num_samples = calib_size

    return get_dataset_dataloader(
        tokenizer=tokenizer,
        num_samples=num_samples,
        batch_size=batch_size,
        # modelopt names the sequence-length cap `max_sample_length`;
        # `seq_len` is not an accepted keyword.
        max_sample_length=calib_seq,
        device=device,
    )


if __name__ == "__main__":
    main()