Files
deepseek-v4-quant/quantize_modelopt.py

159 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""NVIDIA Model Optimizer PTQ for DeepSeek V4 Pro → NVFP4.
Uses nvidia-modelopt's official PTQ pipeline with the NVFP4 experts-only config
(--qformat nvfp4_experts_only), which quantizes only the MoE expert layers while
keeping attention QKV in higher precision — the recommended approach for
DeepSeek MoE models.
Output is a Unified HuggingFace checkpoint deployable on TRT-LLM / vLLM / SGLang.
Usage:
python quantize_modelopt.py \
--model /root/nvidia-meeting/DeepSeek-V4-Pro \
--export_dir /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4-modelopt \
--qformat nvfp4_experts_only \
--tp 8 \
--calib_size 256
For the FP8 source variant, just change the --model path; modelopt handles
dequantization internally.
"""
import argparse
import os
import random
import time
import numpy as np
import torch
import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.utils.dataset_utils import create_forward_loop
from transformers import AutoModelForCausalLM, AutoTokenizer
# Switch on ModelOpt's HuggingFace checkpointing support at import time, i.e.
# before main() loads any model.
mto.enable_huggingface_checkpointing()

# Lookup table from each accepted --qformat value to the ModelOpt quantization
# config object it selects. Insertion order is the order argparse lists the
# choices in.
QUANT_CONFIGS = dict(
    nvfp4=mtq.NVFP4_DEFAULT_CFG,
    nvfp4_experts_only=mtq.NVFP4_EXPERTS_ONLY_CFG,
    nvfp4_mlp_only=mtq.NVFP4_MLP_ONLY_CFG,
    nvfp4_omlp_only=mtq.NVFP4_OMLP_ONLY_CFG,
    fp8=mtq.FP8_DEFAULT_CFG,
)
def main():
    """Quantize a HuggingFace checkpoint with ModelOpt PTQ and export the result.

    Parses the command line, seeds the Python/NumPy/PyTorch RNGs, loads the
    tokenizer and model from ``--model``, builds a calibration dataloader,
    runs ``mtq.quantize`` with the ``QUANT_CONFIGS`` entry chosen by
    ``--qformat``, and writes the quantized checkpoint plus the tokenizer
    files to ``--export_dir``.
    """
    ap = argparse.ArgumentParser(description="Model Optimizer PTQ for DeepSeek V4 Pro")
    ap.add_argument("--model", required=True, help="Path to HF model (BF16 or FP8)")
    ap.add_argument("--export_dir", required=True,
                    help="Output directory for quantized checkpoint")
    ap.add_argument("--qformat", default="nvfp4_experts_only",
                    choices=list(QUANT_CONFIGS),
                    help="Quantization format (default: nvfp4_experts_only for MoE)")
    ap.add_argument("--kv_cache_qformat", default="fp8_cast",
                    help="KV cache quantization (default: fp8_cast, fast no-calib)")
    ap.add_argument("--tp", type=int, default=8, help="Tensor parallelism for export")
    ap.add_argument("--calib_size", type=int, nargs="+", default=[256],
                    help="Calibration dataset size (per dataset)")
    ap.add_argument("--batch_size", type=int, default=1, help="Calibration batch size")
    ap.add_argument("--calib_seq", type=int, default=4096,
                    help="Max calibration sequence length")
    ap.add_argument("--trust_remote_code", action="store_true", default=True,
                    help="Trust remote code (required for V4)")
    # --trust_remote_code is store_true with default=True, so by itself it can
    # never be switched off; this companion flag provides the opt-out.
    ap.add_argument("--no_trust_remote_code", dest="trust_remote_code",
                    action="store_false",
                    help="Do not run custom code shipped with the checkpoint")
    ap.add_argument("--use_seq_device_map", action="store_true",
                    help="Use sequential device map for low-memory calibration")
    ap.add_argument("--low_memory_mode", action="store_true",
                    help="Compress weights before calibration (FP8/NVFP4 only)")
    args = ap.parse_args()

    print("=== Model Optimizer PTQ ===")
    print(f" Model: {args.model}")
    print(f" QFormat: {args.qformat}")
    print(f" KV Cache: {args.kv_cache_qformat}")
    print(f" TP: {args.tp}")
    print(f" Calib: {args.calib_size} samples, seq_len={args.calib_seq}")
    # These three options are parsed (two are echoed above) but nothing below
    # consumes them; say so rather than let the banner imply they take effect.
    print(" NOTE: --kv_cache_qformat, --tp and --low_memory_mode are accepted "
          "but not applied by this script.")
    print()

    # Seed everything
    random.seed(1234)
    np.random.seed(1234)
    torch.manual_seed(1234)

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        trust_remote_code=args.trust_remote_code,
        padding_side="left",
    )
    # Batched calibration needs a pad token; fall back to EOS when none is set.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading model...")
    model_kwargs = {
        "trust_remote_code": args.trust_remote_code,
        "torch_dtype": torch.bfloat16,
    }
    if args.use_seq_device_map:
        # NOTE(review): the flag name and help text say "sequential", but the
        # value passed is "auto". Left unchanged; confirm the intended mapping.
        model_kwargs["device_map"] = "auto"
    model = AutoModelForCausalLM.from_pretrained(args.model, **model_kwargs)
    if not args.use_seq_device_map:
        # Without a device map the whole model is moved to the current CUDA device.
        model = model.cuda()

    # Build calibration dataloader
    print("Building calibration dataset...")
    calib_dataloader = get_dataloader(
        tokenizer=tokenizer,
        calib_size=args.calib_size,
        batch_size=args.batch_size,
        calib_seq=args.calib_seq,
    )

    # Wrap the dataloader in a calibration forward loop. It must be passed by
    # keyword: create_forward_loop's first positional parameter is the model,
    # not the dataloader.
    forward_loop = create_forward_loop(dataloader=calib_dataloader)

    # Quantize
    quant_cfg = QUANT_CONFIGS[args.qformat]
    print(f"Running PTQ with {args.qformat}...")
    t0 = time.time()
    model = mtq.quantize(model, quant_cfg, forward_loop)
    elapsed = time.time() - t0
    print(f"Quantization complete in {elapsed/60:.1f} min")

    # Export
    print(f"Exporting to {args.export_dir} ...")
    with torch.inference_mode():
        # The output directory goes in by keyword; the second positional slot
        # of export_hf_checkpoint is the dtype, and it takes no tokenizer or
        # TRT-LLM plugin arguments.
        export_hf_checkpoint(model, export_dir=args.export_dir)
    # Ship the tokenizer files alongside the weights so the checkpoint
    # directory is self-contained.
    tokenizer.save_pretrained(args.export_dir)
    print(f"Done. Output at {args.export_dir}")
def get_dataloader(tokenizer, calib_size, batch_size, calib_seq, device=None):
    """Create the calibration dataloader using modelopt's built-in dataset utils.

    Args:
        tokenizer: HuggingFace tokenizer used to encode the calibration text.
        calib_size: Number of calibration samples, either a plain int or a
            sequence (as produced by ``nargs="+"``) whose first element is
            used. Only one dataset is requested, so further elements are
            ignored.
        batch_size: Calibration batch size.
        calib_seq: Maximum calibration sequence length in tokens.
        device: Device the encoded batches are placed on. When ``None``,
            CUDA is used if it is available; otherwise ``None`` is forwarded
            unchanged.

    Returns:
        The dataloader returned by ``get_dataset_dataloader``.
    """
    from modelopt.torch.utils.dataset_utils import get_dataset_dataloader

    # Accept both `--calib_size 256` style lists and a bare integer.
    num_samples = calib_size if isinstance(calib_size, int) else calib_size[0]

    # The caller runs the model on CUDA; without an explicit device the
    # encoded batches would not be moved to it, so default to CUDA here.
    if device is None and torch.cuda.is_available():
        device = "cuda"

    return get_dataset_dataloader(
        tokenizer=tokenizer,
        num_samples=num_samples,
        batch_size=batch_size,
        max_sample_length=calib_seq,
        device=device,
    )
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()