158 lines
5.2 KiB
Python
158 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""NVIDIA Model Optimizer PTQ for DeepSeek V4 Pro → NVFP4.
|
|
|
|
Uses nvidia-modelopt's official PTQ pipeline with NVFP4Experts-Only config,
|
|
which quantizes only MoE expert layers while keeping attention QKV in higher
|
|
precision — the recommended approach for DeepSeek MoE models.
|
|
|
|
Output is a Unified HuggingFace checkpoint deployable on TRT-LLM / vLLM / SGLang.
|
|
|
|
Usage:
|
|
python quantize_modelopt.py \
|
|
--model /root/nvidia-meeting/DeepSeek-V4-Pro \
|
|
--export_dir /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4-modelopt \
|
|
--qformat nvfp4_experts_only \
|
|
--tp 8 \
|
|
--calib_size 256
|
|
|
|
For the FP8 source variant, just change --model path. modelopt handles
|
|
dequantization internally.
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import random
|
|
import time
|
|
|
|
import numpy as np
|
|
import torch
|
|
|
|
import modelopt.torch.opt as mto
|
|
import modelopt.torch.quantization as mtq
|
|
from modelopt.torch.export import export_hf_checkpoint
|
|
from modelopt.torch.utils.dataset_utils import create_forward_loop
|
|
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
|
# Switch on modelopt's HuggingFace checkpointing integration once, at import time.
mto.enable_huggingface_checkpointing()

# Maps each accepted --qformat value to the modelopt quantization preset that
# main() hands to mtq.quantize(). Insertion order is the order shown in --help.
QUANT_CONFIGS = dict(
    nvfp4=mtq.NVFP4_DEFAULT_CFG,
    nvfp4_experts_only=mtq.NVFP4_EXPERTS_ONLY_CFG,
    nvfp4_mlp_only=mtq.NVFP4_MLP_ONLY_CFG,
    nvfp4_omlp_only=mtq.NVFP4_OMLP_ONLY_CFG,
    fp8=mtq.FP8_DEFAULT_CFG,
)
|
|
|
|
|
|
def main():
    """Run post-training quantization end to end from the command line.

    Steps, in order: parse CLI arguments, seed the Python / NumPy / PyTorch
    RNGs, load the tokenizer and model from ``--model``, build a calibration
    forward loop, call ``mtq.quantize`` with the preset selected by
    ``--qformat``, and export the quantized model together with the tokenizer
    to ``--export_dir``.

    Returns:
        None. Progress is reported on stdout; results are written to disk.
    """
    ap = argparse.ArgumentParser(description="Model Optimizer PTQ for DeepSeek V4 Pro")
    ap.add_argument("--model", required=True, help="Path to HF model (BF16 or FP8)")
    ap.add_argument("--export_dir", required=True, help="Output directory for quantized checkpoint")
    ap.add_argument("--qformat", default="nvfp4_experts_only",
                    choices=list(QUANT_CONFIGS.keys()),
                    help="Quantization format (default: nvfp4_experts_only for MoE)")
    ap.add_argument("--kv_cache_qformat", default="fp8_cast",
                    help="KV cache quantization (default: fp8_cast, fast no-calib)")
    ap.add_argument("--tp", type=int, default=8, help="Tensor parallelism for export")
    ap.add_argument("--calib_size", type=int, nargs="+", default=[256],
                    help="Calibration dataset size (per dataset)")
    ap.add_argument("--batch_size", type=int, default=1, help="Calibration batch size")
    ap.add_argument("--calib_seq", type=int, default=4096, help="Max calibration sequence length")
    # BooleanOptionalAction keeps the default (True) and the existing flag, and
    # adds --no-trust_remote_code; with store_true + default=True the option
    # could never be switched off.
    ap.add_argument("--trust_remote_code", action=argparse.BooleanOptionalAction, default=True,
                    help="Trust remote code (required for V4)")
    ap.add_argument("--use_seq_device_map", action="store_true",
                    help="Use sequential device map for low-memory calibration")
    ap.add_argument("--low_memory_mode", action="store_true",
                    help="Compress weights before calibration (FP8/NVFP4 only)")
    args = ap.parse_args()

    # NOTE(review): --kv_cache_qformat and --tp are only echoed below, and
    # --low_memory_mode is never read; nothing in this function applies them.
    print("=== Model Optimizer PTQ ===")
    print(f" Model: {args.model}")
    print(f" QFormat: {args.qformat}")
    print(f" KV Cache: {args.kv_cache_qformat}")
    print(f" TP: {args.tp}")
    print(f" Calib: {args.calib_size} samples, seq_len={args.calib_seq}")
    print()

    # Seed everything
    random.seed(1234)
    np.random.seed(1234)
    torch.manual_seed(1234)

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        trust_remote_code=args.trust_remote_code,
        padding_side="left",
    )
    if tokenizer.pad_token is None:
        # Batched calibration needs a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading model...")
    # Always place weights through a device map instead of loading everything
    # and then calling .cuda(), which would push the entire model onto a single
    # GPU. The flag selects the "sequential" strategy its name and help text
    # promise; the default is "auto".
    device_map = "sequential" if args.use_seq_device_map else "auto"
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        trust_remote_code=args.trust_remote_code,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
    )

    # Build calibration forward loop
    print("Building calibration dataset...")
    # create_forward_loop returns a callable (not a dataloader) that feeds the
    # calibration batches through the model; mtq.quantize invokes it.
    forward_loop = create_forward_loop(
        model,
        dataloader=get_dataloader(
            tokenizer=tokenizer,
            calib_size=args.calib_size,
            batch_size=args.batch_size,
            calib_seq=args.calib_seq,
        ),
    )

    # Quantize
    quant_cfg = QUANT_CONFIGS[args.qformat]
    print(f"Running PTQ with {args.qformat}...")
    t0 = time.time()

    model = mtq.quantize(model, quant_cfg, forward_loop)

    elapsed = time.time() - t0
    print(f"Quantization complete in {elapsed/60:.1f} min")

    # Export
    print(f"Exporting to {args.export_dir} ...")
    with torch.inference_mode():
        # Per modelopt's documented signature the second positional parameter
        # of export_hf_checkpoint is `dtype`, so the output path must be passed
        # by keyword. The function takes no tokenizer argument either.
        export_hf_checkpoint(model, export_dir=args.export_dir)
    # Save the tokenizer next to the weights so the checkpoint is self-contained.
    tokenizer.save_pretrained(args.export_dir)

    print(f"Done. Output at {args.export_dir}")
|
|
|
|
|
|
def get_dataloader(tokenizer, calib_size, batch_size, calib_seq):
    """Create the calibration dataloader using modelopt's built-in dataset utils.

    Args:
        tokenizer: Tokenizer forwarded to ``get_dataset_dataloader``.
        calib_size: Number of calibration samples. Either a plain int or a
            sequence of ints (the shape argparse produces for ``nargs="+"``).
            Only the first entry of a sequence is used, because no dataset
            name is passed and a single dataset is therefore requested.
        batch_size: Calibration batch size.
        calib_seq: Maximum length of each calibration sample, in tokens.

    Returns:
        The dataloader object returned by ``get_dataset_dataloader``.

    Raises:
        IndexError: If ``calib_size`` is an empty sequence.
    """
    from modelopt.torch.utils.dataset_utils import get_dataset_dataloader

    # Accept both a bare int and the one-element list produced by the CLI.
    num_samples = calib_size if isinstance(calib_size, int) else calib_size[0]

    return get_dataset_dataloader(
        tokenizer=tokenizer,
        num_samples=num_samples,
        batch_size=batch_size,
        # modelopt names this parameter `max_sample_length`; passing `seq_len`
        # raises TypeError (unexpected keyword argument).
        max_sample_length=calib_seq,
    )
|
|
|
|
|
|
# Script entry point: run the PTQ pipeline only when this file is executed
# directly, never as a side effect of being imported.
if __name__ == "__main__":
    main()
|