deepseek-v4-quant/quantize_modelopt.py

#!/usr/bin/env python3
"""NVIDIA Model Optimizer PTQ for DeepSeek V4 Pro → NVFP4.

Uses nvidia-modelopt's official PTQ pipeline with the NVFP4 experts-only config,
which quantizes only MoE expert layers while keeping attention QKV in higher
precision — the recommended approach for DeepSeek MoE models.

Output is a Unified HuggingFace checkpoint deployable on TRT-LLM / vLLM / SGLang.

Usage:
    python quantize_modelopt.py \
        --model /root/nvidia-meeting/DeepSeek-V4-Pro \
        --export_dir /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4-modelopt \
        --qformat nvfp4_experts_only \
        --tp 8 \
        --calib_size 256

For the FP8 source variant, just change --model path. modelopt handles
dequantization internally.
"""

import argparse
import os
import random
import time

import numpy as np
import torch

import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.utils.dataset_utils import create_forward_loop

from transformers import AutoModelForCausalLM, AutoTokenizer


# Module-level side effect: turn on modelopt's HuggingFace checkpointing
# integration before any model is loaded or saved.
mto.enable_huggingface_checkpointing()


# Maps each accepted --qformat value to the modelopt quantization config that
# is handed to mtq.quantize(). The key order is also the order in which the
# choices are listed in the CLI help.
QUANT_CONFIGS = dict(
    nvfp4=mtq.NVFP4_DEFAULT_CFG,
    nvfp4_experts_only=mtq.NVFP4_EXPERTS_ONLY_CFG,
    nvfp4_mlp_only=mtq.NVFP4_MLP_ONLY_CFG,
    nvfp4_omlp_only=mtq.NVFP4_OMLP_ONLY_CFG,
    fp8=mtq.FP8_DEFAULT_CFG,
)


def main():
    """Run post-training quantization on a HF checkpoint and export the result.

    Pipeline: parse CLI arguments, seed the RNGs, load the tokenizer and the
    model in bfloat16, build a calibration forward loop, call
    ``mtq.quantize`` with the config selected by ``--qformat``, then export a
    HuggingFace-style checkpoint (plus the tokenizer files) to ``--export_dir``.

    Exits via argparse (``SystemExit``) on invalid command-line arguments.
    """
    ap = argparse.ArgumentParser(description="Model Optimizer PTQ for DeepSeek V4 Pro")
    ap.add_argument("--model", required=True, help="Path to HF model (BF16 or FP8)")
    ap.add_argument("--export_dir", required=True, help="Output directory for quantized checkpoint")
    ap.add_argument("--qformat", default="nvfp4_experts_only",
                    choices=list(QUANT_CONFIGS.keys()),
                    help="Quantization format (default: nvfp4_experts_only for MoE)")
    ap.add_argument("--kv_cache_qformat", default="fp8_cast",
                    help="KV cache quantization (default: fp8_cast, fast no-calib)")
    ap.add_argument("--tp", type=int, default=8, help="Tensor parallelism for export")
    ap.add_argument("--calib_size", type=int, nargs="+", default=[256],
                    help="Calibration dataset size (per dataset)")
    ap.add_argument("--batch_size", type=int, default=1, help="Calibration batch size")
    ap.add_argument("--calib_seq", type=int, default=4096, help="Max calibration sequence length")
    # BooleanOptionalAction keeps `--trust_remote_code` working and adds
    # `--no-trust_remote_code`; with store_true + default=True the flag could
    # never be turned off.
    ap.add_argument("--trust_remote_code", action=argparse.BooleanOptionalAction, default=True,
                    help="Trust remote code (required for V4)")
    ap.add_argument("--use_seq_device_map", action="store_true",
                    help="Use sequential device map for low-memory calibration")
    ap.add_argument("--low_memory_mode", action="store_true",
                    help="Compress weights before calibration (FP8/NVFP4 only)")
    args = ap.parse_args()

    print("=== Model Optimizer PTQ ===")
    print(f"  Model:    {args.model}")
    print(f"  QFormat:  {args.qformat}")
    print(f"  KV Cache: {args.kv_cache_qformat}")
    print(f"  TP:       {args.tp}")
    print(f"  Calib:    {args.calib_size} samples, seq_len={args.calib_seq}")
    # Nothing below consumes these options; say so instead of letting the
    # banner imply that they influence the produced checkpoint.
    print("  NOTE: --kv_cache_qformat, --tp and --low_memory_mode are accepted "
          "but not applied by this script.")
    if len(args.calib_size) > 1:
        print(f"  WARNING: only the first --calib_size value ({args.calib_size[0]}) is used.")
    print()

    # Seed everything
    random.seed(1234)
    np.random.seed(1234)
    torch.manual_seed(1234)

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        trust_remote_code=args.trust_remote_code,
        padding_side="left",
    )
    if tokenizer.pad_token is None:
        # Batched calibration needs a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading model...")
    model_kwargs = {
        "trust_remote_code": args.trust_remote_code,
        "torch_dtype": torch.bfloat16,
    }
    if args.use_seq_device_map:
        # The flag promises a *sequential* device map, so request exactly that
        # rather than "auto".
        model_kwargs["device_map"] = "sequential"

    model = AutoModelForCausalLM.from_pretrained(args.model, **model_kwargs)

    if not args.use_seq_device_map:
        model = model.cuda()

    # Build the calibration forward loop (a callable that feeds the calibration
    # batches through the model; this is what mtq.quantize expects).
    # NOTE(review): the batches are not explicitly moved to the model's device
    # here -- confirm calibration inputs land on the right device when the
    # model is placed with .cuda().
    print("Building calibration dataset...")
    forward_loop = create_forward_loop(
        model,
        dataloader=get_dataloader(
            tokenizer=tokenizer,
            calib_size=args.calib_size,
            batch_size=args.batch_size,
            calib_seq=args.calib_seq,
        ),
    )

    # Quantize
    quant_cfg = QUANT_CONFIGS[args.qformat]
    print(f"Running PTQ with {args.qformat}...")
    t0 = time.time()

    model = mtq.quantize(model, quant_cfg, forward_loop)

    elapsed = time.time() - t0
    print(f"Quantization complete in {elapsed/60:.1f} min")

    # Export
    print(f"Exporting to {args.export_dir} ...")
    with torch.inference_mode():
        # export_dir must be passed by keyword: the second positional
        # parameter of export_hf_checkpoint is the dtype, not the directory.
        export_hf_checkpoint(model, export_dir=args.export_dir)
    # The exporter writes model weights/config only; save the tokenizer files
    # next to them so the output directory is self-contained.
    tokenizer.save_pretrained(args.export_dir)

    print(f"Done. Output at {args.export_dir}")


def get_dataloader(tokenizer, calib_size, batch_size, calib_seq, device=None):
    """Create the calibration dataloader using modelopt's built-in dataset utils.

    Args:
        tokenizer: Tokenizer used to encode the calibration samples.
        calib_size: Number of calibration samples, either a single int or a
            sequence of ints (as produced by ``nargs="+"``). Only the first
            entry of a sequence is used because a single (default) dataset is
            requested.
        batch_size: Calibration batch size.
        calib_seq: Maximum tokenized length of a calibration sample.
        device: Optional device forwarded to modelopt so the batches are
            created there; omitted from the call when ``None``.

    Returns:
        The dataloader returned by modelopt's ``get_dataset_dataloader``.

    Raises:
        ValueError: If ``calib_size`` is empty or its sample count is not
            positive.
    """
    # Validate before importing modelopt so bad arguments fail fast.
    if isinstance(calib_size, int):
        num_samples = calib_size
    else:
        sizes = list(calib_size)
        if not sizes:
            raise ValueError("calib_size must contain at least one sample count")
        num_samples = sizes[0]
    if num_samples <= 0:
        raise ValueError(f"calib_size must be positive, got {num_samples}")

    from modelopt.torch.utils.dataset_utils import get_dataset_dataloader

    # Only forward `device` when the caller supplied one, so the default call
    # is unchanged.
    extra_kwargs = {} if device is None else {"device": device}

    # modelopt names the length limit `max_sample_length`; `seq_len` is not an
    # accepted keyword.
    return get_dataset_dataloader(
        tokenizer=tokenizer,
        batch_size=batch_size,
        num_samples=num_samples,
        max_sample_length=calib_seq,
        **extra_kwargs,
    )


if __name__ == "__main__":
    main()