#!/usr/bin/env python3
"""
ModelOpt NVFP4 quantization — full model.

Quantizes ALL weights (attention + experts + shared MLP) to NVFP4.
Requires a pure BF16 source model (from scripts/dequant_fp8_to_bf16.py)
to avoid FP8/FP4 kernel issues on Blackwell GPUs.

Available NVFP4 quantization strategies (from modelopt huggingface_example.sh):
  - nvfp4               : Full model NVFP4 quantization (this script)
  - nvfp4_experts_only  : Only MoE expert weights
  - nvfp4_mlp_only      : Only MLP layers (experts + shared MLP)
  - nvfp4_omlp_only     : Only output + MLP layers
  - nvfp4_awq           : NVFP4 with AWQ calibration
  - nvfp4_mse           : NVFP4 with MSE calibration
  - w4a8_nvfp4_fp8      : W4A8 NVFP4 weights + FP8 activations
  - w4a8_mxfp4_fp8      : W4A8 MXFP4 weights + FP8 activations
  - nvfp4_svdquant      : NVFP4 with SVDQuant
  - nvfp4_local_hessian : NVFP4 with local Hessian calibration

Strategy: Copy this file to model_opt_nvfp4_<strategy>.py and tweak as needed.
By the end, we'll have working quantized weights for each successful strategy.

Output dir naming: DeepSeek-V4-Pro_NVFP4-<quant>_kv_fp8_cast

Authentication: export HF_TOKEN in the environment before running (or log in
with the Hugging Face CLI). The token is deliberately NOT stored in this file.
"""

import os
import shlex
import subprocess
import sys

# ── Config ──────────────────────────────────────────────────────────────────
# Dequantized BF16 checkpoint (from scripts/dequant_fp8_to_bf16.py)
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
QUANT = "nvfp4"
TP = 8
CALIB = 128
KV_CACHE_QUANT = "fp8_cast"

# 3TB BF16 model can't fit on 8×B200 VRAM (~1.4TB total).
# Use seq_device_map: loads model into CPU RAM, moves layers to GPU for
# forward passes. 2.8TB RAM is enough for the 3TB model (with memory-mapped
# loading).
EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map --gpu_max_mem_percentage 0.7"

# HF token for gated calibration datasets
# (nvidia/Nemotron-Post-Training-Dataset-v2).
# SECURITY: read from the environment, never hard-code it. A token that was
# previously committed in this file must be treated as compromised and revoked.
# The child process inherits HF_TOKEN from our environment, so it never needs
# to appear in the (printed and logged) command line.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Human-readable output name of the form
# DeepSeek-V4-Pro_NVFP4-<quant>_kv_<kv_cache_quant>, which makes the strategy
# clear. NOTE(review): this value is only printed below; it is not passed to
# huggingface_example.sh, so the real export directory is whatever that script
# chooses — confirm they agree.
OUTPUT_NAME = f"DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}"

SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
VENV_ACTIVATE = "/root/nvidia-meeting/venv/bin/activate"
LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"


# ── Run ─────────────────────────────────────────────────────────────────────
def build_command() -> str:
    """Return the bash script that runs the ModelOpt PTQ example.

    The script changes into SCRIPT_DIR, activates the virtualenv and runs
    scripts/huggingface_example.sh with the configured options, mirroring the
    combined stdout/stderr into LOG_FILE through ``tee``.

    ``set -o pipefail`` makes the pipeline's exit status reflect a failure of
    the quantization step; without it the status would always be ``tee``'s.
    Paths are shell-quoted. EXTRA_FLAGS is inserted verbatim because it is a
    list of flags that must be word-split by the shell. No secret is embedded
    in the returned string.
    """
    return (
        "set -o pipefail\n"
        f"cd {shlex.quote(SCRIPT_DIR)} && \\\n"
        f". {shlex.quote(VENV_ACTIVATE)} && \\\n"
        "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\\n"
        "bash scripts/huggingface_example.sh \\\n"
        f"    --model {shlex.quote(MODEL)} \\\n"
        f"    --quant {shlex.quote(QUANT)} \\\n"
        f"    --tp {TP} \\\n"
        f"    --calib {CALIB} \\\n"
        f"    --kv_cache_quant {shlex.quote(KV_CACHE_QUANT)} \\\n"
        f"    {EXTRA_FLAGS} 2>&1 | tee {shlex.quote(LOG_FILE)}"
    )


def main() -> int:
    """Print the run summary, launch the quantization and return its status.

    Returns:
        The exit status of the bash pipeline (non-zero when the quantization
        script or ``tee`` fails).
    """
    if not HF_TOKEN:
        # Only warn: a cached Hugging Face CLI login may still grant access.
        print(
            "WARNING: HF_TOKEN is not set; downloading gated calibration "
            "datasets may fail.",
            file=sys.stderr,
        )

    cmd = build_command()

    print(f"Running: {QUANT} quantization on {MODEL}")
    print(f"Output:  {OUTPUT_NAME}")
    print(f"Log:     {LOG_FILE}")
    print(f"Command:\n{cmd}\n")

    # Run under bash explicitly: `set -o pipefail` is not guaranteed to be
    # supported by /bin/sh, which is what shell=True would use.
    return subprocess.call(["bash", "-c", cmd])


if __name__ == "__main__":
    sys.exit(main())