Add full nvfp4 quantization script + complete dequant script

- model_opt_nvfp4_full.py: Full NVFP4 quantization (not experts-only) Uses --gpu_max_mem_percentage 0.9 instead of --use_seq_device_map - dequant_fp8_to_bf16.py: Now handles INT4-packed experts + FP8 shared experts + FP8 attention. Complete dequant to pure BF16.
2026-05-08 01:50:53 +00:00
parent db6beb5b76
commit b5d569218c
1 changed files with 67 additions and 0 deletions
--- a/scripts/model_opt_nvfp4_full.py
+++ b/scripts/model_opt_nvfp4_full.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""
+ModelOpt NVFP4 quantization — full model.
+
+Quantizes ALL weights (attention + experts + shared MLP) to NVFP4.
+Requires a pure BF16 source model (from scripts/dequant_fp8_to_bf16.py)
+to avoid FP8/INT4 kernel issues on Blackwell GPUs.
+
+Available NVFP4 quantization strategies (from modelopt huggingface_example.sh):
+  - nvfp4               : Full model NVFP4 quantization (this script)
+  - nvfp4_experts_only  : Only MoE expert weights
+  - nvfp4_mlp_only      : Only MLP layers (experts + shared MLP)
+  - nvfp4_omlp_only     : Only output + MLP layers
+  - nvfp4_awq           : NVFP4 with AWQ calibration
+  - nvfp4_mse           : NVFP4 with MSE calibration
+  - w4a8_nvfp4_fp8      : W4A8 NVFP4 weights + FP8 activations
+  - w4a8_mxfp4_fp8      : W4A8 MXFP4 weights + FP8 activations
+  - nvfp4_svdquant      : NVFP4 with SVDQuant
+  - nvfp4_local_hessian : NVFP4 with local Hessian calibration
+
+Strategy: Copy this file to model_opt_nvfp4_<strategy>.py and tweak as needed.
+By the end, we'll have working quantized weights for each successful strategy.
+
+Output dir naming: DeepSeek-V4-Pro_NVFP4-<strategy>_kv_fp8_cast
+"""
+
+import subprocess
+import sys
+import os
+
+# ── Config ──────────────────────────────────────────────────────────────────
+MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"  # Dequantized BF16 (from scripts/dequant_fp8_to_bf16.py)
+QUANT = "nvfp4"
+TP = 8
+CALIB = 256
+KV_CACHE_QUANT = "fp8_cast"
+
+# No --use_seq_device_map (causes OOM on 2.8TB RAM with 782GB+ model)
+# Use gpu_max_mem_percentage to keep model on GPU, reduce CPU RAM pressure
+EXTRA_FLAGS = "--trust_remote_code --gpu_max_mem_percentage 0.9"
+
+# Output dir follows modelopt convention: <model>_<quant>_kv_<kv_quant>
+# We override the model name to make the strategy clear
+OUTPUT_NAME = f"DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}"
+
+SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
+LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"
+
+# ── Run ─────────────────────────────────────────────────────────────────────
+cmd = f"""cd {SCRIPT_DIR} && \\
+source /root/nvidia-meeting/venv/bin/activate && \\
+PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
+bash scripts/huggingface_example.sh \\
+    --model {MODEL} \\
+    --quant {QUANT} \\
+    --tp {TP} \\
+    --calib {CALIB} \\
+    --kv_cache_quant {KV_CACHE_QUANT} \\
+    {EXTRA_FLAGS} 2>&1 | tee {LOG_FILE}"""
+
+print(f"Running: {QUANT} quantization on {MODEL}")
+print(f"Output: {OUTPUT_NAME}")
+print(f"Log: {LOG_FILE}")
+print(f"Command:\n{cmd}\n")
+
+ret = subprocess.call(cmd, shell=True)
+sys.exit(ret)