#!/usr/bin/env python3
"""
ModelOpt NVFP4 quantization — full model.

Quantizes ALL weights (attention + experts + shared MLP) to NVFP4.
Requires a pure BF16 source model (from scripts/dequant_fp8_to_bf16.py)
to avoid FP8/FP4 kernel issues on Blackwell GPUs.

Available NVFP4 quantization strategies (from modelopt huggingface_example.sh):
- nvfp4 : Full model NVFP4 quantization (this script)
- nvfp4_experts_only : Only MoE expert weights
- nvfp4_mlp_only : Only MLP layers (experts + shared MLP)
- nvfp4_omlp_only : Only output + MLP layers
- nvfp4_awq : NVFP4 with AWQ calibration
- nvfp4_mse : NVFP4 with MSE calibration
- w4a8_nvfp4_fp8 : W4A8 NVFP4 weights + FP8 activations
- w4a8_mxfp4_fp8 : W4A8 MXFP4 weights + FP8 activations
- nvfp4_svdquant : NVFP4 with SVDQuant
- nvfp4_local_hessian : NVFP4 with local Hessian calibration

Strategy: Copy this file to model_opt_nvfp4_<strategy>.py and tweak as needed.
By the end, we'll have working quantized weights for each successful strategy.

Output dir naming: DeepSeek-V4-Pro_NVFP4-<strategy>_kv_fp8_cast

Credentials: the Hugging Face token is read from the HF_TOKEN (or
HUGGING_FACE_HUB_TOKEN) environment variable. It is never stored in this file,
never placed on a command line, and never written to the log.
"""

import os
import shlex
import subprocess
import sys
from typing import Dict, Mapping, Optional, Sequence

# ── Config ──────────────────────────────────────────────────────────────────
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"  # Dequantized BF16 (from scripts/dequant_fp8_to_bf16.py)
QUANT = "nvfp4"
TP = 8
CALIB = 128
KV_CACHE_QUANT = "fp8_cast"

# 3TB BF16 model can't fit on 8×B200 VRAM (~1.4TB total)
# Use seq_device_map: loads model into CPU RAM, moves layers to GPU for forward passes
# 2.8TB RAM is enough for the 3TB model (with memory-mapped loading)
EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map --gpu_max_mem_percentage 0.7"

# HF token for gated calibration datasets (nvidia/Nemotron-Post-Training-Dataset-v2).
# SECURITY: an earlier revision of this file embedded a literal token here. That
# token must be treated as leaked and revoked; supply a fresh one through the
# environment instead (e.g. `HF_TOKEN=... python3 this_script.py`).
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or ""

# Output dir follows modelopt convention: <model>_<quant>_kv_<kv_quant>
# We override the model name to make the strategy clear
# NOTE(review): this name is only printed; it is not passed to the PTQ script
# below, so the script decides the real output directory — confirm they match.
OUTPUT_NAME = f"DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}"

SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
VENV_ACTIVATE = "/root/nvidia-meeting/venv/bin/activate"
LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"


# ── Helpers ─────────────────────────────────────────────────────────────────
def mask_token(token: str, keep: int = 4) -> str:
    """Return a printable stand-in for *token* that does not reveal it.

    Args:
        token: The secret value; may be empty.
        keep: Number of trailing characters left visible so an operator can
            tell which token is in use.

    Returns:
        ``"<not set>"`` for an empty token, a fully masked string when the
        token is too short to show a suffix safely, otherwise eight asterisks
        followed by the last *keep* characters.
    """
    if not token:
        return "<not set>"
    if keep <= 0 or len(token) <= keep * 2:
        return "*" * 8
    return "*" * 8 + token[-keep:]


def build_command(
    model: str = MODEL,
    quant: str = QUANT,
    tp: int = TP,
    calib: int = CALIB,
    kv_cache_quant: str = KV_CACHE_QUANT,
    extra_flags: str = EXTRA_FLAGS,
    venv_activate: str = VENV_ACTIVATE,
) -> str:
    """Build the shell snippet that activates the venv and runs the PTQ script.

    Every interpolated value is shell-quoted. The snippet contains no
    credentials and no ``| tee`` pipeline: secrets travel through the process
    environment and log capture is done by :func:`run_and_tee`, so the exit
    status seen by the caller is the PTQ script's own.

    Args:
        model: Path to the BF16 source checkpoint.
        quant: Quantization strategy name passed as ``--quant``.
        tp: Tensor-parallel size passed as ``--tp``.
        calib: Calibration size passed as ``--calib``.
        kv_cache_quant: KV-cache quantization mode passed as ``--kv_cache_quant``.
        extra_flags: Additional flags as one shell-style string; it is split
            with ``shlex.split`` and each piece re-quoted.
        venv_activate: Path to the virtualenv ``activate`` script to source.

    Returns:
        A single-line command string meant to be run from the PTQ examples
        directory (paths to the script are relative).
    """
    script_args = [
        "bash",
        "scripts/huggingface_example.sh",
        "--model", str(model),
        "--quant", str(quant),
        "--tp", str(tp),
        "--calib", str(calib),
        "--kv_cache_quant", str(kv_cache_quant),
    ]
    script_args.extend(shlex.split(extra_flags))
    quoted_args = " ".join(shlex.quote(arg) for arg in script_args)
    return (
        f". {shlex.quote(venv_activate)} && "
        f"PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True {quoted_args}"
    )


def build_env(
    token: str = "", base_env: Optional[Mapping[str, str]] = None
) -> Dict[str, str]:
    """Return the environment for the child process.

    Args:
        token: Hugging Face token. When empty, no token variables are added,
            so whatever *base_env* already provides is left untouched.
        base_env: Environment to start from; defaults to ``os.environ``.

    Returns:
        A new dict (the input mapping is not mutated) with ``HF_TOKEN`` and
        ``HUGGING_FACE_HUB_TOKEN`` set when *token* is non-empty.
    """
    env = dict(os.environ if base_env is None else base_env)
    if token:
        env["HF_TOKEN"] = token
        env["HUGGING_FACE_HUB_TOKEN"] = token
    return env


def run_and_tee(
    argv: Sequence[str],
    log_path: str,
    cwd: Optional[str] = None,
    env: Optional[Mapping[str, str]] = None,
) -> int:
    """Run *argv*, mirroring its combined stdout/stderr to stdout and a log file.

    This replaces the shell ``cmd 2>&1 | tee log`` pipeline. A pipeline reports
    the status of its last command (``tee``), which hid failures of the real
    job; here the child's own return code is returned.

    Args:
        argv: Command and arguments, executed without an intermediate shell.
        log_path: File that receives a byte-for-byte copy of the output. It is
            truncated first, matching ``tee`` without ``-a``.
        cwd: Working directory for the child, or ``None`` to inherit.
        env: Environment for the child, or ``None`` to inherit.

    Returns:
        The child's return code (negative if it was killed by a signal).

    Raises:
        OSError: If the log file cannot be opened or the child cannot be started.
    """
    # Bytes are forwarded untouched so carriage-return progress bars survive.
    binary_stdout = getattr(sys.stdout, "buffer", None)
    with open(log_path, "wb") as log, subprocess.Popen(
        list(argv),
        cwd=cwd,
        env=None if env is None else dict(env),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ) as proc:
        for chunk in iter(lambda: proc.stdout.read1(65536), b""):
            log.write(chunk)
            log.flush()
            if binary_stdout is not None:
                binary_stdout.write(chunk)
                binary_stdout.flush()
            else:
                # stdout was replaced by a text-only stream; decode leniently.
                sys.stdout.write(chunk.decode("utf-8", errors="replace"))
                sys.stdout.flush()
    return proc.returncode


# ── Run ─────────────────────────────────────────────────────────────────────
def main() -> int:
    """Print the run configuration, launch quantization, and return its status.

    Returns:
        The PTQ script's exit status; ``128 + N`` if the child was killed by
        signal ``N``; ``1`` if the child or the log file could not be set up.
    """
    cmd = build_command()

    print(f"Running: {QUANT} quantization on {MODEL}")
    print(f"Output: {OUTPUT_NAME}")
    print(f"Log: {LOG_FILE}")
    print(f"HF_TOKEN: {mask_token(HF_TOKEN)}")
    print(f"Command (cwd={SCRIPT_DIR}):\n{cmd}\n")
    if not HF_TOKEN:
        print(
            "warning: HF_TOKEN is not set; gated calibration datasets may fail "
            "to download",
            file=sys.stderr,
        )
    # Flush our own text before the child's raw bytes are interleaved.
    sys.stdout.flush()

    try:
        status = run_and_tee(
            ["bash", "-c", cmd],
            LOG_FILE,
            cwd=SCRIPT_DIR,
            env=build_env(HF_TOKEN),
        )
    except OSError as exc:
        print(f"error: could not launch quantization: {exc}", file=sys.stderr)
        return 1

    # Mirror the shell convention for signal deaths instead of a negative code.
    return 128 - status if status < 0 else status


if __name__ == "__main__":
    sys.exit(main())