#!/usr/bin/env python3
"""
ModelOpt NVFP4 quantization — full model.

Quantizes ALL weights (attention + experts + shared MLP) to NVFP4.
Requires a pure BF16 source model (from scripts/dequant_fp8_to_bf16.py)
to avoid FP8/INT4 kernel issues on Blackwell GPUs.

Available NVFP4 quantization strategies (from modelopt huggingface_example.sh):
  - nvfp4               : Full model NVFP4 quantization (this script)
  - nvfp4_experts_only  : Only MoE expert weights
  - nvfp4_mlp_only      : Only MLP layers (experts + shared MLP)
  - nvfp4_omlp_only     : Only output + MLP layers
  - nvfp4_awq           : NVFP4 with AWQ calibration
  - nvfp4_mse           : NVFP4 with MSE calibration
  - w4a8_nvfp4_fp8      : W4A8 NVFP4 weights + FP8 activations
  - w4a8_mxfp4_fp8      : W4A8 MXFP4 weights + FP8 activations
  - nvfp4_svdquant      : NVFP4 with SVDQuant
  - nvfp4_local_hessian : NVFP4 with local Hessian calibration

Strategy: copy this file to model_opt_nvfp4_{strategy}.py and tweak the
config constants as needed. By the end, we'll have working quantized weights
for each successful strategy.

Output dir naming: DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}
(see OUTPUT_NAME below; it is only printed, not passed to the example script).

Usage: run this file directly. The process exit status is the status of the
quantization pipeline, so a failed run is reported as a failure even though
its output is piped through ``tee``.
"""

import os  # noqa: F401  (unused here; retained from the original script)
import shlex
import subprocess
import sys

# ── Config ──────────────────────────────────────────────────────────────────
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"  # Dequantized BF16 (from scripts/dequant_fp8_to_bf16.py)
QUANT = "nvfp4"
TP = 8
CALIB = 256
KV_CACHE_QUANT = "fp8_cast"

# No --use_seq_device_map (causes OOM on 2.8TB RAM with 782GB+ model)
# Use gpu_max_mem_percentage to keep model on GPU, reduce CPU RAM pressure
EXTRA_FLAGS = "--trust_remote_code --gpu_max_mem_percentage 0.9"

# Output dir is meant to follow the modelopt naming convention
# ({model}_{quant}_kv_{kv_cache_quant} — placeholders reconstructed, TODO
# confirm against huggingface_example.sh).
# We override the model name to make the strategy clear.
# NOTE: this value is informational only; nothing below passes it to the
# example script.
OUTPUT_NAME = f"DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}"

SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
VENV_ACTIVATE = "/root/nvidia-meeting/venv/bin/activate"
LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"


# ── Run ─────────────────────────────────────────────────────────────────────
def build_command() -> str:
    """Return the bash command line that launches the quantization run.

    The command changes into SCRIPT_DIR, activates the virtualenv, and runs
    ``scripts/huggingface_example.sh`` with the configured flags, mirroring
    stdout and stderr into LOG_FILE via ``tee``.

    ``set -o pipefail`` makes the pipeline report the example script's
    failure instead of ``tee``'s (almost always zero) exit status.
    """
    quote = shlex.quote
    continuation = " \\\n"  # trailing backslash + newline keeps the printed command readable
    script_args = (continuation + "    ").join(
        [
            f"--model {quote(MODEL)}",
            f"--quant {quote(QUANT)}",
            f"--tp {TP}",
            f"--calib {CALIB}",
            f"--kv_cache_quant {quote(KV_CACHE_QUANT)}",
            # Deliberately unquoted: EXTRA_FLAGS holds several separate flags.
            EXTRA_FLAGS,
        ]
    )
    steps = [
        "set -o pipefail &&",
        f"cd {quote(SCRIPT_DIR)} &&",
        f"source {quote(VENV_ACTIVATE)} &&",
        "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
        "bash scripts/huggingface_example.sh",
        f"    {script_args} 2>&1 | tee {quote(LOG_FILE)}",
    ]
    return continuation.join(steps)


def main() -> int:
    """Print the run summary, execute the command, and return its exit status."""
    cmd = build_command()

    print(f"Running: {QUANT} quantization on {MODEL}")
    print(f"Output: {OUTPUT_NAME}")
    print(f"Log: {LOG_FILE}")
    print(f"Command:\n{cmd}\n")

    # Run under bash explicitly: the command relies on `source` and
    # `set -o pipefail`, neither of which is guaranteed by /bin/sh
    # (which is what shell=True would use).
    return subprocess.call(["bash", "-c", cmd])


if __name__ == "__main__":
    sys.exit(main())