deepseek-v4-quant/scripts/model_opt_nvfp4_experts_only.py

#!/usr/bin/env python3
"""
ModelOpt NVFP4 quantization — experts only.

Quantizes only the MoE expert weights (gate_up_proj, down_proj) to NVFP4,
leaving attention and shared MLP layers untouched. This avoids issues with
FP8 attention kernels on Blackwell (DeepGEMM unsupported, Triton finegrained
FP8 matmul shape mismatches).

Available NVFP4 quantization strategies (from modelopt huggingface_example.sh):
  - nvfp4               : Full model NVFP4 quantization
  - nvfp4_experts_only  : Only MoE expert weights (this script)
  - nvfp4_mlp_only      : Only MLP layers (experts + shared MLP)
  - nvfp4_omlp_only     : Only output + MLP layers
  - nvfp4_awq           : NVFP4 with AWQ calibration
  - nvfp4_mse           : NVFP4 with MSE calibration
  - w4a8_nvfp4_fp8      : W4A8 NVFP4 weights + FP8 activations
  - w4a8_mxfp4_fp8      : W4A8 MXFP4 weights + FP8 activations
  - nvfp4_svdquant      : NVFP4 with SVDQuant
  - nvfp4_local_hessian : NVFP4 with local Hessian calibration

Strategy: Copy this file to model_opt_nvfp4_<strategy>.py and tweak as needed.
By the end, we'll have working quantized weights for each successful strategy.

Output dir naming: DeepSeek-V4-Pro_NVFP4-<strategy>_kv_fp8_cast
"""

import subprocess
import sys
import os

# ── Config ──────────────────────────────────────────────────────────────────
# BF16 checkpoint to quantize (dequantized via scripts/dequant_fp8_to_bf16.py).
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
QUANT = "nvfp4_experts_only"  # forwarded as --quant
TP = 8  # forwarded as --tp
CALIB = 256  # forwarded as --calib
KV_CACHE_QUANT = "fp8_cast"  # forwarded as --kv_cache_quant
# Extra switches appended verbatim to the end of the command line.
EXTRA_FLAGS = " ".join(("--trust_remote_code", "--use_seq_device_map"))

# Label for this run, shaped like <model>_<quant>_kv_<kv_quant> with a custom
# model-name prefix so the strategy is obvious at a glance.
# NOTE(review): within this script the label is only printed; it is not passed
# to huggingface_example.sh, so the real output directory is whatever that
# script chooses — confirm before relying on this name.
OUTPUT_NAME = "DeepSeek-V4-Pro_NVFP4-{}_kv_{}".format(QUANT, KV_CACHE_QUANT)

# Directory holding scripts/huggingface_example.sh, and the tee'd log path.
SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
LOG_FILE = "/root/nvidia-meeting/modelopt_" + QUANT + ".log"

# ── Run ─────────────────────────────────────────────────────────────────────
# Shell pipeline: enter the ModelOpt example directory, activate the venv, and
# run huggingface_example.sh with the configured flags, mirroring the combined
# stdout/stderr stream into LOG_FILE through tee.
#
# `set -o pipefail` makes the pipeline's exit status reflect a failure of the
# quantization script. Without it the status is tee's (normally 0), so a
# failed run would be reported as success by sys.exit() below.
cmd = f"""set -o pipefail && \\
cd {SCRIPT_DIR} && \\
source /root/nvidia-meeting/venv/bin/activate && \\
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
bash scripts/huggingface_example.sh \\
    --model {MODEL} \\
    --quant {QUANT} \\
    --tp {TP} \\
    --calib {CALIB} \\
    --kv_cache_quant {KV_CACHE_QUANT} \\
    {EXTRA_FLAGS} 2>&1 | tee {LOG_FILE}"""

print(f"Running: {QUANT} quantization on {MODEL}")
print(f"Output: {OUTPUT_NAME}")
print(f"Log: {LOG_FILE}")
print(f"Command:\n{cmd}\n")

# shell=True defaults to /bin/sh, which on Debian/Ubuntu is dash and supports
# neither `source` nor `pipefail`; run the command under bash explicitly.
ret = subprocess.call(cmd, shell=True, executable="/bin/bash")

# Propagate the pipeline's exit status to the caller of this script.
sys.exit(ret)