deepseek-v4-quant/scripts/run_modelopt_nvfp4.sh

#!/bin/bash
# DeepSeek V4 Pro FP8 → NVFP4 via NVIDIA ModelOpt
# Working directory: /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
#   (the script changes into it itself, so it can be launched from anywhere)
#
# Prerequisites:
#   - modelopt 0.45.0+ from git: pip install "nvidia-modelopt[hf] @ git+https://github.com/NVIDIA/Model-Optimizer.git"
#   - transformers 5.8.0.dev0: pip install git+https://github.com/huggingface/transformers.git
#   - kernels: pip install -U kernels
#   - Patch modelopt: cp patches/quant_module_patched.py <venv>/lib/python3.10/site-packages/modelopt/torch/quantization/nn/modules/quant_module.py
#
# Source weights: /root/nvidia-meeting/DeepSeek-V4-Pro-FP8

# Launch ModelOpt's llm_ptq example script to quantize a checkpoint.
#
# Every setting can be overridden from the environment; when a variable is
# unset or empty the default below is used, which reproduces the original
# hard-coded run exactly:
#   PTQ_WORKDIR         directory containing scripts/huggingface_example.sh
#   PTQ_VENV            virtualenv root (its bin/activate is sourced)
#   PTQ_MODEL           value passed to --model
#   PTQ_QUANT           value passed to --quant
#   PTQ_TP              value passed to --tp
#   PTQ_CALIB           value passed to --calib
#   PTQ_KV_CACHE_QUANT  value passed to --kv_cache_quant
#
# Example:
#   PTQ_MODEL=/data/ckpt PTQ_CALIB=512 bash run_modelopt_nvfp4.sh

# Abort on the first failing command, and on a failure in any pipeline stage.
set -eo pipefail

ptq_workdir=${PTQ_WORKDIR:-/root/nvidia-meeting/modelopt-repo/examples/llm_ptq}
ptq_venv=${PTQ_VENV:-/root/nvidia-meeting/venv}
ptq_model=${PTQ_MODEL:-/root/nvidia-meeting/DeepSeek-V4-Pro-FP8}
ptq_quant=${PTQ_QUANT:-nvfp4}
ptq_tp=${PTQ_TP:-8}
ptq_calib=${PTQ_CALIB:-256}
ptq_kv_cache_quant=${PTQ_KV_CACHE_QUANT:-fp8_cast}

# The example script below is addressed by a relative path, so the working
# directory must be the llm_ptq example directory.
cd -- "${ptq_workdir}"

# shellcheck source=/dev/null
source "${ptq_venv}/bin/activate"

# nounset is enabled only after activation: the activate script is generated
# by external tooling and may reference variables that are not set.
set -u

# Arguments are kept in an array so each value stays a single word even if an
# overridden path contains spaces.
ptq_args=(
  --model "${ptq_model}"
  --quant "${ptq_quant}"
  --tp "${ptq_tp}"
  --calib "${ptq_calib}"
  --kv_cache_quant "${ptq_kv_cache_quant}"
  --trust_remote_code
  --use_seq_device_map
)

# The allocator setting is a per-command assignment, so it is placed only in
# the environment of this one invocation.
# NOTE(review): presumably chosen to limit CUDA memory fragmentation during
# calibration — confirm before changing.
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  bash scripts/huggingface_example.sh "${ptq_args[@]}"