Cleanup: nuke dead scripts and stale docs, rewrite README for full NVFP4 pipeline
This commit is contained in:
@@ -1,65 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ModelOpt NVFP4 quantization — experts only.
|
||||
|
||||
Quantizes only the MoE expert weights (gate_up_proj, down_proj) to NVFP4,
|
||||
leaving attention and shared MLP layers untouched. This avoids issues with
|
||||
FP8 attention kernels on Blackwell (DeepGEMM unsupported, Triton finegrained
|
||||
FP8 matmul shape mismatches).
|
||||
|
||||
Available NVFP4 quantization strategies (from modelopt huggingface_example.sh):
|
||||
- nvfp4 : Full model NVFP4 quantization
|
||||
- nvfp4_experts_only : Only MoE expert weights (this script)
|
||||
- nvfp4_mlp_only : Only MLP layers (experts + shared MLP)
|
||||
- nvfp4_omlp_only : Only output + MLP layers
|
||||
- nvfp4_awq : NVFP4 with AWQ calibration
|
||||
- nvfp4_mse : NVFP4 with MSE calibration
|
||||
- w4a8_nvfp4_fp8 : W4A8 NVFP4 weights + FP8 activations
|
||||
- w4a8_mxfp4_fp8 : W4A8 MXFP4 weights + FP8 activations
|
||||
- nvfp4_svdquant : NVFP4 with SVDQuant
|
||||
- nvfp4_local_hessian : NVFP4 with local Hessian calibration
|
||||
|
||||
Strategy: Copy this file to model_opt_nvfp4_<strategy>.py and tweak as needed.
|
||||
By the end, we'll have working quantized weights for each successful strategy.
|
||||
|
||||
Output dir naming: DeepSeek-V4-Pro_NVFP4-<strategy>_kv_fp8_cast
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────────────
# Dequantized BF16 checkpoint (from scripts/dequant_fp8_to_bf16.py)
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
QUANT = "nvfp4_experts_only"
TP = 8
CALIB = 256
KV_CACHE_QUANT = "fp8_cast"
EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map"

# Output dir follows modelopt convention: <model>_<quant>_kv_<kv_quant>
# We override the model name to make the strategy clear
# NOTE(review): OUTPUT_NAME is only printed below; it is never passed to
# huggingface_example.sh, so the real output directory is whatever that
# script chooses — confirm the two agree before relying on this name.
OUTPUT_NAME = f"DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}"

SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"

# ── Run ─────────────────────────────────────────────────────────────────────
# Shell pipeline: enter the example dir, activate the venv, run the ModelOpt
# example script, and mirror combined stdout/stderr into LOG_FILE via tee.
cmd = f"""cd {SCRIPT_DIR} && \\
source /root/nvidia-meeting/venv/bin/activate && \\
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
bash scripts/huggingface_example.sh \\
--model {MODEL} \\
--quant {QUANT} \\
--tp {TP} \\
--calib {CALIB} \\
--kv_cache_quant {KV_CACHE_QUANT} \\
{EXTRA_FLAGS} 2>&1 | tee {LOG_FILE}"""


def run_shell(command: str) -> int:
    """Run *command* under bash with ``pipefail`` and return its exit status.

    Bash is invoked explicitly for two reasons:

    * the command uses ``source``, which is a bash builtin and is not
      available in every ``/bin/sh`` (``shell=True`` would use ``/bin/sh``);
    * the command pipes into ``tee``; without ``pipefail`` the pipeline's
      status would be ``tee``'s, hiding a failure of the quantization step.

    Args:
        command: Shell source text to execute.

    Returns:
        The exit status reported by bash for the command.
    """
    return subprocess.call(["bash", "-o", "pipefail", "-c", command])


def main() -> int:
    """Print the run summary, execute the quantization command, return its status."""
    print(f"Running: {QUANT} quantization on {MODEL}")
    print(f"Output: {OUTPUT_NAME}")
    print(f"Log: {LOG_FILE}")
    print(f"Command:\n{cmd}\n")
    return run_shell(cmd)


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -1,25 +0,0 @@
|
||||
#!/bin/bash
# Quantize DeepSeek V4 Pro from FP8 to NVFP4 using NVIDIA ModelOpt.
# Working directory for the run: /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
# (the script changes into it itself).
#
# Prerequisites:
#   - modelopt 0.45.0+ from git: pip install "nvidia-modelopt[hf] @ git+https://github.com/NVIDIA/Model-Optimizer.git"
#   - transformers 5.8.0.dev0: pip install git+https://github.com/huggingface/transformers.git
#   - kernels: pip install -U kernels
#   - Patch modelopt: cp patches/quant_module_patched.py <venv>/lib/python3.10/site-packages/modelopt/torch/quantization/nn/modules/quant_module.py
#
# Source weights: /root/nvidia-meeting/DeepSeek-V4-Pro-FP8

# Abort on the first failing command (cd, venv activation, or the run itself).
set -e

root=/root/nvidia-meeting

cd "${root}/modelopt-repo/examples/llm_ptq"
source "${root}/venv/bin/activate"

# Arguments handed to the ModelOpt example script, one option per line.
ptq_args=(
    --model "${root}/DeepSeek-V4-Pro-FP8"
    --quant nvfp4
    --tp 8
    --calib 256
    --kv_cache_quant fp8_cast
    --trust_remote_code
    --use_seq_device_map
)

# The allocator setting is scoped to this one invocation only.
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
    bash scripts/huggingface_example.sh "${ptq_args[@]}"
|
||||
Reference in New Issue
Block a user