Cleanup: nuke dead scripts and stale docs, rewrite README for full NVFP4 pipeline

2026-05-08 17:02:07 +00:00
parent 075da675dc
commit eeba101cc4
5 changed files with 31 additions and 356 deletions
--- a/scripts/model_opt_nvfp4_experts_only.py
+++ b/scripts/model_opt_nvfp4_experts_only.py
@@ -1,65 +0,0 @@
-#!/usr/bin/env python3
-"""
-ModelOpt NVFP4 quantization — experts only.
-
-Quantizes only the MoE expert weights (gate_up_proj, down_proj) to NVFP4,
-leaving attention and shared MLP layers untouched. This avoids issues with
-FP8 attention kernels on Blackwell (DeepGEMM unsupported, Triton finegrained
-FP8 matmul shape mismatches).
-
-Available NVFP4 quantization strategies (from modelopt huggingface_example.sh):
-  - nvfp4               : Full model NVFP4 quantization
-  - nvfp4_experts_only  : Only MoE expert weights (this script)
-  - nvfp4_mlp_only      : Only MLP layers (experts + shared MLP)
-  - nvfp4_omlp_only     : Only output + MLP layers
-  - nvfp4_awq           : NVFP4 with AWQ calibration
-  - nvfp4_mse           : NVFP4 with MSE calibration
-  - w4a8_nvfp4_fp8      : W4A8 NVFP4 weights + FP8 activations
-  - w4a8_mxfp4_fp8      : W4A8 MXFP4 weights + FP8 activations
-  - nvfp4_svdquant      : NVFP4 with SVDQuant
-  - nvfp4_local_hessian : NVFP4 with local Hessian calibration
-
-Strategy: Copy this file to model_opt_nvfp4_<strategy>.py and tweak as needed.
-By the end, we'll have working quantized weights for each successful strategy.
-
-Output dir naming: DeepSeek-V4-Pro_NVFP4-<strategy>_kv_fp8_cast
-"""
-
-import subprocess
-import sys
-import os
-
-# ── Config ──────────────────────────────────────────────────────────────────
-MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"  # Dequantized BF16 (from scripts/dequant_fp8_to_bf16.py)
-QUANT = "nvfp4_experts_only"
-TP = 8
-CALIB = 256
-KV_CACHE_QUANT = "fp8_cast"
-EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map"
-
-# Output dir follows modelopt convention: <model>_<quant>_kv_<kv_quant>
-# We override the model name to make the strategy clear
-OUTPUT_NAME = f"DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}"
-
-SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
-LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"
-
-# ── Run ─────────────────────────────────────────────────────────────────────
-cmd = f"""cd {SCRIPT_DIR} && \\
-source /root/nvidia-meeting/venv/bin/activate && \\
-PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
-bash scripts/huggingface_example.sh \\
-    --model {MODEL} \\
-    --quant {QUANT} \\
-    --tp {TP} \\
-    --calib {CALIB} \\
-    --kv_cache_quant {KV_CACHE_QUANT} \\
-    {EXTRA_FLAGS} 2>&1 | tee {LOG_FILE}"""
-
-print(f"Running: {QUANT} quantization on {MODEL}")
-print(f"Output: {OUTPUT_NAME}")
-print(f"Log: {LOG_FILE}")
-print(f"Command:\n{cmd}\n")
-
-ret = subprocess.call(cmd, shell=True)
-sys.exit(ret)
--- a/scripts/run_modelopt_nvfp4.sh
+++ b/scripts/run_modelopt_nvfp4.sh
@@ -1,25 +0,0 @@
-#!/bin/bash
-# DeepSeek V4 Pro FP8 → NVFP4 via NVIDIA ModelOpt
-# Run from: /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
-#
-# Prerequisites:
-#   - modelopt 0.45.0+ from git: pip install "nvidia-modelopt[hf] @ git+https://github.com/NVIDIA/Model-Optimizer.git"
-#   - transformers 5.8.0.dev0: pip install git+https://github.com/huggingface/transformers.git
-#   - kernels: pip install -U kernels
-#   - Patch modelopt: cp patches/quant_module_patched.py <venv>/lib/python3.10/site-packages/modelopt/torch/quantization/nn/modules/quant_module.py
-#
-# Source weights: /root/nvidia-meeting/DeepSeek-V4-Pro-FP8
-
-set -e
-cd /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
-source /root/nvidia-meeting/venv/bin/activate
-
-PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
-bash scripts/huggingface_example.sh \
-    --model /root/nvidia-meeting/DeepSeek-V4-Pro-FP8 \
-    --quant nvfp4 \
-    --tp 8 \
-    --calib 256 \
-    --kv_cache_quant fp8_cast \
-    --trust_remote_code \
-    --use_seq_device_map