# deepseek-v4-quant/scripts/quantize_nvfp4.py

#!/usr/bin/env python3
"""
DeepSeek V4 Pro → NVFP4 quantization — defensive edition.

Runs the full ModelOpt PTQ pipeline with maximum protection against GPU tensor
corruption that crashes the export after 6 hours of calibration.

Key defense: immediately after calibration, every quantizer _amax tensor is
snapshotted to CPU. Then the model state is saved to disk. If export crashes,
the state can be reloaded and export retried without re-calibrating.

The _amax tensors are tiny (scalars and small vectors). Snapshotting ~49K of them
to CPU costs almost nothing in memory and guarantees we have valid calibration
data regardless of what CUDA does to the GPU copies afterward.

Must be run from the modelopt example directory for imports:
    cd /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py

Usage:
    # Full run (calibrate + export):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py

    # Re-run export only (after a calibration save exists):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --export-only

    # Validate saved calibration state (check amax values):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --validate-only
"""

import argparse
import copy
import gc
import os
import sys
import time
import warnings

import torch

# ── Config ──────────────────────────────────────────────────────────────────

MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
QUANT = "nvfp4"
TP = 8
CALIB_SIZE = 128
CALIB_SEQ = 512
KV_CACHE_QUANT = "fp8_cast"
GPU_MEM_PCT = 0.7

# SECURITY: credentials must never be committed to source control. The token
# is taken from the caller's environment instead; it is an empty string when
# neither variable is set. Any token that was previously hard-coded here should
# be treated as leaked and revoked.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or ""

# Paths
EXAMPLE_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"
AMAX_SNAPSHOT_PATH = "/root/nvidia-meeting/v4_nvfp4_amax_snapshots.pt"


def apply_patches():
    """Monkey-patch ModelOpt so quantizer amax handling tolerates a flaky GPU.

    Three class attributes are replaced in place. The change is process-wide
    and is never undone:

    * ``TensorQuantizer.load_calib_amax`` — runs the original, then rebinds
      ``_amax`` to a CPU copy.
    * ``TensorQuantizer.export_amax`` — moves a CUDA-resident ``_amax`` to CPU
      before delegating to the original.
    * ``NVFP4QTensor.get_activation_scaling_factor`` — retries once after a
      failed ``export_amax()``, computes the factor on CPU, and replaces every
      entry that is not strictly positive (zero, negative or NaN) with the
      smallest positive normal float32 instead of asserting.

    Each call wraps whatever implementation is currently installed, so calling
    this more than once per process stacks wrappers.
    """

    from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module

    # ── Patch 1: load_calib_amax — force _amax to CPU after calibration ──
    #
    # load_calib_amax() is called by max_calibrate() after the forward loop
    # finishes. It writes _amax to GPU by default. We patch it so _amax
    # goes to CPU instead, preventing GPU corruption during the long wait
    # before export.
    orig_load_calib_amax = tq_module.TensorQuantizer.load_calib_amax

    def patched_load_calib_amax(self, *args, **kwargs):
        result = orig_load_calib_amax(self, *args, **kwargs)
        # After _amax is written, move it to CPU
        if hasattr(self, '_amax') and self._amax is not None:
            self._amax = self._amax.cpu()
        # Stay transparent to callers: hand back whatever the original returned.
        return result

    tq_module.TensorQuantizer.load_calib_amax = patched_load_calib_amax
    print("✓ Patched TensorQuantizer.load_calib_amax (force _amax to CPU)")

    # ── Patch 2: export_amax — CPU safety ──
    # If any _amax is still on GPU at export time, move it before reading.
    orig_export_amax = tq_module.TensorQuantizer.export_amax

    def patched_export_amax(self):
        if self.amax is not None and self.amax.is_cuda:
            self._amax = self._amax.cpu()
        return orig_export_amax(self)

    tq_module.TensorQuantizer.export_amax = patched_export_amax
    print("✓ Patched TensorQuantizer.export_amax (CPU fallback)")

    # ── Patch 3: NVFP4QTensor.get_activation_scaling_factor — graceful degradation ──
    from modelopt.torch.quantization.qtensor import nvfp4_tensor

    @classmethod
    def patched_get_activation_scaling_factor(cls, quantizer):
        if not quantizer.is_enabled:
            return None
        try:
            amax = quantizer.export_amax()
        except (torch.cuda.CudaError, RuntimeError) as e:
            print(f"  WARNING: export_amax() failed ({e}), attempting CPU recovery...")
            if hasattr(quantizer, '_amax') and quantizer._amax is not None:
                quantizer._amax = quantizer._amax.cpu()
            # Single retry; a second failure propagates to the caller.
            amax = quantizer.export_amax()

        if amax is None:
            return None
        amax = amax.cpu()
        activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0)

        # Replace bad entries instead of hard-asserting — bad values from GPU
        # corruption should not kill the entire 6-hour run.
        #
        # `~(x > 0)` is deliberately used rather than `x <= 0`: every comparison
        # with NaN is False, so `x <= 0` would neither count NaN entries nor let
        # `clamp(min=...)` repair them, and NaN would flow into the export.
        bad = ~(activation_scaling_factor > 0)
        if bad.any():
            n_bad = int(bad.sum().item())
            n_total = activation_scaling_factor.numel()
            print(f"  WARNING: {n_bad}/{n_total} activation scaling factors <= 0 or NaN, clamping")
            floor = torch.full_like(
                activation_scaling_factor, torch.finfo(torch.float32).tiny
            )
            activation_scaling_factor = torch.where(bad, floor, activation_scaling_factor)

        return activation_scaling_factor

    nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor = patched_get_activation_scaling_factor
    print("✓ Patched NVFP4QTensor.get_activation_scaling_factor (CPU + clamp)")


def snapshot_amax_to_cpu(model, snapshot_path):
    """Copy every quantizer's ``_amax`` to CPU, rebind it there, and save to disk.

    For each ``TensorQuantizer`` found by ``model.named_modules()`` whose
    ``_amax`` is set, a detached CPU clone is stored in the snapshot and the
    module's ``_amax`` is rebound to a second, independent CPU clone. The
    quantizer therefore no longer references the original tensor, and later
    in-place edits of the module cannot alter the returned snapshot.

    Args:
        model: Module tree to scan for ``TensorQuantizer`` instances.
        snapshot_path: File the snapshot dict is written to with ``torch.save``.

    Returns:
        dict mapping quantizer module name → ``_amax`` tensor on CPU.
    """
    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer

    print(f"\nSnapshotting quantizer _amax to CPU...")
    t0 = time.time()
    snapshots = {}

    for name, module in model.named_modules():
        if not isinstance(module, TensorQuantizer):
            continue
        amax = getattr(module, '_amax', None)
        if amax is None:
            continue
        # Copy to CPU immediately
        amax_cpu = amax.detach().cpu().clone()
        snapshots[name] = amax_cpu
        # Actually replace the module's tensor with a CPU copy. The previous
        # `module._amax.data.copy_(amax_cpu)` only wrote the same values back
        # into the existing tensor, leaving it on its original device.
        module._amax = amax_cpu.clone()

    n_moved = len(snapshots)

    # Save snapshots to disk
    torch.save(snapshots, snapshot_path)
    size_mb = os.path.getsize(snapshot_path) / (1024**2)
    print(f"✓ Snapshotted {n_moved} quantizer _amax tensors to CPU ({time.time()-t0:.1f}s)")
    print(f"  Saved to: {snapshot_path} ({size_mb:.1f} MB)")

    return snapshots


def restore_amax_from_snapshot(model, snapshot_path):
    """Overwrite quantizer ``_amax`` values in place from a saved CPU snapshot.

    Only ``TensorQuantizer`` modules that already hold a non-``None`` ``_amax``
    and whose name appears in the snapshot are updated; the saved values are
    copied into the existing tensor on whatever device it currently lives on.
    Snapshot entries that could not be applied are reported with a warning
    rather than being dropped silently.

    Args:
        model: Module tree to scan for ``TensorQuantizer`` instances.
        snapshot_path: File previously written by ``torch.save`` holding a
            ``{module_name: amax_tensor}`` dict.
    """
    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer

    print(f"Restoring _amax from snapshot: {snapshot_path}")
    snapshots = torch.load(snapshot_path, map_location='cpu')
    n_restored = 0

    for name, module in model.named_modules():
        if not isinstance(module, TensorQuantizer):
            continue
        saved = snapshots.get(name)
        # `hasattr` alone is not enough: `_amax` may exist but be None, in which
        # case `.data` would raise AttributeError.
        current = getattr(module, '_amax', None)
        if saved is None or current is None:
            continue
        current.data.copy_(saved.to(current.device))
        n_restored += 1

    print(f"✓ Restored {n_restored} _amax tensors from snapshot")

    # Module names are unique, so every restore consumed a distinct entry.
    n_unmatched = len(snapshots) - n_restored
    if n_unmatched:
        print(f"  WARNING: {n_unmatched}/{len(snapshots)} snapshot entries were not restored "
              f"(no matching quantizer with an existing _amax)")


def force_all_amax_to_cpu(model):
    """Rebind CUDA-resident quantizer tensors to CPU copies.

    For every ``TensorQuantizer`` in ``model``, the attributes ``_amax``,
    ``_pre_quant_scale`` and ``_global_amax`` are inspected; each one that is
    a tensor living on a CUDA device is replaced by its ``.cpu()`` copy. The
    number of replaced tensors is printed.
    """
    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer

    tensor_attrs = ('_amax', '_pre_quant_scale', '_global_amax')
    moved = 0

    quantizers = (
        submodule
        for _, submodule in model.named_modules()
        if isinstance(submodule, TensorQuantizer)
    )
    for quantizer in quantizers:
        for attr_name in tensor_attrs:
            # Missing attributes and None both fail the isinstance test below.
            tensor = getattr(quantizer, attr_name, None)
            if not isinstance(tensor, torch.Tensor):
                continue
            if tensor.is_cuda:
                setattr(quantizer, attr_name, tensor.cpu())
                moved += 1

    print(f"✓ Forced {moved} quantizer tensors to CPU")


def save_calibrated_state(model, path):
    """Save the model's state dict (plus a timestamp) after calibration.

    The insurance policy: if export crashes, the state can be reloaded and
    export retried without re-running calibration.

    The file is written to ``<path>.tmp`` first and then moved over ``path``
    with ``os.replace``. An interrupted or failed save therefore never leaves
    a truncated file at ``path`` and never destroys a previous good
    checkpoint there; the temporary file is removed on failure.

    Args:
        model: Module whose ``state_dict()`` is saved.
        path: Destination file. Its directory must already exist.

    Raises:
        Whatever ``torch.save`` / ``os.replace`` raise; re-raised after the
        temporary file has been cleaned up.
    """
    print(f"\n{'='*60}")
    print(f"SAVING CALIBRATED STATE → {path}")
    print(f"{'='*60}")

    start = time.time()

    state = {
        'model_state_dict': model.state_dict(),
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    }

    tmp_path = f"{path}.tmp"
    try:
        torch.save(state, tmp_path)
        # Same directory as `path`, so this is a rename rather than a copy.
        os.replace(tmp_path, path)
    except BaseException:
        # Includes KeyboardInterrupt: do not leave a partial file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    size_gb = os.path.getsize(path) / (1024**3)
    print(f"✓ Saved calibrated state: {size_gb:.1f} GB ({time.time()-start:.0f}s)")
    print(f"  Path: {path}")
    print(f"  Re-run with --export-only to retry export.\n")


def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, calib_size, calib_seq):
    """Full pipeline: load → quantize → calibrate → snapshot → save → export.

    Args:
        model_path: Checkpoint handed to hf_ptq's ``get_model`` / ``get_tokenizer``.
        export_dir: Directory the exported checkpoint is written to.
        calib_save_path: File receiving the post-calibration state dict.
        amax_snapshot_path: File receiving the CPU ``_amax`` snapshot.
        calib_size: Number of calibration samples.
        calib_seq: Calibration sequence length.

    Side effects: changes the working directory to ``EXAMPLE_DIR``, prepends
    it to ``sys.path``, may set the Hugging Face token environment variables,
    and installs the runtime patches from ``apply_patches()``.
    """
    # Resolve caller-relative paths BEFORE chdir, otherwise a relative
    # --export-dir / --calib-save / --amax-snapshot would silently be
    # re-interpreted relative to EXAMPLE_DIR.
    export_dir = os.path.abspath(export_dir)
    calib_save_path = os.path.abspath(calib_save_path)
    amax_snapshot_path = os.path.abspath(amax_snapshot_path)
    # Only touch model_path when it names something on disk, so a hub-style
    # identifier is passed through unchanged.
    if os.path.exists(model_path):
        model_path = os.path.abspath(model_path)

    os.chdir(EXAMPLE_DIR)
    sys.path.insert(0, EXAMPLE_DIR)

    # Do not export an empty token into the environment.
    if HF_TOKEN:
        os.environ["HF_TOKEN"] = HF_TOKEN
        os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN

    # load_mtp_weights / copy_custom_model_files / export_hf_checkpoint are
    # only used later by run_export(); they are imported here on purpose so a
    # broken install fails now rather than after hours of calibration.
    from hf_ptq import (  # noqa: F401
        get_model as modelopt_get_model, get_tokenizer, make_calib_dataloader,
        build_quant_cfg, load_mtp_weights, copy_custom_model_files,
        QUANT_CFG_CHOICES,
    )
    from modelopt.torch import quantization as mtq
    from modelopt.torch.utils.dataset_utils import create_forward_loop, get_max_batch_size
    from modelopt.torch.export import export_hf_checkpoint  # noqa: F401

    apply_patches()

    # ── Load model ──
    # Use modelopt's get_model() instead of raw AutoModelForCausalLM.from_pretrained.
    # The raw call OOMs during weight conversion (torch.cat on experts needs 31.5GB,
    # only 25.9GB free). modelopt's loader handles max_memory/device_map properly.
    print(f"\nLoading model from {model_path}...")
    t0 = time.time()

    model = modelopt_get_model(
        model_path,
        gpu_mem_percentage=GPU_MEM_PCT,
        trust_remote_code=True,
        use_seq_device_map=True,
    )
    tokenizer = get_tokenizer(model_path, trust_remote_code=True)
    print(f"✓ Model loaded in {time.time()-t0:.0f}s")

    # ── Setup quantization config ──
    quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[QUANT])
    quant_cfg = build_quant_cfg(QUANT, quant_cfg, None, None, None)

    if KV_CACHE_QUANT != "none":
        quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
            quant_cfg,
            getattr(mtq, mtq.KV_QUANT_CFG_CHOICES[KV_CACHE_QUANT])["quant_cfg"],
        )
        print(f"✓ KV cache quantization: {KV_CACHE_QUANT}")

    # ── Detect batch size ──
    print("\nDetecting max calibration batch size...")
    batch_size = get_max_batch_size(
        model,
        max_sample_length=calib_seq,
        sample_memory_usage_ratio=1.1,
    )
    batch_size = min(batch_size, calib_size)
    print(f"✓ Using calibration batch_size={batch_size}")

    # ── Prepare dataloader ──
    args = argparse.Namespace(
        calib_size=[calib_size],
        calib_seq=calib_seq,
        calib_dataset="",
        batch_size=batch_size,
        calib_batch_size=0,
    )
    calib_dataloader, _ = make_calib_dataloader(
        args, model, None, tokenizer, torch.device("cuda"), None,
    )

    # mtq.quantize documents `forward_loop` as a callable that takes the model
    # and runs the calibration data through it. A plain dataloader is not
    # callable, so wrap it; anything already callable is passed through as-is.
    # NOTE(review): confirm against the installed ModelOpt version.
    if callable(calib_dataloader):
        forward_loop = calib_dataloader
    else:
        forward_loop = create_forward_loop(dataloader=calib_dataloader)

    # ── Quantize + Calibrate ──
    print(f"\n{'='*60}")
    print(f"QUANTIZING: {QUANT} with {calib_size} calibration samples")
    print(f"{'='*60}")
    t0 = time.time()

    model = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)

    print(f"✓ Quantization + calibration complete in {time.time()-t0:.0f}s")

    # ── IMMEDIATELY snapshot all _amax to CPU ──
    # This is the critical defensive step: capture the calibration results
    # right after mtq.quantize() returns, before any other GPU work runs.
    snapshot_amax_to_cpu(model, amax_snapshot_path)

    # ── Force ALL quantizer state to CPU ──
    # Also covers _pre_quant_scale / _global_amax, which the snapshot skips.
    force_all_amax_to_cpu(model)

    # ── Release GPU memory ──
    # Collect garbage first so tensors that just became unreachable are freed
    # before the CUDA caching allocator is asked to return its unused blocks.
    gc.collect()
    torch.cuda.empty_cache()

    # ── SAVE STATE ──
    save_calibrated_state(model, calib_save_path)

    # ── Export ──
    run_export(model, tokenizer, model_path, export_dir, amax_snapshot_path)


def run_export(model, tokenizer, model_path, export_dir, amax_snapshot_path=None,
               calib_save_path=None):
    """Export the quantized model to HF safetensors format.

    Before exporting, quantizer tensors are forced to CPU and, when a snapshot
    file exists at ``amax_snapshot_path``, ``_amax`` values are restored from it.

    Args:
        model: Quantized model to export.
        tokenizer: Tokenizer saved alongside the checkpoint.
        model_path: Source checkpoint; used for MTP weights and custom files.
        export_dir: Output directory.
        amax_snapshot_path: Optional ``_amax`` snapshot to restore from.
        calib_save_path: Optional path of the saved calibrated state. Only
            used to make the failure message accurate; defaults to
            ``CALIB_SAVE_PATH`` when omitted.

    Raises:
        Re-raises any exception from the export after printing recovery hints.
    """
    from modelopt.torch.export import export_hf_checkpoint
    from hf_ptq import load_mtp_weights, copy_custom_model_files

    print(f"\n{'='*60}")
    print(f"EXPORTING → {export_dir}")
    print(f"{'='*60}")

    # Ensure all quantizer state is on CPU
    force_all_amax_to_cpu(model)
    if amax_snapshot_path and os.path.exists(amax_snapshot_path):
        restore_amax_from_snapshot(model, amax_snapshot_path)

    # Free GPU memory before export. Garbage is collected first so the CUDA
    # caching allocator can actually release the blocks those objects held.
    gc.collect()
    torch.cuda.empty_cache()

    t0 = time.time()

    try:
        mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(model, model_path)
        if mtp_layer_prefixes:
            model._mtp_layer_prefixes = mtp_layer_prefixes

        export_hf_checkpoint(
            model,
            export_dir=export_dir,
            extra_state_dict=mtp_state_dict,
        )

        tokenizer.save_pretrained(export_dir)
        copy_custom_model_files(model_path, export_dir, True)

        print(f"\n✓ Export complete in {time.time()-t0:.0f}s → {export_dir}")

    except Exception as e:
        # Report the paths actually in use, falling back to the module
        # defaults only when the caller did not supply one.
        print(f"\n✗ EXPORT FAILED: {e}")
        print(f"  Calibrated state: {calib_save_path or CALIB_SAVE_PATH}")
        print(f"  Amax snapshots: {amax_snapshot_path or AMAX_SNAPSHOT_PATH}")
        print(f"  Re-run with --export-only to retry")
        raise


def run_export_only(calib_save_path, amax_snapshot_path, model_path, export_dir):
    """Load a saved calibration state and run the export step only.

    Args:
        calib_save_path: File holding a dict with ``model_state_dict`` and
            ``timestamp`` keys.
        amax_snapshot_path: ``_amax`` snapshot forwarded to ``run_export``.
        model_path: Checkpoint used to build the model and tokenizer.
        export_dir: Output directory for the exported checkpoint.

    Side effects: changes the working directory to ``EXAMPLE_DIR``, prepends
    it to ``sys.path``, may set the Hugging Face token environment variables,
    and installs the runtime patches from ``apply_patches()``.
    """
    # Resolve caller-relative paths BEFORE chdir: main() checks that the
    # calibration file exists relative to the original working directory, and
    # that check would be meaningless once the directory has changed.
    calib_save_path = os.path.abspath(calib_save_path)
    amax_snapshot_path = os.path.abspath(amax_snapshot_path)
    export_dir = os.path.abspath(export_dir)
    # Leave hub-style identifiers (nothing on disk) untouched.
    if os.path.exists(model_path):
        model_path = os.path.abspath(model_path)

    os.chdir(EXAMPLE_DIR)
    sys.path.insert(0, EXAMPLE_DIR)

    # Do not export an empty token into the environment.
    if HF_TOKEN:
        os.environ["HF_TOKEN"] = HF_TOKEN
        os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN

    apply_patches()

    from hf_ptq import get_model as modelopt_get_model, get_tokenizer

    print(f"Loading model skeleton from {model_path}...")
    model = modelopt_get_model(
        model_path,
        device="cpu",
        trust_remote_code=True,
    )
    tokenizer = get_tokenizer(model_path, trust_remote_code=True)

    print(f"Loading calibrated state from {calib_save_path}...")
    state = torch.load(calib_save_path, map_location='cpu')
    # NOTE(review): the model above is built from the original checkpoint and
    # no quantization step runs in this function. If it therefore contains no
    # quantizer modules, this (strict by default) load will reject the
    # quantizer entries saved after calibration, and the amax restore in
    # run_export() will match nothing. Verify this path end-to-end.
    model.load_state_dict(state['model_state_dict'])
    print(f"✓ Loaded calibrated state (saved at {state['timestamp']})")

    run_export(model, tokenizer, model_path, export_dir, amax_snapshot_path)


def run_validate(calib_save_path, amax_snapshot_path):
    """Validate saved calibration artifacts and report a verdict.

    Each snapshot entry is assigned to the first matching category:
    contains NaN → contains Inf → contains a negative value → all zeros →
    valid. The calibrated-state file is only checked for existence and size;
    it is not loaded.

    Args:
        calib_save_path: Path of the saved calibrated state.
        amax_snapshot_path: Path of the ``{name: amax_tensor}`` snapshot.

    Returns:
        ``True`` only when the snapshot exists, is non-empty, every entry is
        valid, and the calibrated-state file exists; ``False`` otherwise.
    """
    print(f"\nValidating calibration state...")

    snapshot_ok = False

    # Check amax snapshots
    if os.path.exists(amax_snapshot_path):
        snapshots = torch.load(amax_snapshot_path, map_location='cpu')
        n_total = len(snapshots)
        n_valid = n_zero = n_nan = n_inf = n_neg = 0

        for amax in snapshots.values():
            if torch.any(torch.isnan(amax)):
                n_nan += 1
            elif torch.any(torch.isinf(amax)):
                # An infinite amax is not a usable scale; previously it was
                # counted as valid.
                n_inf += 1
            elif torch.any(amax < 0):
                n_neg += 1
            elif torch.all(amax == 0):
                n_zero += 1
            else:
                n_valid += 1

        print(f"\nAmax snapshot validation:")
        print(f"  Total quantizers: {n_total}")
        print(f"  Valid: {n_valid}")
        print(f"  All zeros: {n_zero}")
        print(f"  Negative: {n_neg}")
        print(f"  NaN: {n_nan}")
        print(f"  Inf: {n_inf}")

        if n_total == 0:
            # An empty snapshot carries no calibration data at all.
            print(f"\n✗ Amax snapshot contains no entries!")
        elif n_valid == n_total:
            print(f"\n✓ All {n_total} amax snapshots are valid!")
            snapshot_ok = True
        else:
            print(f"\n✗ {n_total - n_valid} quantizers have invalid amax!")
    else:
        print(f"✗ No amax snapshot found at {amax_snapshot_path}")

    # Check full state dict
    state_exists = os.path.exists(calib_save_path)
    if state_exists:
        size_gb = os.path.getsize(calib_save_path) / (1024**3)
        print(f"\nCalibrated state: {calib_save_path} ({size_gb:.1f} GB)")
    else:
        print(f"\n✗ No calibrated state found at {calib_save_path}")

    return snapshot_ok and state_exists


def main():
    """Parse command-line flags and dispatch to the selected mode.

    ``--validate-only`` takes precedence over ``--export-only``; with neither
    flag the full calibrate-and-export pipeline runs. The process exits with
    status 1 when validation reports failure or when ``--export-only`` is
    requested but the calibration file does not exist.
    """
    parser = argparse.ArgumentParser(description="DeepSeek V4 Pro NVFP4 Quantization")
    parser.add_argument("--export-only", action="store_true",
                        help="Skip calibration, load saved state and run export only")
    parser.add_argument("--validate-only", action="store_true",
                        help="Validate saved calibration state without running anything")
    parser.add_argument("--model", default=MODEL, help="Path to BF16 model")
    parser.add_argument("--export-dir", default=EXPORT_DIR, help="Export output directory")
    parser.add_argument("--calib-save", default=CALIB_SAVE_PATH, help="Calibration state save path")
    parser.add_argument("--amax-snapshot", default=AMAX_SNAPSHOT_PATH, help="Amax snapshot path")
    parser.add_argument("--calib-size", type=int, default=CALIB_SIZE, help="Calibration samples")
    parser.add_argument("--calib-seq", type=int, default=CALIB_SEQ, help="Calibration sequence length")
    args = parser.parse_args()

    if args.validate_only:
        verdict = run_validate(args.calib_save, args.amax_snapshot)
        # Only an explicit False is a failure; a None result (a validator that
        # reports by printing only) keeps the exit status at 0.
        if verdict is False:
            sys.exit(1)
    elif args.export_only:
        if not os.path.exists(args.calib_save):
            print(f"ERROR: No calibration state found at {args.calib_save}", file=sys.stderr)
            sys.exit(1)
        run_export_only(args.calib_save, args.amax_snapshot, args.model, args.export_dir)
    else:
        run_calibration(args.model, args.export_dir, args.calib_save,
                        args.amax_snapshot, args.calib_size, args.calib_seq)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()