# scripts/quantize_nvfp4.py

#!/usr/bin/env python3
"""
DeepSeek V4 Pro → NVFP4 quantization — defensive edition.

This script:
1. Applies runtime patches for GPU tensor safety (before modelopt runs)
2. Calls the SAME hf_ptq.py pipeline that the shell script uses
3. After calibration, snapshots amax to CPU and saves model state

The key insight: we don't rewrite the pipeline. We let hf_ptq do its thing
with all its args, defaults, and edge cases handled correctly. We just add
our defensive patches and post-calibration saves.

Must be run from the modelopt example directory:
    cd /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py

Usage:
    # Full run (calibrate + export):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py

    # Re-run export only (after a calibration save exists):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --export-only

    # Validate saved calibration state (check amax values):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --validate-only
"""

import argparse
import gc
import os
import sys
import time

import torch

# ── Config ──────────────────────────────────────────────────────────────────

MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
QUANT = "nvfp4"
TP = 8
CALIB_SIZE = 128
CALIB_SEQ = 512
KV_CACHE_QUANT = "fp8_cast"
GPU_MEM_PCT = 0.7

# Hugging Face access token. It is read from the environment (HF_TOKEN first,
# then HUGGING_FACE_HUB_TOKEN) and is an empty string when neither is set, so
# no credential is ever stored in this file.
# NOTE(review): a literal token used to be committed on this line; treat it as
# leaked and revoke it in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN", "")

# Paths
EXAMPLE_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"
AMAX_SNAPSHOT_PATH = "/root/nvidia-meeting/v4_nvfp4_amax_snapshots.pt"


def apply_patches():
    """Apply runtime patches for V4 compatibility and GPU tensor safety.

    Root cause of all export crashes (as diagnosed by the original author):
    use_seq_device_map keeps model weights on GPU for 5+ hours during
    calibration. By export time, CUDA's memory allocator has recycled the
    underlying memory, so any read of those GPU tensors triggers
    cudaErrorIllegalAddress.

    Fix strategy: patch at the EARLIEST possible entry points to force stale GPU
    tensors to CPU before any downstream code reads them. This covers the full
    chain of execution traced through the export path:

    _process_quantized_modules
      → _export_quantized_weight (or _export_fused_experts)
        → get_weight_scaling_factor
          → get_weights_scaling_factor_from_quantizer (reads weight, _amax, global_amax)
          → NVFP4QTensor.get_weights_scaling_factor (dynamic: reduce_block_amax on weight)
        → get_weight_scaling_factor_2 (reads _amax, global_amax)
        → get_activation_scaling_factor (reads _amax) [already patched]
        → to_quantized_weight (reads weight, does .to(weight.device) on scaling factors)
        → weight.to(dtype) (reads weight)

    By forcing weight to CPU in Patch 4 (_export_quantized_weight), ALL downstream
    .to(weight.device) calls resolve to CPU. Patches 5-8 are belt-and-suspenders.

    The function takes no arguments and returns None; its only effect is to
    rebind attributes on the imported modelopt modules/classes and to print one
    confirmation line per patch.

    Raises:
        ImportError: if modelopt is not importable.
        AttributeError: if one of the patched modelopt attributes does not exist.
    """

    from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module
    from modelopt.torch.quantization.qtensor import nvfp4_tensor
    from modelopt.torch.export import quant_utils
    from modelopt.torch.quantization.utils import quantizer_attr_names as _quantizer_attr_names
    import modelopt.torch.export.unified_export_hf as uehf

    # Tensor-valued quantizer attributes that export code may read.
    quantizer_state_attrs = ('_amax', '_pre_quant_scale', 'global_amax', '_global_amax')

    def _is_cuda_tensor(value):
        """Return True when *value* is a torch.Tensor living on a CUDA device."""
        return isinstance(value, torch.Tensor) and value.is_cuda

    def _quantizer_state_to_cpu(quantizer):
        """Best-effort move of one quantizer's tensor attributes to CPU."""
        for attr in quantizer_state_attrs:
            val = getattr(quantizer, attr, None)
            if not _is_cuda_tensor(val):
                continue
            try:
                setattr(quantizer, attr, val.cpu())
            except (torch.cuda.CudaError, RuntimeError) as e:
                # Deliberately non-fatal, but no longer silent.
                print(f"  WARNING: could not move quantizer attribute {attr} to CPU ({e})")

    def _weight_to_cpu(module, weight_name, failure_context):
        """Replace a CUDA weight on *module* with a CPU Parameter; re-raise on failure."""
        weight = getattr(module, weight_name, None)
        if not _is_cuda_tensor(weight):
            return
        try:
            with torch.no_grad():
                # Keep the original requires_grad flag: the nn.Parameter default
                # (True) would flip frozen weights and raises for non-float dtypes.
                cpu_weight = torch.nn.Parameter(weight.cpu(), requires_grad=weight.requires_grad)
                setattr(module, weight_name, cpu_weight)
        except (torch.cuda.CudaError, RuntimeError) as e:
            print(f"  WARNING: weight.cpu() failed {failure_context} ({e})")
            raise

    def _install_export_patch(name, original, replacement):
        """Rebind *name* on quant_utils, and on unified_export_hf if it holds the same function."""
        setattr(quant_utils, name, replacement)
        # A `from .quant_utils import name` in unified_export_hf creates a second
        # binding that rebinding quant_utils alone does not affect. Only touch it
        # when it is exactly the function being wrapped.
        if getattr(uehf, name, None) is original:
            setattr(uehf, name, replacement)

    # ══════════════════════════════════════════════════════════════════════
    # Patch 1: load_calib_amax — force _amax to CPU immediately after calibration
    # This runs during calibration, right after each quantizer finishes.
    # ══════════════════════════════════════════════════════════════════════
    orig_load_calib_amax = tq_module.TensorQuantizer.load_calib_amax

    def patched_load_calib_amax(self, *args, **kwargs):
        result = orig_load_calib_amax(self, *args, **kwargs)
        if getattr(self, '_amax', None) is not None:
            self._amax = self._amax.cpu()
        # Pass through whatever the wrapped method returned.
        return result

    tq_module.TensorQuantizer.load_calib_amax = patched_load_calib_amax
    print("✓ Patch 1: TensorQuantizer.load_calib_amax → force _amax to CPU")

    # ══════════════════════════════════════════════════════════════════════
    # Patch 2: export_amax — CPU safety net at export time
    # ══════════════════════════════════════════════════════════════════════
    orig_export_amax = tq_module.TensorQuantizer.export_amax

    def patched_export_amax(self):
        if _is_cuda_tensor(getattr(self, '_amax', None)):
            self._amax = self._amax.cpu()
        return orig_export_amax(self)

    tq_module.TensorQuantizer.export_amax = patched_export_amax
    print("✓ Patch 2: TensorQuantizer.export_amax → CPU fallback")

    # ══════════════════════════════════════════════════════════════════════
    # Patch 3: get_activation_scaling_factor — CPU + clamp
    # ══════════════════════════════════════════════════════════════════════
    @classmethod
    def patched_get_activation_scaling_factor(cls, quantizer):
        if not quantizer.is_enabled:
            return None
        try:
            amax = quantizer.export_amax()
        except (torch.cuda.CudaError, RuntimeError) as e:
            print(f"  WARNING: export_amax() failed ({e}), attempting CPU recovery...")
            if hasattr(quantizer, '_amax') and quantizer._amax is not None:
                quantizer._amax = quantizer._amax.cpu()
            amax = quantizer.export_amax()

        if amax is None:
            return None
        amax = amax.cpu()
        activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0)

        # Non-positive factors are clamped to the smallest positive float32
        # instead of aborting the export.
        if not torch.all(activation_scaling_factor > 0):
            n_bad = (activation_scaling_factor <= 0).sum().item()
            n_total = activation_scaling_factor.numel()
            print(f"  WARNING: {n_bad}/{n_total} activation scaling factors <= 0, clamping")
            activation_scaling_factor = activation_scaling_factor.clamp(min=torch.finfo(torch.float32).tiny)

        return activation_scaling_factor

    nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor = patched_get_activation_scaling_factor
    print("✓ Patch 3: NVFP4QTensor.get_activation_scaling_factor → CPU + clamp")

    # ══════════════════════════════════════════════════════════════════════
    # Patch 4: _export_quantized_weight — THE KEY PATCH
    #
    # This is the entry point for exporting each quantized module. It reads
    # `weight = getattr(sub_module, weight_name)` which is on a stale GPU.
    # By moving weight to CPU right here, ALL downstream functions are safe:
    #   - get_weight_scaling_factor: weight.device is now CPU
    #   - get_weights_scaling_factor: operates on CPU weight
    #   - to_quantized_weight: .to(weight.device) stays on CPU
    #   - weight.to(dtype): CPU cast
    # We also force all quantizer state to CPU for the same reason.
    # ══════════════════════════════════════════════════════════════════════
    orig_export_quantized_weight = uehf._export_quantized_weight

    def patched_export_quantized_weight(sub_module, dtype, weight_name="weight"):
        # Move weight to CPU (stale GPU → safe CPU); a failure here is fatal.
        _weight_to_cpu(sub_module, weight_name, f"for {weight_name}")

        # Force all quantizer state to CPU
        qattrs = _quantizer_attr_names(weight_name)
        for qattr in (qattrs.weight_quantizer, qattrs.input_quantizer, qattrs.output_quantizer):
            if not qattr:
                continue
            quantizer = getattr(sub_module, qattr, None)
            if quantizer is None:
                continue
            _quantizer_state_to_cpu(quantizer)
            # Handle SequentialQuantizer (W4A8 path)
            if hasattr(quantizer, 'quantizers'):
                for sub_q in quantizer.quantizers:
                    _quantizer_state_to_cpu(sub_q)

        return orig_export_quantized_weight(sub_module, dtype, weight_name)

    uehf._export_quantized_weight = patched_export_quantized_weight
    print("✓ Patch 4: _export_quantized_weight → force weight + quantizer state to CPU")

    # ══════════════════════════════════════════════════════════════════════
    # Patch 5: _export_fused_experts — same treatment for MoE expert weights
    # DeepseekV4Experts go through this different code path.
    # ══════════════════════════════════════════════════════════════════════
    orig_export_fused_experts = uehf._export_fused_experts

    def patched_export_fused_experts(sub_module, dtype):
        # Walk every (sub)module and rebind only the tensors it owns directly.
        # Recursive named_parameters()/named_buffers() yield dotted names, and
        # nn.Module rejects those in setattr/register_buffer with a KeyError.
        for mod in list(sub_module.modules()):
            for name, param in list(mod.named_parameters(recurse=False)):
                if not _is_cuda_tensor(param):
                    continue
                try:
                    with torch.no_grad():
                        setattr(mod, name,
                                torch.nn.Parameter(param.cpu(), requires_grad=param.requires_grad))
                except (torch.cuda.CudaError, RuntimeError) as e:
                    print(f"  WARNING: could not move expert parameter {name} to CPU ({e})")
            for name, buf in list(mod.named_buffers(recurse=False)):
                if not _is_cuda_tensor(buf):
                    continue
                try:
                    # Plain assignment keeps the buffer's persistent flag;
                    # register_buffer() would reset it to persistent.
                    setattr(mod, name, buf.cpu())
                except (torch.cuda.CudaError, RuntimeError) as e:
                    print(f"  WARNING: could not move expert buffer {name} to CPU ({e})")
            # Also covers quantizer tensors stored as plain attributes.
            _quantizer_state_to_cpu(mod)
        return orig_export_fused_experts(sub_module, dtype)

    uehf._export_fused_experts = patched_export_fused_experts
    print("✓ Patch 5: _export_fused_experts → force expert weights + quantizer state to CPU")

    # ══════════════════════════════════════════════════════════════════════
    # Patch 6: to_quantized_weight — force scaling factors to CPU
    # This does .to(weight.device) on scaling factors. With weight now on
    # CPU (Patch 4), this should be a no-op, but belt-and-suspenders.
    # ══════════════════════════════════════════════════════════════════════
    orig_to_quantized_weight = quant_utils.to_quantized_weight

    def patched_to_quantized_weight(weight, weights_scaling_factor, quantization,
                                     weights_scaling_factor2=None, block_size=None):
        if _is_cuda_tensor(weight):
            weight = weight.cpu()
        if _is_cuda_tensor(weights_scaling_factor):
            weights_scaling_factor = weights_scaling_factor.cpu()
        if _is_cuda_tensor(weights_scaling_factor2):
            weights_scaling_factor2 = weights_scaling_factor2.cpu()
        return orig_to_quantized_weight(weight, weights_scaling_factor, quantization,
                                          weights_scaling_factor2, block_size)

    _install_export_patch("to_quantized_weight", orig_to_quantized_weight,
                          patched_to_quantized_weight)
    print("✓ Patch 6: to_quantized_weight → force all tensors to CPU")

    # ══════════════════════════════════════════════════════════════════════
    # Patch 7: get_weight_scaling_factor — force weight + quantizer to CPU
    # Belt and suspenders: Patch 4 should handle this, but this is also
    # called from other code paths.
    # ══════════════════════════════════════════════════════════════════════
    orig_get_weight_scaling_factor = quant_utils.get_weight_scaling_factor

    def patched_get_weight_scaling_factor(module, weight_name="weight"):
        _weight_to_cpu(module, weight_name, "in get_weight_scaling_factor")
        weight_quantizer = getattr(module, _quantizer_attr_names(weight_name).weight_quantizer, None)
        if weight_quantizer is not None:
            _quantizer_state_to_cpu(weight_quantizer)
        return orig_get_weight_scaling_factor(module, weight_name)

    _install_export_patch("get_weight_scaling_factor", orig_get_weight_scaling_factor,
                          patched_get_weight_scaling_factor)
    print("✓ Patch 7: get_weight_scaling_factor → force weight + quantizer to CPU")

    # ══════════════════════════════════════════════════════════════════════
    # Patch 8: get_weight_scaling_factor_2 — force quantizer to CPU
    # ══════════════════════════════════════════════════════════════════════
    orig_get_weight_scaling_factor_2 = quant_utils.get_weight_scaling_factor_2

    def patched_get_weight_scaling_factor_2(module, weight_name="weight"):
        weight_quantizer = getattr(module, _quantizer_attr_names(weight_name).weight_quantizer, None)
        if weight_quantizer is not None:
            _quantizer_state_to_cpu(weight_quantizer)
        return orig_get_weight_scaling_factor_2(module, weight_name)

    _install_export_patch("get_weight_scaling_factor_2", orig_get_weight_scaling_factor_2,
                          patched_get_weight_scaling_factor_2)
    print("✓ Patch 8: get_weight_scaling_factor_2 → force quantizer to CPU")


def snapshot_amax_to_cpu(model, snapshot_path):
    """Copy every quantizer's ``_amax`` to CPU and save the copies to disk.

    Walks ``model.named_modules()``, and for each ``TensorQuantizer`` whose
    ``_amax`` is set stores an independent CPU copy under the module's
    qualified name. The live model is left untouched; moving the quantizer
    state itself off the GPU is the job of ``force_all_amax_to_cpu``.

    Args:
        model: Module tree to scan for ``TensorQuantizer`` instances.
        snapshot_path: Destination file for ``torch.save``.

    Returns:
        dict mapping quantizer module name to its CPU ``_amax`` copy.
    """
    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer

    print("\nSnapshotting quantizer _amax to CPU...")
    t0 = time.time()
    snapshots = {}

    for name, module in model.named_modules():
        if not isinstance(module, TensorQuantizer):
            continue
        amax = getattr(module, '_amax', None)
        if amax is None:
            continue
        # One detached CPU copy, independent of the live tensor. The previous
        # code then copied this same data back into module._amax, which changed
        # nothing and cost an extra write to the (possibly GPU) tensor.
        snapshots[name] = amax.detach().to('cpu', copy=True)

    n_moved = len(snapshots)
    torch.save(snapshots, snapshot_path)
    size_mb = os.path.getsize(snapshot_path) / (1024**2)
    print(f"✓ Snapshotted {n_moved} quantizer _amax tensors to CPU ({time.time()-t0:.1f}s)")
    print(f"  Saved to: {snapshot_path} ({size_mb:.1f} MB)")
    return snapshots


def restore_amax_from_snapshot(model, snapshot_path):
    """Restore quantizer ``_amax`` values from a previously saved CPU snapshot.

    The snapshot is expected to be the ``{module_name: tensor}`` dict written by
    ``snapshot_amax_to_cpu``. Values are copied in place into each matching
    ``TensorQuantizer``'s existing ``_amax`` tensor, on whatever device that
    tensor lives.

    Args:
        model: Module tree whose quantizers should receive the saved values.
        snapshot_path: File produced by ``snapshot_amax_to_cpu``.

    Raises:
        RuntimeError: propagated from ``Tensor.copy_`` when a saved tensor
            cannot be copied into the live one (e.g. incompatible shapes).
    """
    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer

    print(f"Restoring _amax from snapshot: {snapshot_path}")
    # The snapshot holds only tensors keyed by strings, so the restricted
    # unpickler is sufficient and avoids executing arbitrary pickled code.
    snapshots = torch.load(snapshot_path, map_location='cpu', weights_only=True)
    n_restored = 0

    for name, module in model.named_modules():
        if not isinstance(module, TensorQuantizer) or name not in snapshots:
            continue
        current = getattr(module, '_amax', None)
        if current is None:
            # Nothing to copy into; previously an _amax of None crashed here
            # with AttributeError on `.data`.
            if hasattr(module, '_amax'):
                print(f"  WARNING: {name} has no _amax tensor to restore into, skipping")
            continue
        current.data.copy_(snapshots[name].to(current.device))
        n_restored += 1

    print(f"✓ Restored {n_restored} _amax tensors from snapshot")
    n_unused = len(snapshots) - n_restored
    if n_unused:
        print(f"  WARNING: {n_unused} snapshot entries were not applied to any quantizer")


def force_all_amax_to_cpu(model):
    """Move the tracked tensor attributes of every TensorQuantizer to CPU.

    For each ``TensorQuantizer`` found in ``model.named_modules()``, the
    attributes ``_amax``, ``_pre_quant_scale`` and ``_global_amax`` are
    replaced by CPU copies whenever they currently hold a CUDA tensor. The
    number of replaced tensors is printed; nothing is returned.
    """
    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer

    tracked_attrs = ('_amax', '_pre_quant_scale', '_global_amax')
    moved = 0

    quantizers = (
        candidate
        for _, candidate in model.named_modules()
        if isinstance(candidate, TensorQuantizer)
    )
    for quantizer in quantizers:
        for attr_name in tracked_attrs:
            if not hasattr(quantizer, attr_name):
                continue
            tensor = getattr(quantizer, attr_name)
            # isinstance() already rules out None.
            if isinstance(tensor, torch.Tensor) and tensor.is_cuda:
                setattr(quantizer, attr_name, tensor.cpu())
                moved += 1

    print(f"✓ Forced {moved} quantizer tensors to CPU")


def save_calibrated_state(model, path):
    """Write the model's state dict, plus a timestamp, to *path*.

    The saved object is a dict with two keys: ``model_state_dict`` (the result
    of ``model.state_dict()``) and ``timestamp`` (local time formatted as
    ``YYYY-MM-DD HH:MM:SS``). Progress and the resulting file size are printed.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"SAVING CALIBRATED STATE → {path}")
    print(banner)

    started_at = time.time()
    payload = dict(
        model_state_dict=model.state_dict(),
        timestamp=time.strftime('%Y-%m-%d %H:%M:%S'),
    )
    torch.save(payload, path)

    bytes_on_disk = os.path.getsize(path)
    size_gb = bytes_on_disk / (1024**3)
    elapsed = time.time() - started_at
    print(f"✓ Saved calibrated state: {size_gb:.1f} GB ({elapsed:.0f}s)")
    print(f"  Path: {path}")
    print("  Re-run with --export-only to retry export.\n")


def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, calib_size, calib_seq):
    """Full pipeline: parse args via hf_ptq → load → quantize → snapshot → save → export.

    Changes the working directory to ``EXAMPLE_DIR`` so that ``hf_ptq`` is
    importable, installs the runtime patches, builds an argument namespace with
    hf_ptq's own parser, wraps ``hf_ptq.export_quantized`` so the amax snapshot
    and calibrated state are written before export, and finally calls
    ``hf_ptq.main``.

    Args:
        model_path: Path of the BF16 checkpoint to quantize.
        export_dir: Directory the quantized checkpoint is exported to.
        calib_save_path: File the calibrated state dict is saved to.
        amax_snapshot_path: File the CPU amax snapshot is saved to.
        calib_size: Number of calibration samples.
        calib_seq: Calibration sequence length.
    """

    os.chdir(EXAMPLE_DIR)
    sys.path.insert(0, EXAMPLE_DIR)

    # Only export a token when one is configured, so an empty setting never
    # overwrites credentials already present in the environment.
    if HF_TOKEN:
        os.environ["HF_TOKEN"] = HF_TOKEN
        os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN

    import hf_ptq

    apply_patches()

    # ── Build args using hf_ptq's own parser ──
    # This guarantees ALL attributes exist with correct defaults.
    # We temporarily replace sys.argv so parse_args() sees our config.
    saved_argv = sys.argv
    sys.argv = [
        "hf_ptq.py",
        "--pyt_ckpt_path", model_path,
        "--qformat", QUANT,
        "--calib_size", str(calib_size),
        "--calib_seq", str(calib_seq),
        "--kv_cache_qformat", KV_CACHE_QUANT,
        "--inference_tensor_parallel", str(TP),
        "--export_path", export_dir,
        "--trust_remote_code",
        "--use_seq_device_map",
        "--gpu_max_mem_percentage", str(GPU_MEM_PCT),
        "--batch_size", "0",
    ]
    try:
        args = hf_ptq.parse_args()
    finally:
        # Restore the real command line even if the parser exits or raises.
        sys.argv = saved_argv

    # Apply the same post-parse conversions that hf_ptq's __main__ block does
    # (these normally run between parse_args() and main() in the original script,
    # but since we call main() directly, we have to do them ourselves)
    if isinstance(args.dataset, str):
        args.dataset = args.dataset.split(",")
    if isinstance(args.calib_size, str):
        args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
    elif isinstance(args.calib_size, int):
        # presumably only reachable if hf_ptq's parser declares an int type — verify
        args.calib_size = [args.calib_size]

    # ── Post-calibration hook ──
    # We monkey-patch export_quantized to add our defensive saves before export.
    orig_export_quantized = hf_ptq.export_quantized

    def patched_export_quantized(exp_args, full_model, language_model, model_type,
                                  tokenizer, default_padding_side, default_pad_token):
        """Wrapper that snapshots amax and saves state before calling the real export."""
        print("\n" + "="*60)
        print("POST-CALIBRATION: Snapshotting amax and saving state")
        print("="*60)

        # Snapshot amax to CPU
        snapshot_amax_to_cpu(language_model, amax_snapshot_path)

        # Force all quantizer state to CPU
        force_all_amax_to_cpu(language_model)

        # Free GPU memory: collect garbage first so tensors kept alive only by
        # reference cycles are released before the cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()

        # Save calibrated state
        save_calibrated_state(language_model, calib_save_path)

        # Now run the real export, passing its result through unchanged
        return orig_export_quantized(exp_args, full_model, language_model, model_type,
                                     tokenizer, default_padding_side, default_pad_token)

    hf_ptq.export_quantized = patched_export_quantized
    print("✓ Hooked export_quantized with amax snapshot + state save")

    # ── Run hf_ptq's full pipeline ──
    # This handles model loading, quantization, calibration, and export
    # using the exact same code path as the shell script.
    hf_ptq.main(args)


def run_export_only(calib_save_path, amax_snapshot_path, model_path, export_dir):
    """Load saved calibration state and run export only.

    Args:
        calib_save_path: File written by ``save_calibrated_state``.
        amax_snapshot_path: Optional amax snapshot file; applied when it exists.
        model_path: Path of the BF16 checkpoint the state belongs to.
        export_dir: Directory the quantized checkpoint is exported to.

    Raises:
        Exception: any failure during export is reported, together with the
            locations of the saved state, and then re-raised unchanged.
    """

    os.chdir(EXAMPLE_DIR)
    sys.path.insert(0, EXAMPLE_DIR)

    # Only export a token when one is configured, so an empty setting never
    # overwrites credentials already present in the environment.
    if HF_TOKEN:
        os.environ["HF_TOKEN"] = HF_TOKEN
        os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN

    apply_patches()

    from example_utils import get_model, get_tokenizer

    print(f"Loading model from {model_path}...")
    model = get_model(
        model_path,
        device="cpu",
        trust_remote_code=True,
    )
    tokenizer = get_tokenizer(model_path, trust_remote_code=True)

    print(f"Loading calibrated state from {calib_save_path}...")
    # NOTE(review): the model is loaded fresh and no quantization step is visible
    # here, so a strict load_state_dict may reject quantizer keys contained in
    # the saved state — confirm this path against a real calibrated checkpoint.
    state = torch.load(calib_save_path, map_location='cpu')
    model.load_state_dict(state['model_state_dict'])
    saved_at = state['timestamp']
    # load_state_dict copies values into the model's own tensors, so drop the
    # loaded dict now rather than holding a second full copy during export.
    del state
    print(f"✓ Loaded calibrated state (saved at {saved_at})")

    force_all_amax_to_cpu(model)
    if amax_snapshot_path and os.path.exists(amax_snapshot_path):
        restore_amax_from_snapshot(model, amax_snapshot_path)

    # Collect garbage first so freed tensors can actually leave the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()

    from modelopt.torch.export import export_hf_checkpoint
    from hf_ptq import load_mtp_weights, copy_custom_model_files

    print(f"\n{'='*60}")
    print(f"EXPORTING → {export_dir}")
    print(f"{'='*60}")

    t0 = time.time()
    try:
        mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(model, model_path)
        if mtp_layer_prefixes:
            model._mtp_layer_prefixes = mtp_layer_prefixes

        export_hf_checkpoint(model, export_dir=export_dir, extra_state_dict=mtp_state_dict)
        tokenizer.save_pretrained(export_dir)
        copy_custom_model_files(model_path, export_dir, True)
        print(f"\n✓ Export complete in {time.time()-t0:.0f}s → {export_dir}")
    except Exception as e:
        print(f"\n✗ EXPORT FAILED: {e}")
        # Report the paths actually used for this run, not the module defaults.
        print(f"  Calibrated state: {calib_save_path}")
        print(f"  Amax snapshots: {amax_snapshot_path}")
        raise


def run_validate(calib_save_path, amax_snapshot_path):
    """Validate saved calibration state — check amax values are valid.

    Each tensor in the amax snapshot is put into exactly one category, checked
    in this order: contains NaN, contains ±Inf, contains a negative value, is
    entirely zero, otherwise valid. A summary is printed, followed by the size
    of the calibrated-state file if it exists. Nothing is returned.

    Args:
        calib_save_path: File written by ``save_calibrated_state``; only its
            existence and size are checked.
        amax_snapshot_path: File written by ``snapshot_amax_to_cpu``.
    """
    print("\nValidating calibration state...")

    if os.path.exists(amax_snapshot_path):
        # The snapshot is a plain {name: tensor} dict, so the restricted
        # unpickler is enough and avoids running arbitrary pickled code.
        snapshots = torch.load(amax_snapshot_path, map_location='cpu', weights_only=True)
        n_total = len(snapshots)
        n_valid = n_zero = n_nan = n_inf = n_neg = 0

        for amax in snapshots.values():
            if torch.any(torch.isnan(amax)):
                n_nan += 1
            elif torch.any(torch.isinf(amax)):
                # An infinite amax is unusable as a scale; it used to be
                # counted as valid.
                n_inf += 1
            elif torch.any(amax < 0):
                n_neg += 1
            elif torch.all(amax == 0):
                n_zero += 1
            else:
                n_valid += 1

        print("\nAmax snapshot validation:")
        print(f"  Total: {n_total}  Valid: {n_valid}  Zero: {n_zero}  Neg: {n_neg}"
              f"  NaN: {n_nan}  Inf: {n_inf}")
        if n_valid == n_total:
            print(f"\n✓ All {n_total} amax snapshots are valid!")
        else:
            print(f"\n✗ {n_total - n_valid} quantizers have invalid amax!")
    else:
        print(f"✗ No amax snapshot found at {amax_snapshot_path}")

    if os.path.exists(calib_save_path):
        size_gb = os.path.getsize(calib_save_path) / (1024**3)
        print(f"\nCalibrated state: {calib_save_path} ({size_gb:.1f} GB)")
    else:
        print(f"\n✗ No calibrated state found at {calib_save_path}")


def main():
    """Parse the command line and dispatch to validate, export-only or full calibration.

    ``--validate-only`` takes precedence over ``--export-only``. In export-only
    mode the process exits with status 1 when the calibration state file does
    not exist.
    """
    cli = argparse.ArgumentParser(description="DeepSeek V4 Pro NVFP4 Quantization")

    # Mode switches
    cli.add_argument("--export-only", action="store_true",
                     help="Skip calibration, load saved state and run export only")
    cli.add_argument("--validate-only", action="store_true",
                     help="Validate saved calibration state without running anything")

    # Locations
    cli.add_argument("--model", default=MODEL, help="Path to BF16 model")
    cli.add_argument("--export-dir", default=EXPORT_DIR, help="Export output directory")
    cli.add_argument("--calib-save", default=CALIB_SAVE_PATH, help="Calibration state save path")
    cli.add_argument("--amax-snapshot", default=AMAX_SNAPSHOT_PATH, help="Amax snapshot path")

    # Calibration knobs
    cli.add_argument("--calib-size", type=int, default=CALIB_SIZE, help="Calibration samples")
    cli.add_argument("--calib-seq", type=int, default=CALIB_SEQ, help="Calibration sequence length")

    opts = cli.parse_args()

    if opts.validate_only:
        run_validate(opts.calib_save, opts.amax_snapshot)
        return

    if not opts.export_only:
        run_calibration(opts.model, opts.export_dir, opts.calib_save,
                        opts.amax_snapshot, opts.calib_size, opts.calib_seq)
        return

    # Export-only mode needs a previously saved calibration state.
    if not os.path.exists(opts.calib_save):
        print(f"ERROR: No calibration state found at {opts.calib_save}")
        sys.exit(1)
    run_export_only(opts.calib_save, opts.amax_snapshot, opts.model, opts.export_dir)


if __name__ == "__main__":
    main()