#!/usr/bin/env python3
"""
DeepSeek V4 Pro → NVFP4 quantization.

Runs the full ModelOpt PTQ pipeline in-process (not wrapping the shell script),
saves model state after calibration (so we don't lose 6 hours of work to an
export crash), and patches the export path to handle stale GPU tensors.

The script changes into the modelopt example directory (EXAMPLE_DIR) itself so
that the example-local imports (hf_ptq) resolve. Path arguments are converted
to absolute paths before that happens, so they are interpreted relative to the
directory the script was launched from.

The Hugging Face token is read from the HF_TOKEN (or HUGGING_FACE_HUB_TOKEN)
environment variable; it is never stored in this file.

Usage:
    # Full run (calibrate + export):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py

    # Re-run export only (after a calibration save exists):
    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --export-only
"""

import argparse
import copy
import os
import sys
import time
import warnings

import torch

# ── Config ──────────────────────────────────────────────────────────────────
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
QUANT = "nvfp4"
TP = 8
CALIB_SIZE = 128
CALIB_SEQ = 512
KV_CACHE_QUANT = "fp8_cast"
GPU_MEM_PCT = 0.7
# SECURITY: a token used to be hard-coded here. Treat that token as compromised
# and revoke it; supply a fresh one through the environment instead.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN", "")

# Paths
EXAMPLE_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"


def _enter_example_dir():
    """Change into EXAMPLE_DIR and make it importable (needed for hf_ptq)."""
    os.chdir(EXAMPLE_DIR)
    if EXAMPLE_DIR not in sys.path:
        sys.path.insert(0, EXAMPLE_DIR)


def _export_hf_token():
    """Export HF_TOKEN under both environment variable names, if one is set."""
    if not HF_TOKEN:
        print("WARNING: no Hugging Face token found in HF_TOKEN / "
              "HUGGING_FACE_HUB_TOKEN; gated downloads may fail")
        return
    os.environ["HF_TOKEN"] = HF_TOKEN
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN


def compute_activation_scaling_factor(quantizer):
    """Compute the NVFP4 activation scaling factor for ``quantizer`` on CPU.

    Args:
        quantizer: object exposing ``is_enabled``, ``export_amax()`` and
            ``maxbound`` (a ModelOpt TensorQuantizer in production).

    Returns:
        ``None`` if the quantizer is disabled or has no amax; otherwise a
        float32 CPU tensor ``amax / (maxbound * 448.0)`` with non-positive
        entries clamped up to the smallest positive normal float32.

    Raises:
        RuntimeError: re-raised from ``export_amax()`` when it fails and the
            quantizer has no ``_amax`` tensor to retry with.
    """
    if not quantizer.is_enabled:
        return None

    try:
        amax = quantizer.export_amax()
    except (torch.cuda.CudaError, RuntimeError) as e:
        print(f"  WARNING: export_amax() failed ({e}), attempting CPU recovery...")
        stale = getattr(quantizer, '_amax', None)
        if stale is None:
            # Nothing to retry with: surface the real failure rather than
            # falling through to an unbound `amax`.
            raise
        quantizer._amax = stale.cpu()
        amax = quantizer.export_amax()

    if amax is None:
        return None

    amax = amax.cpu()
    activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0)

    # Replace hard assert with warning + clamp
    if not torch.all(activation_scaling_factor > 0):
        n_bad = (activation_scaling_factor <= 0).sum().item()
        n_total = activation_scaling_factor.numel()
        print(f"  WARNING: {n_bad}/{n_total} activation scaling factors <= 0, clamping to tiny")
        activation_scaling_factor = activation_scaling_factor.clamp(
            min=torch.finfo(torch.float32).tiny
        )

    return activation_scaling_factor


def apply_patches():
    """Apply runtime patches for V4 compatibility.

    Monkey-patches three ModelOpt attributes in place:
      1. ``_QuantFusedExperts.iter_weights_for_calibration``
      2. ``TensorQuantizer.export_amax``
      3. ``NVFP4QTensor.get_activation_scaling_factor``

    Intended to be called once per process; calling it again would wrap the
    already-patched ``export_amax`` a second time.
    """
    from modelopt.torch.quantization.nn import quant_module
    from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module
    from modelopt.torch.quantization.qtensor import nvfp4_tensor

    # 1. Patch quant_module.py for V4's ModuleList expert quantizers.
    # Looking the original up first raises AttributeError if the installed
    # ModelOpt no longer has the attribute being replaced.
    _orig_iter = quant_module._QuantFusedExperts.iter_weights_for_calibration

    def patched_iter_weights_for_calibration(self, **kwargs):
        """Handle V4's nn.ModuleList expert quantizers (vs singular TensorQuantizer)."""
        for name, quantizer in self.named_modules():
            if not isinstance(quantizer, quant_module.TensorQuantizer):
                continue
            if quantizer.is_enabled:
                yield name, quantizer

    quant_module._QuantFusedExperts.iter_weights_for_calibration = patched_iter_weights_for_calibration
    print("✓ Patched _QuantFusedExperts.iter_weights_for_calibration for V4 ModuleList")

    # 2. Patch TensorQuantizer.export_amax to move _amax to CPU before reading
    orig_export_amax = tq_module.TensorQuantizer.export_amax

    def patched_export_amax(self):
        """Move _amax to CPU before export to prevent CUDA illegal memory access
        on tensors that have been sitting in VRAM for hours during calibration."""
        if self.amax is not None and self.amax.is_cuda:
            self._amax = self._amax.cpu()
        return orig_export_amax(self)

    tq_module.TensorQuantizer.export_amax = patched_export_amax
    print("✓ Patched TensorQuantizer.export_amax (CPU safety)")

    # 3. Patch NVFP4QTensor.get_activation_scaling_factor for graceful degradation
    _orig_get_asf = nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor

    @classmethod
    def patched_get_activation_scaling_factor(cls, quantizer):
        """Move amax to CPU before export; clamp instead of assert on bad values."""
        return compute_activation_scaling_factor(quantizer)

    nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor = patched_get_activation_scaling_factor
    print("✓ Patched NVFP4QTensor.get_activation_scaling_factor (CPU safety + graceful degradation)")


def move_quantizers_to_cpu(model):
    """Move all quantizer amax tensors to CPU to prevent stale GPU reads during export.

    Args:
        model: module tree to scan; every TensorQuantizer whose ``_amax`` is a
            CUDA tensor gets it replaced by a CPU copy.
    """
    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer

    count = 0
    for _name, module in model.named_modules():
        if not isinstance(module, TensorQuantizer):
            continue
        amax = getattr(module, '_amax', None)
        if amax is not None and amax.is_cuda:
            module._amax = amax.cpu()
            count += 1
    print(f"✓ Moved {count} quantizer _amax tensors to CPU")


def save_calibrated_state(model, path):
    """Save model state after calibration.

    Insurance policy: if export crashes, we can reload and retry without
    re-running 6 hours of calibration.

    The saved dict holds:
      * ``model_state_dict`` - ``model.state_dict()``
      * ``modelopt_state``   - ``mto.modelopt_state(model)``; required to
        rebuild the quantized module structure on a fresh model before the
        state dict can be loaded
      * ``timestamp``        - local wall-clock time of the save

    Args:
        model: the quantized + calibrated model.
        path: destination file for ``torch.save``.
    """
    import modelopt.torch.opt as mto

    print(f"\n{'='*60}")
    print(f"SAVING CALIBRATED STATE → {path}")
    print(f"{'='*60}")
    start = time.time()

    # Move quantizers to CPU first
    move_quantizers_to_cpu(model)

    # NOTE(review): the model is loaded with device_map="sequential" and an
    # offload folder; confirm state_dict() returns real tensors (not
    # placeholders) for any weights that were offloaded to disk.
    state = {
        'model_state_dict': model.state_dict(),
        'modelopt_state': mto.modelopt_state(model),
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    }

    torch.save(state, path)
    size_gb = os.path.getsize(path) / (1024**3)
    print(f"✓ Saved calibrated state: {size_gb:.1f} GB ({time.time()-start:.0f}s)")
    print(f"  Path: {path}")
    print(f"  Re-run with --export-only to retry export without recalibrating.\n")


def run_calibration(model_path, export_dir, calib_save_path, calib_size, calib_seq):
    """Full pipeline: load → quantize → calibrate → save → export.

    Args:
        model_path: path (or hub id) of the BF16 model.
        export_dir: directory for the exported checkpoint.
        calib_save_path: where the calibrated state is saved before export.
        calib_size: number of calibration samples.
        calib_seq: calibration sequence length.
    """
    # Must be in the example dir for relative imports (example_utils, etc.)
    _enter_example_dir()

    # Set HF token for gated datasets
    _export_hf_token()

    # These imports depend on the example dir being in sys.path
    from hf_ptq import (
        make_calib_dataloader,
        build_quant_cfg,
        QUANT_CFG_CHOICES,
    )
    from modelopt.torch import quantization as mtq
    from modelopt.torch.utils.dataset_utils import create_forward_loop, get_max_batch_size
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Apply patches before loading model
    apply_patches()

    # ── Load model ──
    print(f"\nLoading model from {model_path}...")
    t0 = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map="sequential",
        offload_folder="offload",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print(f"✓ Model loaded in {time.time()-t0:.0f}s")

    # ── Setup quantization config ──
    quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[QUANT])
    quant_cfg = build_quant_cfg(QUANT, quant_cfg, None, None, None)

    if KV_CACHE_QUANT != "none":
        # NOTE(review): confirm KV_QUANT_CFG_CHOICES is exposed on `mtq` in the
        # installed ModelOpt version (it may live in the example code instead).
        quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
            quant_cfg,
            getattr(mtq, mtq.KV_QUANT_CFG_CHOICES[KV_CACHE_QUANT])["quant_cfg"],
        )
        print(f"✓ KV cache quantization: {KV_CACHE_QUANT}")

    # ── Detect batch size ──
    print("\nDetecting max calibration batch size...")
    batch_size = get_max_batch_size(
        model,
        max_sample_length=calib_seq,
        sample_memory_usage_ratio=1.1,
    )
    batch_size = min(batch_size, calib_size)
    print(f"✓ Using calibration batch_size={batch_size}")

    # ── Prepare dataloader ──
    # Build a minimal args namespace for make_calib_dataloader
    args = argparse.Namespace(
        calib_size=[calib_size],
        calib_seq=calib_seq,
        calib_dataset="",
        batch_size=batch_size,
        calib_batch_size=0,
    )
    calib_dataloader, _ = make_calib_dataloader(
        args, model, None, tokenizer, torch.device("cuda"), None,
    )

    # mtq.quantize expects forward_loop to be a callable that takes the model;
    # a DataLoader is not callable, so wrap it.
    if callable(calib_dataloader):
        forward_loop = calib_dataloader
    else:
        forward_loop = create_forward_loop(dataloader=calib_dataloader)

    # ── Quantize + Calibrate ──
    print(f"\n{'='*60}")
    print(f"QUANTIZING: {QUANT} with {calib_size} calibration samples")
    print(f"{'='*60}")
    t0 = time.time()

    model = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)

    print(f"✓ Quantization + calibration complete in {time.time()-t0:.0f}s")

    # ── SAVE STATE (the whole point of this script) ──
    save_calibrated_state(model, calib_save_path)

    # ── Export ──
    run_export(model, tokenizer, model_path, export_dir, calib_save_path)


def run_export(model, tokenizer, model_path, export_dir, calib_save_path=None):
    """Export the quantized model to HF safetensors format.

    Args:
        model: quantized + calibrated model.
        tokenizer: tokenizer saved alongside the checkpoint.
        model_path: source model path, used for MTP weights and custom files.
        export_dir: output directory.
        calib_save_path: location of the saved calibrated state, used only in
            the failure message. Defaults to CALIB_SAVE_PATH.

    Raises:
        Exception: any export failure is reported and then re-raised.
    """
    from modelopt.torch.export import export_hf_checkpoint
    from hf_ptq import load_mtp_weights, copy_custom_model_files

    if calib_save_path is None:
        calib_save_path = CALIB_SAVE_PATH

    print(f"\n{'='*60}")
    print(f"EXPORTING → {export_dir}")
    print(f"{'='*60}")

    move_quantizers_to_cpu(model)

    t0 = time.time()
    try:
        mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(model, model_path)
        if mtp_layer_prefixes:
            model._mtp_layer_prefixes = mtp_layer_prefixes

        export_hf_checkpoint(
            model,
            export_dir=export_dir,
            extra_state_dict=mtp_state_dict,
        )

        tokenizer.save_pretrained(export_dir)
        copy_custom_model_files(model_path, export_dir, True)

        print(f"\n✓ Export complete in {time.time()-t0:.0f}s → {export_dir}")

    except Exception as e:
        # Top-level boundary: report where the saved state lives, then re-raise.
        print(f"\n✗ EXPORT FAILED: {e}")
        print(f"  Calibrated state is saved at: {calib_save_path}")
        print(f"  Re-run with --export-only to retry export")
        raise


def run_export_only(calib_save_path, model_path, export_dir):
    """Load previously saved calibration state and run export only.

    Args:
        calib_save_path: file written by ``save_calibrated_state``.
        model_path: path (or hub id) of the BF16 model used as the skeleton.
        export_dir: output directory.
    """
    _enter_example_dir()
    _export_hf_token()

    apply_patches()

    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f"Loading model skeleton from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Load the calibrated state
    print(f"Loading calibrated state from {calib_save_path}...")
    # weights_only=False: the file holds ModelOpt metadata objects in addition
    # to tensors. Only load files this script wrote itself (unpickling
    # untrusted files can execute arbitrary code).
    state = torch.load(calib_save_path, map_location='cpu', weights_only=False)

    modelopt_state = state.get('modelopt_state')
    if modelopt_state is not None:
        import modelopt.torch.opt as mto

        # Rebuild the quantized module structure so the quantizer entries in
        # the saved state dict have somewhere to load into.
        model = mto.restore_from_modelopt_state(model, modelopt_state)
    else:
        print("  WARNING: save file has no 'modelopt_state' (older format); "
              "loading the state dict into an unquantized skeleton may fail")

    model.load_state_dict(state['model_state_dict'])
    print(f"✓ Loaded calibrated state (saved at {state['timestamp']})")

    run_export(model, tokenizer, model_path, export_dir, calib_save_path)


def main():
    """Parse command-line arguments and run the full pipeline or export only."""
    parser = argparse.ArgumentParser(description="DeepSeek V4 Pro NVFP4 Quantization")
    parser.add_argument("--export-only", action="store_true",
                        help="Skip calibration, load saved state and run export only")
    parser.add_argument("--model", default=MODEL, help="Path to BF16 model")
    parser.add_argument("--export-dir", default=EXPORT_DIR, help="Export output directory")
    parser.add_argument("--calib-save", default=CALIB_SAVE_PATH, help="Calibration state save path")
    parser.add_argument("--calib-size", type=int, default=CALIB_SIZE, help="Calibration samples")
    parser.add_argument("--calib-seq", type=int, default=CALIB_SEQ, help="Calibration sequence length")
    args = parser.parse_args()

    # The pipeline chdir()s into EXAMPLE_DIR, so pin user-supplied paths to the
    # launch directory first. --model is only rewritten when it names an
    # existing local path, so a hub id passes through untouched.
    calib_save = os.path.abspath(args.calib_save)
    export_dir = os.path.abspath(args.export_dir)
    model_path = os.path.abspath(args.model) if os.path.exists(args.model) else args.model

    if args.export_only:
        if not os.path.exists(calib_save):
            print(f"ERROR: No calibration state found at {args.calib_save}")
            print("Run without --export-only first to calibrate.")
            sys.exit(1)
        run_export_only(calib_save, model_path, export_dir)
    else:
        run_calibration(model_path, export_dir, calib_save,
                        args.calib_size, args.calib_seq)


if __name__ == "__main__":
    main()