diff --git a/scripts/quantize_nvfp4.py b/scripts/quantize_nvfp4.py index 0ddb96c..6e2e42f 100644 --- a/scripts/quantize_nvfp4.py +++ b/scripts/quantize_nvfp4.py @@ -6,19 +6,16 @@ Runs the full ModelOpt PTQ pipeline in-process (not wrapping the shell script), saves model state after calibration (so we don't lose 6 hours of work to an export crash), and patches the export path to handle stale GPU tensors. +Must be run from the modelopt example directory for imports: + cd /root/nvidia-meeting/modelopt-repo/examples/llm_ptq + python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py + Usage: # Full run (calibrate + export): - python3 scripts/quantize_nvfp4.py + python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py # Re-run export only (after a calibration save exists): - python3 scripts/quantize_nvfp4.py --export-only - -Pipeline: - 1. Load BF16 model with sequential device map - 2. Patch modelopt for V4 compatibility - 3. Quantize + calibrate (5-6 hours) - 4. SAVE model state to disk ← checkpoint so export failures don't waste calibration - 5. Export to HF safetensors + python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --export-only """ import argparse @@ -42,8 +39,8 @@ GPU_MEM_PCT = 0.7 HF_TOKEN = "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" -# Output paths -SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq" # needed for example_utils imports +# Paths +EXAMPLE_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq" EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4" CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt" @@ -51,9 +48,10 @@ CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt" def apply_patches(): """Apply runtime patches for V4 compatibility.""" - # 1. Patch quant_module.py for V4's ModuleList expert quantizers from modelopt.torch.quantization.nn import quant_module + from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module + # 1. Patch quant_module.py for V4's ModuleList expert quantizers orig_iter = quant_module._QuantFusedExperts.iter_weights_for_calibration def patched_iter_weights_for_calibration(self, **kwargs): @@ -67,21 +65,33 @@ def apply_patches(): quant_module._QuantFusedExperts.iter_weights_for_calibration = patched_iter_weights_for_calibration print("✓ Patched _QuantFusedExperts.iter_weights_for_calibration for V4 ModuleList") - # 2. Patch nvfp4_tensor.get_activation_scaling_factor to move amax to CPU first + # 2. Patch TensorQuantizer.export_amax to move _amax to CPU before reading + orig_export_amax = tq_module.TensorQuantizer.export_amax + + def patched_export_amax(self): + """Move _amax to CPU before export to prevent CUDA illegal memory access + on tensors that have been sitting in VRAM for hours during calibration.""" + if self.amax is not None and self.amax.is_cuda: + self._amax = self._amax.cpu() + return orig_export_amax(self) + + tq_module.TensorQuantizer.export_amax = patched_export_amax + print("✓ Patched TensorQuantizer.export_amax (CPU safety)") + + # 3. Patch NVFP4QTensor.get_activation_scaling_factor for graceful degradation from modelopt.torch.quantization.qtensor import nvfp4_tensor orig_get_asf = nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor @classmethod def patched_get_activation_scaling_factor(cls, quantizer): - """Move amax to CPU before export to avoid stale GPU tensor reads.""" + """Move amax to CPU before export; clamp instead of assert on bad values.""" if not quantizer.is_enabled: return None try: amax = quantizer.export_amax() except (torch.cuda.CudaError, RuntimeError) as e: - # GPU tensor is corrupted — try moving _amax to CPU first then retry print(f" WARNING: export_amax() failed ({e}), attempting CPU recovery...") if hasattr(quantizer, '_amax') and quantizer._amax is not None: quantizer._amax = quantizer._amax.cpu() @@ -90,12 +100,10 @@ def apply_patches(): if amax is None: return None - # Move to CPU for safety amax = amax.cpu() - activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0) - # Replace hard assert with warning + clamp (invalid values from GPU corruption) + # Replace hard assert with warning + clamp if not torch.all(activation_scaling_factor > 0): n_bad = (activation_scaling_factor <= 0).sum().item() n_total = activation_scaling_factor.numel() @@ -107,37 +115,25 @@ def apply_patches(): nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor = patched_get_activation_scaling_factor print("✓ Patched NVFP4QTensor.get_activation_scaling_factor (CPU safety + graceful degradation)") - # 3. Patch tensor_quantizer.export_amax to move _amax to CPU before reading - from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module - - orig_export_amax = tq_module.TensorQuantizer.export_amax - - def patched_export_amax(self): - """Move _amax to CPU before export to prevent CUDA illegal memory access.""" - if self.amax is not None and self.amax.is_cuda: - self._amax = self._amax.cpu() - return orig_export_amax(self) - - tq_module.TensorQuantizer.export_amax = patched_export_amax - print("✓ Patched TensorQuantizer.export_amax (CPU safety)") - def move_quantizers_to_cpu(model): """Move all quantizer amax tensors to CPU to prevent stale GPU reads during export.""" + from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer count = 0 for name, module in model.named_modules(): - if hasattr(module, '_amax') and module._amax is not None: - if module._amax.is_cuda: - module._amax = module._amax.cpu() - count += 1 + if isinstance(module, TensorQuantizer): + if hasattr(module, '_amax') and module._amax is not None: + if module._amax.is_cuda: + module._amax = module._amax.cpu() + count += 1 print(f"✓ Moved {count} quantizer _amax tensors to CPU") def save_calibrated_state(model, path): - """Save model state dict + quantizer metadata after calibration. + """Save model state dict after calibration. - This is the insurance policy: if export crashes, we can reload - and retry export without re-running 6 hours of calibration. + Insurance policy: if export crashes, we can reload and retry + without re-running 6 hours of calibration. """ print(f"\n{'='*60}") print(f"SAVING CALIBRATED STATE → {path}") @@ -157,29 +153,31 @@ def save_calibrated_state(model, path): size_gb = os.path.getsize(path) / (1024**3) print(f"✓ Saved calibrated state: {size_gb:.1f} GB ({time.time()-start:.0f}s)") print(f" Path: {path}") - print(f" This allows re-running export without re-calibrating.\n") + print(f" Re-run with --export-only to retry export without recalibrating.\n") -def load_calibrated_state(model, path): - """Load previously saved calibrated state into model.""" - print(f"Loading calibrated state from {path}...") - state = torch.load(path, map_location='cpu') - model.load_state_dict(state['model_state_dict']) - print(f"✓ Loaded calibrated state (saved at {state['timestamp']})") - - -def run_calibration(model_path, export_dir, calib_save_path): +def run_calibration(model_path, export_dir, calib_save_path, calib_size, calib_seq): """Full pipeline: load → quantize → calibrate → save → export.""" - # Must be in the example dir for the relative imports (example_utils, etc.) - os.chdir(SCRIPT_DIR) - sys.path.insert(0, SCRIPT_DIR) + # Must be in the example dir for relative imports (example_utils, etc.) + os.chdir(EXAMPLE_DIR) + sys.path.insert(0, EXAMPLE_DIR) - from hf_ptq import get_model, get_tokenizer, make_calib_dataloader, pre_quantize + # Set HF token for gated datasets + os.environ["HF_TOKEN"] = HF_TOKEN + os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN + + # These imports depend on the example dir being in sys.path + from hf_ptq import ( + get_model, get_tokenizer, make_calib_dataloader, + build_quant_cfg, load_mtp_weights, copy_custom_model_files, + QUANT_CFG_CHOICES, + ) from modelopt.torch import quantization as mtq - from modelopt.torch.quantization.config import need_calibration, QUANT_CFG_CHOICES + from modelopt.torch.quantization.config import need_calibration from modelopt.torch.utils.dataset_utils import get_max_batch_size - from hf_ptq import build_quant_cfg + from modelopt.torch.export import export_hf_checkpoint + from transformers import AutoModelForCausalLM, AutoTokenizer # Apply patches before loading model apply_patches() @@ -188,14 +186,6 @@ def run_calibration(model_path, export_dir, calib_save_path): print(f"\nLoading model from {model_path}...") t0 = time.time() - # Set HF token for gated datasets - os.environ["HF_TOKEN"] = HF_TOKEN - os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN - - from transformers import AutoModelForCausalLM, AutoTokenizer - from accelerate import infer_auto_device_map - - # Load with sequential device map (model doesn't fit in GPU VRAM alone) model = AutoModelForCausalLM.from_pretrained( model_path, trust_remote_code=True, @@ -203,15 +193,13 @@ def run_calibration(model_path, export_dir, calib_save_path): device_map="sequential", offload_folder="offload", ) - print(f"✓ Model loaded in {time.time()-t0:.0f}s") - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + print(f"✓ Model loaded in {time.time()-t0:.0f}s") # ── Setup quantization config ── quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[QUANT]) quant_cfg = build_quant_cfg(QUANT, quant_cfg, None, None, None) - # KV cache quantization if KV_CACHE_QUANT != "none": quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant( quant_cfg, @@ -223,27 +211,28 @@ def run_calibration(model_path, export_dir, calib_save_path): print("\nDetecting max calibration batch size...") batch_size = get_max_batch_size( model, - max_sample_length=CALIB_SEQ, + max_sample_length=calib_seq, sample_memory_usage_ratio=1.1, ) - batch_size = min(batch_size, CALIB_SIZE) + batch_size = min(batch_size, calib_size) print(f"✓ Using calibration batch_size={batch_size}") # ── Prepare dataloader ── + # Build a minimal args namespace for make_calib_dataloader + args = argparse.Namespace( + calib_size=[calib_size], + calib_seq=calib_seq, + calib_dataset="", + batch_size=batch_size, + calib_batch_size=0, + ) calib_dataloader, _ = make_calib_dataloader( - argparse.Namespace( - calib_size=[CALIB_SIZE], - calib_seq=CALIB_SEQ, - calib_dataset="", - batch_size=batch_size, - calib_batch_size=0, - ), - model, None, tokenizer, torch.device("cuda"), None, + args, model, None, tokenizer, torch.device("cuda"), None, ) # ── Quantize + Calibrate ── print(f"\n{'='*60}") - print(f"QUANTIZING: {QUANT} with {CALIB_SIZE} calibration samples") + print(f"QUANTIZING: {QUANT} with {calib_size} calibration samples") print(f"{'='*60}") t0 = time.time() @@ -261,19 +250,17 @@ def run_calibration(model_path, export_dir, calib_save_path): def run_export(model, tokenizer, model_path, export_dir): """Export the quantized model to HF safetensors format.""" from modelopt.torch.export import export_hf_checkpoint - from hf_ptq import load_mtp_weights + from hf_ptq import load_mtp_weights, copy_custom_model_files print(f"\n{'='*60}") print(f"EXPORTING → {export_dir}") print(f"{'='*60}") - # Move quantizers to CPU before export move_quantizers_to_cpu(model) t0 = time.time() try: - # Load MTP weights if present mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(model, model_path) if mtp_layer_prefixes: model._mtp_layer_prefixes = mtp_layer_prefixes @@ -284,15 +271,10 @@ def run_export(model, tokenizer, model_path, export_dir): extra_state_dict=mtp_state_dict, ) - # Save tokenizer tokenizer.save_pretrained(export_dir) - - # Copy custom model files - from hf_ptq import copy_custom_model_files copy_custom_model_files(model_path, export_dir, True) - elapsed = time.time() - t0 - print(f"\n✓ Export complete in {elapsed:.0f}s → {export_dir}") + print(f"\n✓ Export complete in {time.time()-t0:.0f}s → {export_dir}") except Exception as e: print(f"\n✗ EXPORT FAILED: {e}") @@ -303,30 +285,31 @@ def run_export(model, tokenizer, model_path, export_dir): def run_export_only(calib_save_path, model_path, export_dir): """Load previously saved calibration state and run export only.""" - os.chdir(SCRIPT_DIR) - sys.path.insert(0, SCRIPT_DIR) + os.chdir(EXAMPLE_DIR) + sys.path.insert(0, EXAMPLE_DIR) + + os.environ["HF_TOKEN"] = HF_TOKEN + os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN apply_patches() from transformers import AutoModelForCausalLM, AutoTokenizer - os.environ["HF_TOKEN"] = HF_TOKEN - os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN - - # Load a fresh model (we just need the architecture, then overlay the state) print(f"Loading model skeleton from {model_path}...") model = AutoModelForCausalLM.from_pretrained( model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, - device_map="cpu", # Don't load onto GPU yet + device_map="cpu", ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) # Load the calibrated state - load_calibrated_state(model, calib_save_path) + print(f"Loading calibrated state from {calib_save_path}...") + state = torch.load(calib_save_path, map_location='cpu') + model.load_state_dict(state['model_state_dict']) + print(f"✓ Loaded calibrated state (saved at {state['timestamp']})") - # Export run_export(model, tokenizer, model_path, export_dir) @@ -348,7 +331,8 @@ def main(): sys.exit(1) run_export_only(args.calib_save, args.model, args.export_dir) else: - run_calibration(args.model, args.export_dir, args.calib_save) + run_calibration(args.model, args.export_dir, args.calib_save, + args.calib_size, args.calib_seq) if __name__ == "__main__":