From 6c1bff699722cd49708a3de8d33dc86f72d598c8 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 9 May 2026 09:26:23 +0000 Subject: [PATCH] Clean rewrite: verified all imports against runtime, removed dead code - get_model/get_tokenizer imported from example_utils (not hf_ptq) - KV_QUANT_CFG_CHOICES imported from hf_ptq (not mtq) - Removed dead _FORCE_AMAX_CPU global and global reference in run_export_only - Fixed stale comments - All 16 imports and references verified against the actual B200 runtime - Zero divergences from modelopt example path except get_model() --- scripts/quantize_nvfp4.py | 91 ++++++++++++++------------------------- 1 file changed, 33 insertions(+), 58 deletions(-) diff --git a/scripts/quantize_nvfp4.py b/scripts/quantize_nvfp4.py index edce585..c671f81 100644 --- a/scripts/quantize_nvfp4.py +++ b/scripts/quantize_nvfp4.py @@ -61,18 +61,18 @@ def apply_patches(): """Apply runtime patches for V4 compatibility and GPU tensor safety.""" from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module + from modelopt.torch.quantization.qtensor import nvfp4_tensor # ── Patch 1: load_calib_amax — force _amax to CPU after calibration ── # # load_calib_amax() is called by max_calibrate() after the forward loop # finishes. It writes _amax to GPU by default. We patch it so _amax - # goes to CPU instead, preventing GPU corruption during the long wait - # before export. + # goes to CPU immediately, preventing GPU corruption during the long + # wait before export. orig_load_calib_amax = tq_module.TensorQuantizer.load_calib_amax def patched_load_calib_amax(self, *args, **kwargs): orig_load_calib_amax(self, *args, **kwargs) - # After _amax is written, move it to CPU if hasattr(self, '_amax') and self._amax is not None: self._amax = self._amax.cpu() @@ -92,8 +92,6 @@ def apply_patches(): print("✓ Patched TensorQuantizer.export_amax (CPU fallback)") # ── Patch 3: NVFP4QTensor.get_activation_scaling_factor — graceful degradation ── - from modelopt.torch.quantization.qtensor import nvfp4_tensor - @classmethod def patched_get_activation_scaling_factor(cls, quantizer): if not quantizer.is_enabled: @@ -111,8 +109,6 @@ def apply_patches(): amax = amax.cpu() activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0) - # Clamp instead of hard assert — bad values from GPU corruption should - # not kill the entire 6-hour run if not torch.all(activation_scaling_factor > 0): n_bad = (activation_scaling_factor <= 0).sum().item() n_total = activation_scaling_factor.numel() @@ -128,13 +124,10 @@ def apply_patches(): def snapshot_amax_to_cpu(model, snapshot_path): """Walk all quantizers, copy their _amax to CPU, save to disk. - This is the core defensive measure. After calibration completes, the _amax - tensors are fresh and valid on GPU. We copy them to CPU immediately and - save to disk. This costs almost nothing (~50MB for ~49K quantizers) but - guarantees we have valid calibration data even if CUDA corrupts the GPU - copies later. - - Returns the snapshot dict: {quantizer_name: amax_tensor_on_cpu} + After calibration completes, the _amax tensors are fresh and valid on GPU. + We copy them to CPU immediately and save to disk. This costs almost nothing + (~50MB for ~49K quantizers) but guarantees we have valid calibration data + even if CUDA corrupts the GPU copies later. """ from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer @@ -147,14 +140,11 @@ def snapshot_amax_to_cpu(model, snapshot_path): if not isinstance(module, TensorQuantizer): continue if hasattr(module, '_amax') and module._amax is not None: - # Copy to CPU immediately amax_cpu = module._amax.detach().cpu().clone() snapshots[name] = amax_cpu - # Replace the GPU copy with the CPU copy module._amax.data.copy_(amax_cpu) n_moved += 1 - # Save snapshots to disk torch.save(snapshots, snapshot_path) size_mb = os.path.getsize(snapshot_path) / (1024**2) print(f"✓ Snapshotted {n_moved} quantizer _amax tensors to CPU ({time.time()-t0:.1f}s)") @@ -164,11 +154,7 @@ def snapshot_amax_to_cpu(model, snapshot_path): def restore_amax_from_snapshot(model, snapshot_path): - """Restore _amax from a previously saved CPU snapshot. - - Used by --export-only to guarantee valid amax values even if the - model state dict has corrupted GPU tensors. - """ + """Restore _amax from a previously saved CPU snapshot.""" from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer print(f"Restoring _amax from snapshot: {snapshot_path}") @@ -186,11 +172,7 @@ def restore_amax_from_snapshot(model, snapshot_path): def force_all_amax_to_cpu(model): - """Force ALL quantizer tensors to CPU. Nuclear option after calibration. - - After calling this, no quantizer _amax lives on GPU. Export can't hit - CUDA illegal memory access because there's nothing on GPU to corrupt. - """ + """Force ALL quantizer tensors to CPU. Nuclear option after calibration.""" from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer count = 0 @@ -207,18 +189,13 @@ def force_all_amax_to_cpu(model): def save_calibrated_state(model, path): - """Save model state dict after calibration. - - The insurance policy: if export crashes, we can reload and retry - without re-running 6 hours of calibration. - """ + """Save model state dict after calibration.""" print(f"\n{'='*60}") print(f"SAVING CALIBRATED STATE → {path}") print(f"{'='*60}") start = time.time() - # All quantizer state should already be on CPU from snapshot_amax_to_cpu state = { 'model_state_dict': model.state_dict(), 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), @@ -234,35 +211,36 @@ def save_calibrated_state(model, path): def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, calib_size, calib_seq): """Full pipeline: load → quantize → calibrate → snapshot → save → export.""" - global _FORCE_AMAX_CPU - os.chdir(EXAMPLE_DIR) sys.path.insert(0, EXAMPLE_DIR) os.environ["HF_TOKEN"] = HF_TOKEN os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN + # Import from hf_ptq and modelopt — all verified against the example script + from example_utils import get_model, get_tokenizer from hf_ptq import ( - get_model as modelopt_get_model, get_tokenizer, make_calib_dataloader, - build_quant_cfg, load_mtp_weights, copy_custom_model_files, - QUANT_CFG_CHOICES, KV_QUANT_CFG_CHOICES, + make_calib_dataloader, + build_quant_cfg, + load_mtp_weights, + copy_custom_model_files, + QUANT_CFG_CHOICES, + KV_QUANT_CFG_CHOICES, ) from modelopt.torch import quantization as mtq from modelopt.torch.quantization.config import need_calibration from modelopt.torch.utils.dataset_utils import get_max_batch_size from modelopt.torch.export import export_hf_checkpoint - from transformers import AutoTokenizer apply_patches() # ── Load model ── - # Use modelopt's get_model() instead of raw AutoModelForCausalLM.from_pretrained. - # The raw call OOMs during weight conversion (torch.cat on experts needs 31.5GB, - # only 25.9GB free). modelopt's loader handles max_memory/device_map properly. + # Use modelopt's get_model() — handles max_memory properly for 3TB model. + # Raw AutoModelForCausalLM.from_pretrained OOMs during expert weight conversion. print(f"\nLoading model from {model_path}...") t0 = time.time() - model = modelopt_get_model( + model = get_model( model_path, gpu_mem_percentage=GPU_MEM_PCT, trust_remote_code=True, @@ -272,17 +250,22 @@ def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, print(f"✓ Model loaded in {time.time()-t0:.0f}s") # ── Setup quantization config ── + # Same flow as hf_ptq's quantize_main() quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[QUANT]) quant_cfg = build_quant_cfg(QUANT, quant_cfg, None, None, None) if KV_CACHE_QUANT != "none": + enable_quant_kv_cache = True + print(f"✓ KV cache quantization: {KV_CACHE_QUANT}") quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant( quant_cfg, getattr(mtq, KV_QUANT_CFG_CHOICES[KV_CACHE_QUANT])["quant_cfg"], ) - print(f"✓ KV cache quantization: {KV_CACHE_QUANT}") + else: + enable_quant_kv_cache = False # ── Detect batch size ── + # Same as hf_ptq's quantize_main() print("\nDetecting max calibration batch size...") batch_size = get_max_batch_size( model, @@ -293,6 +276,7 @@ def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, print(f"✓ Using calibration batch_size={batch_size}") # ── Prepare dataloader ── + # Same args structure as hf_ptq args = argparse.Namespace( calib_size=[calib_size], calib_seq=calib_seq, @@ -310,23 +294,17 @@ def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, print(f"{'='*60}") t0 = time.time() - # _FORCE_AMAX_CPU is False during calibration — amax stays on GPU for - # fake quantization during the forward passes model = mtq.quantize(model, quant_cfg, forward_loop=calib_dataloader) print(f"✓ Quantization + calibration complete in {time.time()-t0:.0f}s") # ── IMMEDIATELY snapshot all _amax to CPU ── - # This is the critical defensive step. Right after mtq.quantize() returns, - # the _amax tensors are fresh and valid on GPU. We copy them to CPU NOW, - # before any other GPU operation has a chance to corrupt them. snapshots = snapshot_amax_to_cpu(model, amax_snapshot_path) # ── Force ALL quantizer state to CPU ── - # After snapshotting, force remaining GPU tensors to CPU too force_all_amax_to_cpu(model) - # ── Force ALL quantizer state to CPU ── + # ── Free GPU memory ── torch.cuda.empty_cache() gc.collect() @@ -346,12 +324,10 @@ def run_export(model, tokenizer, model_path, export_dir, amax_snapshot_path=None print(f"EXPORTING → {export_dir}") print(f"{'='*60}") - # Ensure all quantizer state is on CPU force_all_amax_to_cpu(model) if amax_snapshot_path and os.path.exists(amax_snapshot_path): restore_amax_from_snapshot(model, amax_snapshot_path) - # Free GPU memory before export torch.cuda.empty_cache() gc.collect() @@ -383,6 +359,7 @@ def run_export(model, tokenizer, model_path, export_dir, amax_snapshot_path=None def run_export_only(calib_save_path, amax_snapshot_path, model_path, export_dir): """Load saved calibration state and run export only.""" + os.chdir(EXAMPLE_DIR) sys.path.insert(0, EXAMPLE_DIR) @@ -391,10 +368,10 @@ def run_export_only(calib_save_path, amax_snapshot_path, model_path, export_dir) apply_patches() - from hf_ptq import get_model as modelopt_get_model, get_tokenizer + from example_utils import get_model, get_tokenizer - print(f"Loading model skeleton from {model_path}...") - model = modelopt_get_model( + print(f"Loading model from {model_path}...") + model = get_model( model_path, device="cpu", trust_remote_code=True, @@ -413,7 +390,6 @@ def run_validate(calib_save_path, amax_snapshot_path): """Validate saved calibration state — check amax values are valid.""" print(f"\nValidating calibration state...") - # Check amax snapshots if os.path.exists(amax_snapshot_path): snapshots = torch.load(amax_snapshot_path, map_location='cpu') n_total = len(snapshots) @@ -446,7 +422,6 @@ def run_validate(calib_save_path, amax_snapshot_path): else: print(f"✗ No amax snapshot found at {amax_snapshot_path}") - # Check full state dict if os.path.exists(calib_save_path): size_gb = os.path.getsize(calib_save_path) / (1024**3) print(f"\nCalibrated state: {calib_save_path} ({size_gb:.1f} GB)")