Fix: patch load_calib_amax instead of amax property setter (can't patch readonly descriptor)

Also remove _FORCE_AMAX_CPU global — load_calib_amax patch handles it.
2026-05-09 08:04:03 +00:00
parent 94179ed9d0
commit f9bbef8e91
1 changed file with 14 additions and 36 deletions
--- a/scripts/quantize_nvfp4.py
+++ b/scripts/quantize_nvfp4.py
@@ -56,44 +56,28 @@ EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
 CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"
 AMAX_SNAPSHOT_PATH = "/root/nvidia-meeting/v4_nvfp4_amax_snapshots.pt"

-# Flag: when True, force all new _amax writes to CPU
-_FORCE_AMAX_CPU = False
-

 def apply_patches():
    """Apply runtime patches for V4 compatibility and GPU tensor safety."""

    from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module

-    # ── Patch 1: Force _amax to CPU after calibration completes ──
+    # ── Patch 1: load_calib_amax — force _amax to CPU after calibration ──
    #
-    # The _amax property setter is called by load_calib_amax() at the end of
-    # calibration. By default it stores on GPU. We patch it so that when
-    # _FORCE_AMAX_CPU is True, _amax goes to CPU instead.
-    #
-    # During calibration (before the flag is set), _amax stays on GPU for
-    # fake quantization. After calibration, we set the flag and re-call
-    # load_calib_amax() to re-populate _amax on CPU.
+    # load_calib_amax() is called by max_calibrate() after the forward loop
+    # finishes. It writes _amax to GPU by default. We patch it so _amax
+    # goes to CPU instead, preventing GPU corruption during the long wait
+    # before export.
+    orig_load_calib_amax = tq_module.TensorQuantizer.load_calib_amax

-    orig_amax_setter = tq_module.TensorQuantizer.amax.fset
+    def patched_load_calib_amax(self, *args, **kwargs):
+        orig_load_calib_amax(self, *args, **kwargs)
+        # After _amax is written, move it to CPU
+        if hasattr(self, '_amax') and self._amax is not None:
+            self._amax = self._amax.cpu()

-    def patched_amax_setter(self, value):
-        assert value is not None, "amax cannot be set to None."
-        if not isinstance(value, torch.Tensor):
-            value = torch.tensor(value)
-        if not hasattr(self, "_amax"):
-            if _FORCE_AMAX_CPU:
-                self.register_buffer("_amax", value.clone().detach().cpu())
-            else:
-                self.register_buffer("_amax", value.clone().detach())
-        else:
-            if self._amax.shape != value.shape:
-                raise RuntimeError("Changing shape when setting amax is not allowed.")
-            target = self._amax.cpu() if _FORCE_AMAX_CPU else self._amax
-            self._amax.data.copy_(value.clone().detach().to(target.device))
-
-    tq_module.TensorQuantizer.amax.fset = patched_amax_setter
-    print("✓ Patched TensorQuantizer.amax setter (CPU mode controlled by _FORCE_AMAX_CPU)")
+    tq_module.TensorQuantizer.load_calib_amax = patched_load_calib_amax
+    print("✓ Patched TensorQuantizer.load_calib_amax (force _amax to CPU)")

    # ── Patch 2: export_amax — CPU safety ──
    # If any _amax is still on GPU at export time, move it before reading.
@@ -342,10 +326,7 @@ def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path,
    # After snapshotting, force remaining GPU tensors to CPU too
    force_all_amax_to_cpu(model)

-    # ── Enable CPU mode for any future amax writes ──
-    _FORCE_AMAX_CPU = True
-
-    # ── Free GPU memory ──
+    # ── Free GPU memory (amax tensors were moved to CPU above) ──
    torch.cuda.empty_cache()
    gc.collect()

@@ -402,9 +383,6 @@ def run_export(model, tokenizer, model_path, export_dir, amax_snapshot_path=None

 def run_export_only(calib_save_path, amax_snapshot_path, model_path, export_dir):
    """Load saved calibration state and run export only."""
-    global _FORCE_AMAX_CPU
-    _FORCE_AMAX_CPU = True  # Force CPU for any amax writes
-
    os.chdir(EXAMPLE_DIR)
    sys.path.insert(0, EXAMPLE_DIR)