diff --git a/scripts/quantize_nvfp4.py b/scripts/quantize_nvfp4.py
index 0ddb96c..6e2e42f 100644
--- a/scripts/quantize_nvfp4.py
+++ b/scripts/quantize_nvfp4.py
@@ -6,19 +6,16 @@ Runs the full ModelOpt PTQ pipeline in-process (not wrapping the shell script),
 saves model state after calibration (so we don't lose 6 hours of work to an
 export crash), and patches the export path to handle stale GPU tensors.
 
+Must be run from the modelopt example directory so its helper modules (e.g. hf_ptq) can be imported:
+    cd /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
+    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py
+
 Usage:
     # Full run (calibrate + export):
-    python3 scripts/quantize_nvfp4.py
+    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py
 
     # Re-run export only (after a calibration save exists):
-    python3 scripts/quantize_nvfp4.py --export-only
-
-Pipeline:
-    1. Load BF16 model with sequential device map
-    2. Patch modelopt for V4 compatibility
-    3. Quantize + calibrate (5-6 hours)
-    4. SAVE model state to disk  ← checkpoint so export failures don't waste calibration
-    5. Export to HF safetensors
+    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --export-only
 """
 
 import argparse
@@ -42,8 +39,8 @@ GPU_MEM_PCT = 0.7
 
 HF_TOKEN = "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"
 
-# Output paths
-SCRIPT_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"  # needed for example_utils imports
+# Paths
+EXAMPLE_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
 EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
 CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"
 
@@ -51,9 +48,10 @@ CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"
 def apply_patches():
     """Apply runtime patches for V4 compatibility."""
 
-    # 1. Patch quant_module.py for V4's ModuleList expert quantizers
     from modelopt.torch.quantization.nn import quant_module
+    from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module
 
+    # 1. Patch quant_module.py for V4's ModuleList expert quantizers
     orig_iter = quant_module._QuantFusedExperts.iter_weights_for_calibration
 
     def patched_iter_weights_for_calibration(self, **kwargs):
@@ -67,21 +65,33 @@ def apply_patches():
     quant_module._QuantFusedExperts.iter_weights_for_calibration = patched_iter_weights_for_calibration
     print("✓ Patched _QuantFusedExperts.iter_weights_for_calibration for V4 ModuleList")
 
-    # 2. Patch nvfp4_tensor.get_activation_scaling_factor to move amax to CPU first
+    # 2. Patch TensorQuantizer.export_amax to move _amax to CPU before reading
+    orig_export_amax = tq_module.TensorQuantizer.export_amax
+
+    def patched_export_amax(self):
+        """Move _amax to CPU before export to prevent CUDA illegal memory access
+        on tensors that have been sitting in VRAM for hours during calibration."""
+        if self.amax is not None and self.amax.is_cuda:
+            self._amax = self._amax.cpu()
+        return orig_export_amax(self)
+
+    tq_module.TensorQuantizer.export_amax = patched_export_amax
+    print("✓ Patched TensorQuantizer.export_amax (CPU safety)")
+
+    # 3. Patch NVFP4QTensor.get_activation_scaling_factor for graceful degradation
     from modelopt.torch.quantization.qtensor import nvfp4_tensor
 
     orig_get_asf = nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor
 
     @classmethod
     def patched_get_activation_scaling_factor(cls, quantizer):
-        """Move amax to CPU before export to avoid stale GPU tensor reads."""
+        """Move amax to CPU before export; clamp instead of assert on bad values."""
         if not quantizer.is_enabled:
             return None
 
         try:
             amax = quantizer.export_amax()
         except (torch.cuda.CudaError, RuntimeError) as e:
-            # GPU tensor is corrupted — try moving _amax to CPU first then retry
             print(f"  WARNING: export_amax() failed ({e}), attempting CPU recovery...")
             if hasattr(quantizer, '_amax') and quantizer._amax is not None:
                 quantizer._amax = quantizer._amax.cpu()
@@ -90,12 +100,10 @@ def apply_patches():
         if amax is None:
             return None
 
-        # Move to CPU for safety
         amax = amax.cpu()
-
         activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0)
 
-        # Replace hard assert with warning + clamp (invalid values from GPU corruption)
+        # Replace hard assert with warning + clamp
         if not torch.all(activation_scaling_factor > 0):
             n_bad = (activation_scaling_factor <= 0).sum().item()
             n_total = activation_scaling_factor.numel()
@@ -107,37 +115,25 @@ def apply_patches():
     nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor = patched_get_activation_scaling_factor
     print("✓ Patched NVFP4QTensor.get_activation_scaling_factor (CPU safety + graceful degradation)")
 
-    # 3. Patch tensor_quantizer.export_amax to move _amax to CPU before reading
-    from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module
-
-    orig_export_amax = tq_module.TensorQuantizer.export_amax
-
-    def patched_export_amax(self):
-        """Move _amax to CPU before export to prevent CUDA illegal memory access."""
-        if self.amax is not None and self.amax.is_cuda:
-            self._amax = self._amax.cpu()
-        return orig_export_amax(self)
-
-    tq_module.TensorQuantizer.export_amax = patched_export_amax
-    print("✓ Patched TensorQuantizer.export_amax (CPU safety)")
-
 
 def move_quantizers_to_cpu(model):
-    """Move all quantizer amax tensors to CPU to prevent stale GPU reads during export."""
+    """Move every TensorQuantizer's CUDA ``_amax`` to CPU so export reads no stale GPU tensor."""
+    from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
     count = 0
-    for name, module in model.named_modules():
-        if hasattr(module, '_amax') and module._amax is not None:
-            if module._amax.is_cuda:
-                module._amax = module._amax.cpu()
-                count += 1
+    for _, module in model.named_modules():
+        if not isinstance(module, TensorQuantizer):
+            continue
+        if getattr(module, '_amax', None) is not None and module._amax.is_cuda:
+            module._amax = module._amax.cpu()
+            count += 1
     print(f"✓ Moved {count} quantizer _amax tensors to CPU")
 
 
 def save_calibrated_state(model, path):
-    """Save model state dict + quantizer metadata after calibration.
+    """Save model state dict after calibration.
 
-    This is the insurance policy: if export crashes, we can reload
-    and retry export without re-running 6 hours of calibration.
+    Insurance policy: if export crashes, we can reload and retry
+    without re-running 6 hours of calibration.
     """
     print(f"\n{'='*60}")
     print(f"SAVING CALIBRATED STATE → {path}")
@@ -157,29 +153,31 @@ def save_calibrated_state(model, path):
     size_gb = os.path.getsize(path) / (1024**3)
     print(f"✓ Saved calibrated state: {size_gb:.1f} GB ({time.time()-start:.0f}s)")
     print(f"  Path: {path}")
-    print(f"  This allows re-running export without re-calibrating.\n")
+    print(f"  Re-run with --export-only to retry export without recalibrating.\n")
 
 
-def load_calibrated_state(model, path):
-    """Load previously saved calibrated state into model."""
-    print(f"Loading calibrated state from {path}...")
-    state = torch.load(path, map_location='cpu')
-    model.load_state_dict(state['model_state_dict'])
-    print(f"✓ Loaded calibrated state (saved at {state['timestamp']})")
-
-
-def run_calibration(model_path, export_dir, calib_save_path):
+def run_calibration(model_path, export_dir, calib_save_path, calib_size, calib_seq):
     """Full pipeline: load → quantize → calibrate → save → export."""
 
-    # Must be in the example dir for the relative imports (example_utils, etc.)
-    os.chdir(SCRIPT_DIR)
-    sys.path.insert(0, SCRIPT_DIR)
+    # Must be in the example dir for relative imports (example_utils, etc.)
+    os.chdir(EXAMPLE_DIR)
+    sys.path.insert(0, EXAMPLE_DIR)
 
-    from hf_ptq import get_model, get_tokenizer, make_calib_dataloader, pre_quantize
+    # Set HF token for gated datasets
+    os.environ["HF_TOKEN"] = HF_TOKEN
+    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
+
+    # These imports depend on the example dir being in sys.path
+    from hf_ptq import (
+        get_model, get_tokenizer, make_calib_dataloader,
+        build_quant_cfg, load_mtp_weights, copy_custom_model_files,
+        QUANT_CFG_CHOICES,
+    )
     from modelopt.torch import quantization as mtq
-    from modelopt.torch.quantization.config import need_calibration, QUANT_CFG_CHOICES
+    from modelopt.torch.quantization.config import need_calibration
     from modelopt.torch.utils.dataset_utils import get_max_batch_size
-    from hf_ptq import build_quant_cfg
+    from modelopt.torch.export import export_hf_checkpoint
+    from transformers import AutoModelForCausalLM, AutoTokenizer
 
     # Apply patches before loading model
     apply_patches()
@@ -188,14 +186,6 @@ def run_calibration(model_path, export_dir, calib_save_path):
     print(f"\nLoading model from {model_path}...")
     t0 = time.time()
 
-    # Set HF token for gated datasets
-    os.environ["HF_TOKEN"] = HF_TOKEN
-    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
-
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-    from accelerate import infer_auto_device_map
-
-    # Load with sequential device map (model doesn't fit in GPU VRAM alone)
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
         trust_remote_code=True,
@@ -203,15 +193,13 @@ def run_calibration(model_path, export_dir, calib_save_path):
         device_map="sequential",
         offload_folder="offload",
     )
-    print(f"✓ Model loaded in {time.time()-t0:.0f}s")
-
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    print(f"✓ Model loaded in {time.time()-t0:.0f}s")
 
     # ── Setup quantization config ──
     quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[QUANT])
     quant_cfg = build_quant_cfg(QUANT, quant_cfg, None, None, None)
 
-    # KV cache quantization
     if KV_CACHE_QUANT != "none":
         quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
             quant_cfg,
@@ -223,27 +211,28 @@ def run_calibration(model_path, export_dir, calib_save_path):
     print("\nDetecting max calibration batch size...")
     batch_size = get_max_batch_size(
         model,
-        max_sample_length=CALIB_SEQ,
+        max_sample_length=calib_seq,
         sample_memory_usage_ratio=1.1,
     )
-    batch_size = min(batch_size, CALIB_SIZE)
+    batch_size = min(batch_size, calib_size)
     print(f"✓ Using calibration batch_size={batch_size}")
 
     # ── Prepare dataloader ──
+    # Build a minimal args namespace for make_calib_dataloader
+    args = argparse.Namespace(
+        calib_size=[calib_size],
+        calib_seq=calib_seq,
+        calib_dataset="",
+        batch_size=batch_size,
+        calib_batch_size=0,
+    )
     calib_dataloader, _ = make_calib_dataloader(
-        argparse.Namespace(
-            calib_size=[CALIB_SIZE],
-            calib_seq=CALIB_SEQ,
-            calib_dataset="",
-            batch_size=batch_size,
-            calib_batch_size=0,
-        ),
-        model, None, tokenizer, torch.device("cuda"), None,
+        args, model, None, tokenizer, torch.device("cuda"), None,
     )
 
     # ── Quantize + Calibrate ──
     print(f"\n{'='*60}")
-    print(f"QUANTIZING: {QUANT} with {CALIB_SIZE} calibration samples")
+    print(f"QUANTIZING: {QUANT} with {calib_size} calibration samples")
     print(f"{'='*60}")
     t0 = time.time()
 
@@ -261,19 +250,17 @@ def run_calibration(model_path, export_dir, calib_save_path):
 def run_export(model, tokenizer, model_path, export_dir):
     """Export the quantized model to HF safetensors format."""
     from modelopt.torch.export import export_hf_checkpoint
-    from hf_ptq import load_mtp_weights
+    from hf_ptq import load_mtp_weights, copy_custom_model_files
 
     print(f"\n{'='*60}")
     print(f"EXPORTING → {export_dir}")
     print(f"{'='*60}")
 
-    # Move quantizers to CPU before export
     move_quantizers_to_cpu(model)
 
     t0 = time.time()
 
     try:
-        # Load MTP weights if present
         mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(model, model_path)
         if mtp_layer_prefixes:
             model._mtp_layer_prefixes = mtp_layer_prefixes
@@ -284,15 +271,10 @@ def run_export(model, tokenizer, model_path, export_dir):
             extra_state_dict=mtp_state_dict,
         )
 
-        # Save tokenizer
         tokenizer.save_pretrained(export_dir)
-
-        # Copy custom model files
-        from hf_ptq import copy_custom_model_files
         copy_custom_model_files(model_path, export_dir, True)
 
-        elapsed = time.time() - t0
-        print(f"\n✓ Export complete in {elapsed:.0f}s → {export_dir}")
+        print(f"\n✓ Export complete in {time.time()-t0:.0f}s → {export_dir}")
 
     except Exception as e:
         print(f"\n✗ EXPORT FAILED: {e}")
@@ -303,30 +285,31 @@ def run_export(model, tokenizer, model_path, export_dir):
 
 def run_export_only(calib_save_path, model_path, export_dir):
-    """Load previously saved calibration state and run export only."""
-    os.chdir(SCRIPT_DIR)
-    sys.path.insert(0, SCRIPT_DIR)
+    """Rebuild the model on CPU, overlay the saved calibrated state, then export it."""
+    # run_export imports hf_ptq, which is resolved through the example directory.
+    os.chdir(EXAMPLE_DIR)
+    sys.path.insert(0, EXAMPLE_DIR)
+
+    # Publish the token under both environment variable names.
+    os.environ["HF_TOKEN"] = HF_TOKEN
+    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
 
     apply_patches()
 
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
-    os.environ["HF_TOKEN"] = HF_TOKEN
-    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
-
-    # Load a fresh model (we just need the architecture, then overlay the state)
     print(f"Loading model skeleton from {model_path}...")
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        trust_remote_code=True,
-        torch_dtype=torch.bfloat16,
-        device_map="cpu",  # Don't load onto GPU yet
-    )
+    load_kwargs = dict(trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cpu")
+    model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     # Load the calibrated state
-    load_calibrated_state(model, calib_save_path)
+    # NOTE(review): no quantizers are inserted into the skeleton in this function;
+    # confirm load_state_dict (strict by default) accepts the calibrated keys.
+    print(f"Loading calibrated state from {calib_save_path}...")
+    checkpoint = torch.load(calib_save_path, map_location='cpu')
+    model.load_state_dict(checkpoint['model_state_dict'])
+    print(f"✓ Loaded calibrated state (saved at {checkpoint['timestamp']})")
 
-    # Export
     run_export(model, tokenizer, model_path, export_dir)
 
 
@@ -348,7 +331,8 @@ def main():
             sys.exit(1)
         run_export_only(args.calib_save, args.model, args.export_dir)
     else:
-        run_calibration(args.model, args.export_dir, args.calib_save)
+        run_calibration(args.model, args.export_dir, args.calib_save,
+                        args.calib_size, args.calib_seq)
 
 
 if __name__ == "__main__":