From d0fc5338fe7c21233f9f89ae44b61ca405b1db63 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Fri, 8 May 2026 05:50:16 +0000
Subject: [PATCH] model_opt_nvfp4_full: add use_seq_device_map, fix source for
 /bin/sh

---
 scripts/model_opt_nvfp4_full.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/scripts/model_opt_nvfp4_full.py b/scripts/model_opt_nvfp4_full.py
index 3b51d5e..564183f 100644
--- a/scripts/model_opt_nvfp4_full.py
+++ b/scripts/model_opt_nvfp4_full.py
@@ -35,9 +35,10 @@ TP = 8
 CALIB = 256
 KV_CACHE_QUANT = "fp8_cast"
 
-# No --use_seq_device_map (causes OOM on 2.8TB RAM with 782GB+ model)
-# Use gpu_max_mem_percentage to keep model on GPU, reduce CPU RAM pressure
-EXTRA_FLAGS = "--trust_remote_code --gpu_max_mem_percentage 0.9"
+# The ~3TB BF16 model cannot fit in the combined VRAM of 8×B200 (~1.4TB total).
+# Use --use_seq_device_map: the model is loaded into CPU RAM and layers are moved to the GPU for forward passes.
+# 2.8TB of host RAM is enough for the 3TB model when loading is memory-mapped.
+EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map --gpu_max_mem_percentage 0.9"
 
 # Output dir follows modelopt convention: <model>_<quant>_kv_<kv_quant>
 # We override the model name to make the strategy clear
@@ -48,7 +49,7 @@ LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"
 
 # ── Run ─────────────────────────────────────────────────────────────────────
 cmd = f"""cd {SCRIPT_DIR} && \\
-source /root/nvidia-meeting/venv/bin/activate && \\
+. /root/nvidia-meeting/venv/bin/activate && \\
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
 bash scripts/huggingface_example.sh \\
     --model {MODEL} \\