From d0fc5338fe7c21233f9f89ae44b61ca405b1db63 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 8 May 2026 05:50:16 +0000 Subject: [PATCH] model_opt_nvfp4_full: add --use_seq_device_map, replace `source` with POSIX `.` for /bin/sh --- scripts/model_opt_nvfp4_full.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/model_opt_nvfp4_full.py b/scripts/model_opt_nvfp4_full.py index 3b51d5e..564183f 100644 --- a/scripts/model_opt_nvfp4_full.py +++ b/scripts/model_opt_nvfp4_full.py @@ -35,9 +35,10 @@ TP = 8 CALIB = 256 KV_CACHE_QUANT = "fp8_cast" -# No --use_seq_device_map (causes OOM on 2.8TB RAM with 782GB+ model) -# Use gpu_max_mem_percentage to keep model on GPU, reduce CPU RAM pressure -EXTRA_FLAGS = "--trust_remote_code --gpu_max_mem_percentage 0.9" +# 3TB BF16 model can't fit on 8×B200 VRAM (~1.4TB total) +# Use --use_seq_device_map: loads model into CPU RAM, moves layers to GPU for forward passes +# 2.8TB of CPU RAM suffices for the 3TB model when loading is memory-mapped +EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map --gpu_max_mem_percentage 0.9" # Output dir follows modelopt convention: __kv_ # We override the model name to make the strategy clear @@ -48,7 +49,7 @@ LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log" # ── Run ───────────────────────────────────────────────────────────────────── cmd = f"""cd {SCRIPT_DIR} && \\ -source /root/nvidia-meeting/venv/bin/activate && \\ +. /root/nvidia-meeting/venv/bin/activate && \\ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\ bash scripts/huggingface_example.sh \\ --model {MODEL} \\