model_opt_nvfp4_full: add --use_seq_device_map; replace `source` with `.` so the venv activation works under /bin/sh

This commit is contained in:
2026-05-08 05:50:16 +00:00
parent b70a04696e
commit d0fc5338fe

View File

@@ -35,9 +35,10 @@ TP = 8
CALIB = 256
KV_CACHE_QUANT = "fp8_cast"
# No --use_seq_device_map (causes OOM on 2.8TB RAM with 782GB+ model)
# Use gpu_max_mem_percentage to keep model on GPU, reduce CPU RAM pressure
EXTRA_FLAGS = "--trust_remote_code --gpu_max_mem_percentage 0.9"
# 3TB BF16 model can't fit on 8×B200 VRAM (~1.4TB total)
# Use --use_seq_device_map: loads the model into CPU RAM and moves layers to the GPU for forward passes
# 2.8TB of RAM is enough for the 3TB model when it is loaded memory-mapped
EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map --gpu_max_mem_percentage 0.9"
# Output dir follows modelopt convention: <model>_<quant>_kv_<kv_quant>
# We override the model name to make the strategy clear
@@ -48,7 +49,7 @@ LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"
# ── Run ─────────────────────────────────────────────────────────────────────
cmd = f"""cd {SCRIPT_DIR} && \\
source /root/nvidia-meeting/venv/bin/activate && \\
. /root/nvidia-meeting/venv/bin/activate && \\
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
bash scripts/huggingface_example.sh \\
--model {MODEL} \\