diff --git a/scripts/model_opt_nvfp4_full.py b/scripts/model_opt_nvfp4_full.py
index 0dc622f..d52cee2 100644
--- a/scripts/model_opt_nvfp4_full.py
+++ b/scripts/model_opt_nvfp4_full.py
@@ -40,6 +40,9 @@ KV_CACHE_QUANT = "fp8_cast"
 # 2.8TB RAM is enough for the 3TB model (with memory-mapped loading)
 EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map --gpu_max_mem_percentage 0.7"
 
+# Gated calibration datasets (nvidia/Nemotron-Post-Training-Dataset-v2) need an HF token.
+# Never hard-code it: export HF_TOKEN in the calling shell; the run command checks it is set.
+
 # Output dir follows modelopt convention: __kv_
 # We override the model name to make the strategy clear
 OUTPUT_NAME = f"DeepSeek-V4-Pro_NVFP4-{QUANT}_kv_{KV_CACHE_QUANT}"
@@ -50,6 +53,7 @@ LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"
 # ── Run ─────────────────────────────────────────────────────────────────────
 cmd = f"""cd {SCRIPT_DIR} && \\
 . /root/nvidia-meeting/venv/bin/activate && \\
+: "${{HF_TOKEN:?export HF_TOKEN before running}}" && \\
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
 bash scripts/huggingface_example.sh \\
 --model {MODEL} \\