diff --git a/scripts/model_opt_nvfp4_full.py b/scripts/model_opt_nvfp4_full.py
--- a/scripts/model_opt_nvfp4_full.py
+++ b/scripts/model_opt_nvfp4_full.py
@@ -40,6 +40,7 @@ KV_CACHE_QUANT = "fp8_cast"
 # 2.8TB RAM is enough for the 3TB model (with memory-mapped loading)
 EXTRA_FLAGS = "--trust_remote_code --use_seq_device_map --gpu_max_mem_percentage 0.7"
 
-# HF token for gated calibration datasets (nvidia/Nemotrons-Post-Training-Dataset-v2)
-HF_TOKEN = "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"
+# The HF token for gated calibration datasets (nvidia/Nemotron-Post-Training-Dataset-v2)
+# is taken from the caller's environment by the shell command below and is never
+# stored in this file. Any token that was previously committed here must be revoked.
 
@@ -53,6 +54,7 @@ LOG_FILE = f"/root/nvidia-meeting/modelopt_{QUANT}.log"
 # ── Run ─────────────────────────────────────────────────────────────────────
 cmd = f"""cd {SCRIPT_DIR} && \\
 . /root/nvidia-meeting/venv/bin/activate && \\
-HF_TOKEN={HF_TOKEN} \\
+export HF_TOKEN="${{HF_TOKEN:?HF_TOKEN must be set in the environment}}" && \\
+export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" && \\
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
 bash scripts/huggingface_example.sh \\