From f65d4ab99f88686a7719ecc4b8b7daf39702ffed Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 10 May 2026 07:54:34 +0000 Subject: [PATCH] Run 11 SUCCESS: 881GB NVFP4 exported, add vLLM serve script --- README.md | 14 ++++++++-- scripts/serve_vllm.py | 59 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 scripts/serve_vllm.py diff --git a/README.md b/README.md index da40ce1..15a460d 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,19 @@ # DeepSeek V4 Pro → NVFP4 Quantization -Full NVFP4 quantization of DeepSeek V4 Pro on a single B200 node (8× B200, 2.7TB RAM, 13TB NVMe). Target: ~600GB. +Full NVFP4 quantization of DeepSeek V4 Pro on a single B200 node (8× B200, 2.7TB RAM, 13TB NVMe). **Result: 881GB NVFP4 (Run 11).** **Cost:** ~$161/run at $23/hr (7 hours each). Don't waste runs. +## ✅ Final Result (Run 11) + +- **Output:** `/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4` — 881GB, 95 safetensors +- **Config:** `nvfp4` full quantization, 128 calib samples, `kv_cache_qformat=fp8_cast` +- **Total runtime:** ~7,783s (~2h10m end-to-end) +- **Peak GPU mem:** ~163GB per B200 +- **Amax snapshots:** 47,696 quantizers, 15.4MB +- **Calibrated state:** 721.4GB (insurance, can re-export with `--export-only`) +- A few experts (11, 83, 100, 112, 254) had uncalibrated amax — weight-derived fallback used (normal for sparse MoE with 256 experts) + ## Architecture We call modelopt's `hf_ptq.main()` directly — the same entry point the shell script uses. We don't rewrite the pipeline. 
We just:
@@ -79,7 +89,7 @@ python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --valid
 | 8 | May 9 ~14:00 | `b2849a8` | 128 | ❌ Argparse crash | Wrong flag names (shell script names vs `hf_ptq.py` names) | Use `hf_ptq.py` flag names |
 | 9 | May 9 ~14:30 | `a300302` | 128 | ❌ TypeError | Skipped `__main__` post-parse conversions (`calib_size` still string, not int list) | Apply same conversions after `parse_args()` |
 | 10 | May 9 ~15:30 | `5a72da7` | 128 | ❌ Export crash (calib ✅) | `get_weight_scaling_factor` reads stale GPU weight → `cudaErrorIllegalAddress` | Patch `_export_quantized_weight` to force weight to CPU at entry point |
-| 11 | May 9 ~22:50 | `07cd50e` | 128 | 🔄 Running | — | 8 patches covering full export chain |
+| 11 | May 9 ~22:50 | `07cd50e` | 128 | ✅ **SUCCESS** | — | 8 patches covering full export chain |
 
 ### Key Lessons
 
diff --git a/scripts/serve_vllm.py b/scripts/serve_vllm.py
new file mode 100644
index 0000000..c7e3cdd
--- /dev/null
+++ b/scripts/serve_vllm.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+DeepSeek V4 Pro NVFP4 — vLLM OpenAI-compatible server.
+
+Run from the venv on the B200 node:
+    source /root/nvidia-meeting/venv/bin/activate
+    python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py
+
+Or in the background:
+    nohup python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py \
+        > /root/nvidia-meeting/vllm_serve.log 2>&1 &
+"""
+
+import shlex
+import subprocess
+import sys
+
+MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
+
+# These flags are critical for V4 — do not change without understanding why:
+#   --trust-remote-code         V4 needs custom modeling code
+#   --kv-cache-dtype fp8        Match our kv_cache_qformat=fp8_cast quantization
+#   --block-size 256            V4 recommended block size
+#   --enable-expert-parallel    Distribute expert computation across GPUs (critical for 256-expert MoE)
+#   --tensor-parallel-size 8    8× B200
+#   --compilation-config        CUDA graphs for throughput — FULL_AND_PIECEWISE + all custom ops
+#   --attention_config          FP4 indexer cache for V4 MLA attention
+#   --moe-backend               deep_gemm_mega_moe — optimized MoE kernel for Blackwell
+#   --tokenizer-mode            deepseek_v4 — V4-specific tokenizer
+#   --tool-call-parser          deepseek_v4 — native tool calling
+#   --enable-auto-tool-choice   Auto tool choice for function calling
+#   --reasoning-parser          deepseek_v4 — reasoning/thinking output parsing
+#   --speculative_config        MTP speculative decoding (2 speculative tokens)
+
+cmd = [
+    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
+    "--model", MODEL,
+    "--trust-remote-code",
+    "--kv-cache-dtype", "fp8",
+    "--block-size", "256",
+    "--enable-expert-parallel",
+    "--tensor-parallel-size", "8",
+    "--compilation-config", '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}',
+    "--attention_config.use_fp4_indexer_cache=True",
+    "--moe-backend", "deep_gemm_mega_moe",
+    "--tokenizer-mode", "deepseek_v4",
+    "--tool-call-parser", "deepseek_v4",
+    "--enable-auto-tool-choice",
+    "--reasoning-parser", "deepseek_v4",
+    "--speculative_config", '{"method":"mtp","num_speculative_tokens":2}',
+    "--host", "0.0.0.0",
+    "--port", "8000",
+]
+
+print(f"Starting vLLM server for {MODEL}")
+print(f"Command: {shlex.join(cmd)}")  # shell-quoted: JSON args stay copy-pasteable
+# Log path applies only to the nohup launch above; flush so the banner precedes vLLM's output.
+print("Log: /root/nvidia-meeting/vllm_serve.log", end="\n\n", flush=True)
+sys.exit(subprocess.call(cmd))