Compare commits
128 Commits
nvidia-mod
...
mega-moe-n
| Author | SHA1 | Date | |
|---|---|---|---|
| f08bcd456b | |||
| 2bdda36bb7 | |||
| fa825c16b9 | |||
| dbf1d11f9f | |||
| ef3edb3481 | |||
| b74dc7121a | |||
| 7bbbdbcc79 | |||
| 2e674f87c1 | |||
| 5d127d8294 | |||
| 52cf3f2e25 | |||
| 02decb486e | |||
| 48f1f9dc5e | |||
| 5cabc1f7d9 | |||
| 25a2d4e6ad | |||
| d88ea9842b | |||
| 91d7d9bad7 | |||
| d68e113af1 | |||
| f0652693a6 | |||
| 054792c84e | |||
| de055b1e77 | |||
| 307574bc91 | |||
| fcd6de0a60 | |||
| d4c557fddc | |||
| 28afc2406b | |||
| 787d427847 | |||
| 8737fd57c0 | |||
| 52c3aefe73 | |||
| ca1d306890 | |||
| b8f95ffad3 | |||
| 5840291ea3 | |||
| 5ea5b579c3 | |||
| 74af9984f6 | |||
| a36bf47f11 | |||
| 27dbf2850f | |||
| 3d1f3de190 | |||
| 79d866995f | |||
| c85b84b0fe | |||
| 01cfd02759 | |||
| 076d325c97 | |||
| 8dc917c498 | |||
| 17ba5a9d7b | |||
| 7a4403fa98 | |||
| 0fd2d4f078 | |||
| 50a945bde4 | |||
| 48b905406a | |||
| 35f6b66678 | |||
| f32d6b5b48 | |||
| cd24182e36 | |||
| 8ae2214bad | |||
| c4891e9ee2 | |||
| 436109081c | |||
| 5faf9916eb | |||
| 220649c188 | |||
| cfead0012d | |||
| 8cb23bdb78 | |||
| ff579c9767 | |||
| 1da40c53da | |||
| b532742530 | |||
| b1cf4232ee | |||
| a2e9b5f17f | |||
| c8564caf9d | |||
| 7c8c6cd67f | |||
| cffb373759 | |||
| 983ba02c5b | |||
| f0471ed1c2 | |||
| c234190a80 | |||
| e963325b61 | |||
| 7e2f219259 | |||
| cf54b4755a | |||
| 7febeaeb71 | |||
| 26aaaba4a2 | |||
| 67f9086a26 | |||
| 02b8ea536f | |||
| 653e2d7a50 | |||
| db16be8e5d | |||
| 6fd03a0aa0 | |||
| d88793dee6 | |||
| 30608e3834 | |||
| 0d74b97fb2 | |||
| f65d4ab99f | |||
| eb80bd6f80 | |||
| 07cd50e823 | |||
| efc111a11f | |||
| ce9056d259 | |||
| 5a72da7193 | |||
| 8612914169 | |||
| a300302486 | |||
| 1a36a655ea | |||
| b2849a8944 | |||
| a70593d886 | |||
| 25b4d8da06 | |||
| d1e15178b2 | |||
| 6c1bff6997 | |||
| 86dd8df302 | |||
| 99f861f48a | |||
| f9bbef8e91 | |||
| 94179ed9d0 | |||
| 03c10ab3b6 | |||
| 9438af5a8c | |||
| d7593fc1dd | |||
| 6eaba26914 | |||
| 3907838409 | |||
| 382c1d872f | |||
| 9291165ba0 | |||
| a0bacb3cf6 | |||
| 04304fdae6 | |||
| 50348989b2 | |||
| 24e3b3745d | |||
| b08afea425 | |||
| a2370006f7 | |||
| f1d21900ea | |||
| ca9a4f5eaa | |||
| eeba101cc4 | |||
| 075da675dc | |||
| 36e1342270 | |||
| 3d38e1d5cd | |||
| d0fc5338fe | |||
| b70a04696e | |||
| f63eed5cfd | |||
| f8533197f2 | |||
| b5d569218c | |||
| db6beb5b76 | |||
| cbfc5a9afb | |||
| b5d14aa8b8 | |||
| 6008cf128d | |||
| a7664aee7d | |||
| 7a3b81e833 | |||
| ef89ceffbd |
30
Dockerfile
Normal file
30
Dockerfile
Normal file
@@ -0,0 +1,30 @@
|
||||
# DeepSeek V4 NVFP4 vLLM + DeepGEMM Mega MoE
|
||||
# Extends the vLLM dream-build container with our custom DeepGEMM kernel
|
||||
|
||||
FROM atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
|
||||
|
||||
# Install build essentials
|
||||
RUN apt-get update && apt-get install -y git screen cmake && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Clone and build DeepGEMM with NVFP4 mega_moe kernel
|
||||
# CACHE_BUSTER: increment to force fresh clone
|
||||
RUN git clone -b nvfp4-mega-moe https://sweetapi.com/biondizzle/DeepGEMM.git /root/DeepGEMM && PATCH_CACHE_BUSTER=70
|
||||
|
||||
# Build DeepGEMM with proper CUDA/NVRTC paths
|
||||
ENV CPATH="/usr/local/lib/python3.12/dist-packages/flashinfer/data/cutlass/include:/usr/local/lib/python3.12/dist-packages/nvidia/cu13/include:/usr/local/cuda-13.0/include:${CPATH}"
|
||||
ENV PYTHONPATH="/root/DeepGEMM:${PYTHONPATH}"
|
||||
# NVRTC lives in the pip nvidia/cu13 package, but the linker expects it in cuda/lib64
|
||||
# Create a symlink so -lnvrtc resolves
|
||||
RUN ln -sf /usr/local/lib/python3.12/dist-packages/nvidia/cu13/lib/libnvrtc.so.13 /usr/local/cuda/lib64/libnvrtc.so && PATCH_CACHE_BUSTER=70
|
||||
RUN cd /root/DeepGEMM && python3 setup.py build_ext --inplace && PATCH_CACHE_BUSTER=69
|
||||
|
||||
# Bust cache for patch changes — ARG before COPY ensures layer invalidation
|
||||
ARG PATCH_CACHE_BUSTER=70
|
||||
# Copy our DeepSeek V4 patch over vLLM's model file
|
||||
COPY patches/deepseek_v4.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py
|
||||
# Copy the NVFP4 staging kernel (BF16→E2M1+UE4M3 quantization for activations)
|
||||
COPY patches/staging_kernel.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/staging_kernel.py
|
||||
|
||||
# Verify everything imports
|
||||
RUN python3 -c "import deep_gemm; print('DeepGEMM NVFP4 OK')" && \
|
||||
python3 -c "import vllm; print('vLLM OK')"
|
||||
275
README.md
275
README.md
@@ -1,75 +1,216 @@
|
||||
# DeepSeek V4 Pro → NVFP4 via NVIDIA Model Optimizer
|
||||
# DeepSeek V4 Pro → NVFP4 Quantization + vLLM Serving
|
||||
|
||||
Fallback quantization path using NVIDIA's official Model Optimizer (`nvidia-modelopt`) PTQ pipeline.
|
||||
Full NVFP4 quantization of DeepSeek V4 Pro and vLLM serving on 8× NVIDIA B200 GPUs.
|
||||
|
||||
## Why this branch
|
||||
## Quick Status
|
||||
|
||||
Path A (custom streaming FP8→NVFP4) is weight-only W4A16. If it doesn't produce good enough accuracy, NVIDIA's Model Optimizer provides data-driven calibration with proper activation scales, and is the officially supported path for DeepSeek V3/V4 NVFP4.
|
||||
| Component | Status |
|
||||
|-----------|--------|
|
||||
| NVFP4 Quantization | ✅ 881GB (Run 11), modelopt 0.45.0.dev64 |
|
||||
| Weight Loading | ✅ 95 safetensors shards, all 8 TP ranks |
|
||||
| Dequant Verification | ✅ Bit-exact match against official dequant (0.0 relative error) |
|
||||
| NVFP4→FP8 Conversion (wo_a) | ✅ DeepGEMM block-scale format |
|
||||
| NVFP4→BF16 Dequantization | ✅ 305 attn/shared, 91 compressor layers |
|
||||
| Compressor Reconstruction | ✅ Separate kv_proj/gate_proj → fused_wkv_wgate |
|
||||
| MoE Expert Serving (MegaMoE) | 🔧 Kernel builds & runs on sm_100a, debugging illegal CUDA access |
|
||||
| Output Quality | 🔧 Under investigation |
|
||||
|
||||
## What's here
|
||||
## B200 Node
|
||||
|
||||
- **IP**: `45.76.247.107`
|
||||
- **User**: `root`
|
||||
- **Password**: see `.env`
|
||||
- **GPUs**: 8× NVIDIA B200 (SM100a)
|
||||
- **Model weights**: `/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4/`
|
||||
- **BF16 reference**: `/root/nvidia-meeting/DeepSeek-V4-Pro-BF16/`
|
||||
|
||||
## Repositories
|
||||
|
||||
| Repo | Branch | Purpose |
|
||||
|------|--------|---------|
|
||||
| `deepseek-v4-quant` | `modelopt-nvfp4` | Main repo: patches, quantize, serve scripts |
|
||||
| `DeepGEMM` | `nvfp4-mega-moe` | NVFP4 mega_moe kernel fork |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
DeepSeek V4 Pro (1.2T params, 61 layers)
|
||||
├── MLA Attention (61 layers)
|
||||
│ ├── fused_wqa_wkv → BF16 (UnquantizedLinearMethod)
|
||||
│ ├── wo_a → FP8 (DeepGEMM block-scale, BMM einsum)
|
||||
│ ├── wo_b → BF16 (UnquantizedLinearMethod)
|
||||
│ └── compressor.fused_wkv_wgate → BF16 (reconstructed from NVFP4)
|
||||
├── MoE Experts (256 experts per layer, 61 layers)
|
||||
│ └── MegaMoE path → NVFP4 (DeepGEMM mxf4nvf4, native block16)
|
||||
└── Shared Expert → FP8 (Fp8LinearMethod, DeepGEMM)
|
||||
```
|
||||
|
||||
## NVFP4 Format (Confirmed)
|
||||
|
||||
| Field | Format | Notes |
|
||||
|-------|--------|-------|
|
||||
| Weights | E2M1 packed uint8 | 2 values per byte |
|
||||
| Block scales | `torch.float8_e4m3fn` (UE4M3) | Standard NVFP4 spec, group_size=16 |
|
||||
| Global scales | `torch.float32` (weight_scale_2) | **Scalar per expert** (`torch.Size([])`) |
|
||||
| Dequant | `value = packed_E2M1 * block_scale * global_scale` | Block scale range [0, 448] |
|
||||
|
||||
**Key finding**: The checkpoint stores block scales as `torch.float8_e4m3fn` (UE4M3), NOT UE8M0.
|
||||
`.to(torch.float32)` is the correct conversion. The shift-by-23 trick was wrong — it was
|
||||
applying an E8M0→float conversion to E4M3 bytes, producing garbage.
|
||||
|
||||
## Dequant Verification
|
||||
|
||||
We verified the dequant path is bit-exact against the official reference:
|
||||
|
||||
```python
|
||||
W_bf16 = dequantize_fp4_weight(W_int, S)
|
||||
y_ours = W_bf16 @ x.bfloat16()
|
||||
y_ref = official_expert_forward(W_int, S, x)
|
||||
print((y_ours - y_ref).abs().max() / y_ref.abs().mean())
|
||||
```
|
||||
|
||||
Result:
|
||||
```
|
||||
Max abs diff: 0.00000000
|
||||
Mean abs diff: 0.00000000
|
||||
Relative error: 0.000000
|
||||
Matmul max diff: 0.00000000
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
### 1. Quantize
|
||||
|
||||
```bash
|
||||
# On B200 node, in screen
|
||||
screen -S quantize
|
||||
cd /root/nvidia-meeting
|
||||
bash run_quantize_nvfp4.sh
|
||||
# ~7 hours, $161 per run
|
||||
```
|
||||
|
||||
### 2. Build Container
|
||||
|
||||
```bash
|
||||
# From this repo
|
||||
bash build_push.sh
|
||||
# Always build in screen: screen -S build
|
||||
```
|
||||
|
||||
The Dockerfile:
|
||||
1. Extends `atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build`
|
||||
2. Clones DeepGEMM (`nvfp4-mega-moe` branch) and builds
|
||||
3. Copies `patches/deepseek_v4.py` over vLLM's model file
|
||||
|
||||
### 3. Serve
|
||||
|
||||
```bash
|
||||
# On B200 node
|
||||
cd /root/nvidia-meeting
|
||||
docker compose up -d
|
||||
|
||||
# Check logs
|
||||
docker logs -f nvidia-meeting-vllm-1
|
||||
|
||||
# Test
|
||||
curl http://localhost:8000/v1/models
|
||||
curl http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model": "/model", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 50}'
|
||||
```
|
||||
|
||||
### vLLM Flags
|
||||
|
||||
```
|
||||
--trust-remote-code
|
||||
--kv-cache-dtype fp8
|
||||
--block-size 256
|
||||
--enable-expert-parallel
|
||||
--tensor-parallel-size 8
|
||||
--compilation-config {"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
|
||||
--attention_config.use_fp4_indexer_cache=True
|
||||
--tokenizer-mode deepseek_v4
|
||||
--tool-call-parser deepseek_v4
|
||||
--enable-auto-tool-choice
|
||||
--reasoning-parser deepseek_v4
|
||||
--speculative_config {"method":"mtp","num_speculative_tokens":2}
|
||||
```
|
||||
|
||||
## NVFP4 Mega MoE Kernel
|
||||
|
||||
### What We Built
|
||||
|
||||
A native NVFP4 mega_moe kernel in our DeepGEMM fork. Weights stay in E2M1 packed format
|
||||
and use `kind::mxf4nvf4.block_scale.scale_vec::4X` MMA directly on SM100a (B200).
|
||||
|
||||
**This is novel — NVIDIA has not done NVFP4→vLLM integration.**
|
||||
|
||||
### Kernel Architecture
|
||||
|
||||
| Parameter | Value |
|
||||
|-----------|-------|
|
||||
| PTX instruction | `tcgen05.mma.kind::mxf4nvf4.block_scale.scale_vec::4X` |
|
||||
| kGranK | 16 (NVFP4 native block_size) |
|
||||
| Weight format | E2M1 packed uint8 (unchanged from checkpoint) |
|
||||
| Block scales | UE4M3 (float8_e4m3fn), native — no conversion needed |
|
||||
| Global scales | Folded into block scales before packing |
|
||||
| Instruction desc | `float_ue4m3_t` |
|
||||
| SF layout | block16, scale_vec::4X |
|
||||
| UTCCP stride | i*8 (4X layout) |
|
||||
| kNumSFUint32 | kHidden / 64 (4 UE4M3 per int32) |
|
||||
| recipe | (1, 1, 16) |
|
||||
| Target arch | `sm_100a` (the `a` suffix is **required**) |
|
||||
|
||||
### Python API
|
||||
|
||||
- `fp8_nvfp4_mega_moe()` — entry point, recipe=(1,1,16)
|
||||
- `transform_nvfp4_weights_for_mega_moe()` — fold global scales, pack UE4M3→int32, TMA-align
|
||||
- `get_symm_buffer_for_nvfp4_mega_moe()` — 2× SF buffer vs MXFP4
|
||||
|
||||
### C++ Bindings
|
||||
|
||||
- `csrc/apis/mega_nvfp4.hpp` — kGranK=16, SF stride K/16, packed E2M1 hidden/2
|
||||
- `csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp` — host-side TMA descriptors
|
||||
- `deep_gemm/include/deep_gemm/impls/sm100_fp8_nvfp4_mega_moe.cuh` — kernel
|
||||
|
||||
### Full FP4 Pipeline
|
||||
|
||||
The `mxf4nvf4` instruction is FP4×FP4 — both activations (A) and weights (B) must be E2M1 packed.
|
||||
A Triton staging kernel quantizes BF16 activations → E2M1 packed uint8 + UE4M3 block16 scales
|
||||
before the GEMM. The L1 epilogue outputs UE4M3 activation scales directly (float→e4m3 cast).
|
||||
|
||||
## Bugs Found and Fixed
|
||||
|
||||
| # | Bug | Impact | Fix |
|
||||
|---|-----|--------|-----|
|
||||
| 1 | DeepGEMM `sf.dim()` crash | Server crash | `deepgemm_post_process_fp8_weight_block` for block-scale format |
|
||||
| 2 | Block scale dtype `float8_e4m3fn` | Crash | Use `float32` for block-scale tensor |
|
||||
| 3 | Missing `deepgemm_post_process` args | Crash | Pass `quant_block_shape`, `use_e8m0` |
|
||||
| 4 | Compressor indexer shape mismatch | Crash | `.indexer.` sub-path in checkpoint keys |
|
||||
| 5 | All-ones block scale | Garbage output | `torch.full(..., fp8_scale)` not `torch.ones` |
|
||||
| 6 | `fused_skip_regex` skipping q_b/o_a/o_b scales | Garbage output | Remove non-fused scale entries from skip list |
|
||||
| 7 | UE8M0 shift-by-23 applied to E4M3 scales | Garbled output | Checkpoint is standard UE4M3 — use `.to(float32)` (shift-by-23 was wrong) |
|
||||
| 8 | wo_a BF16→NVFP4 on-the-fly used UE8M0 encoding | Scrambled attention | Produce UE4M3 directly: `.clamp(0, 448).to(float8_e4m3fn)` |
|
||||
| 9 | FP8 activations fed to mxf4nvf4 (FP4×FP4 instruction) | Crash/garbled | Full FP4 pipeline: activations are E2M1 packed + UE4M3 scales |
|
||||
| 10 | Staging kernel SF pack: shift ≥32 is UB | Half the activation scales zeroed | Split into 2 int32 writes per k_block (groups 0-3, 4-7) |
|
||||
| 11 | Staging kernel wrote unpacked E2M1 (1 byte/elem) into packed buffer | 2× buffer overflow | Pack even/odd nibble pairs, write BLOCK_K//2 bytes |
|
||||
| 12 | `compute-sanitizer` build running during debug | Slow (50×), masking timing | Remove sanitizer, rebuild |
|
||||
|
||||
## Files
|
||||
|
||||
| File | Purpose |
|
||||
| --- | --- |
|
||||
| `quantize_modelopt.py` | PTQ via `nvidia-modelopt` with `NVFP4_EXPERTS_ONLY` config |
|
||||
|------|---------|
|
||||
| `patches/deepseek_v4.py` | Main patch: NVFP4 weight loading, dequant, staging kernel, MegaMoE |
|
||||
| `patches/staging_kernel.py` | Reference copy of Triton staging kernel (live copy is in deepseek_v4.py) |
|
||||
| `scripts/dequant_fp8_to_bf16.py` | BF16 dequantization utility |
|
||||
| `scripts/quantize_nvfp4.py` | NVFP4 quantization runner |
|
||||
| `scripts/serve_vllm.py` | Standalone vLLM server launcher |
|
||||
| `Dockerfile` | Container build (extends dream-build with DeepGEMM + patch) |
|
||||
| `docker-compose.yml` | Production serve config |
|
||||
| `build_push.sh` | Build, push to CR, update docker-compose |
|
||||
|
||||
## Quantization config
|
||||
## HARD RULES
|
||||
|
||||
Using `nvfp4_experts_only` — NVIDIA's recommended config for MoE models. This quantizes only the expert MLP layers (`mlp.experts` / `block_sparse_moe`) while keeping attention QKV projections in higher precision. Options:
|
||||
|
||||
- `nvfp4_experts_only` — Experts only (recommended for MoE)
|
||||
- `nvfp4_mlp_only` — All MLP layers (experts + shared)
|
||||
- `nvfp4` — Full model NVFP4 (riskier for attention)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
```bash
|
||||
# Use the TensorRT-LLM docker if possible:
|
||||
# docker run --gpus all -it nvcr.io/nvidia/tensorrt-llm/release:1.2.0 bash
|
||||
|
||||
# Otherwise pip install:
|
||||
pip install -U "nvidia-modelopt[hf]"
|
||||
pip install compressed-tensors fire flash-attn transformers_stream_generator zstandard
|
||||
# Note: requires transformers<5.0 for modelopt compatibility
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# On the B200 node (8× B200, 2.7 TB RAM)
|
||||
cd /root/nvidia-meeting
|
||||
source venv/bin/activate
|
||||
|
||||
# Using BF16 source weights (preferred for modelopt calibration)
|
||||
python quantize_modelopt.py \
|
||||
--model /root/nvidia-meeting/DeepSeek-V4-Pro \
|
||||
--export_dir /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4-modelopt \
|
||||
--qformat nvfp4_experts_only \
|
||||
--tp 8 \
|
||||
--calib_size 256
|
||||
|
||||
# Using FP8 source (modelopt handles dequant internally)
|
||||
python quantize_modelopt.py \
|
||||
--model /root/nvidia-meeting/DeepSeek-V4-Pro-FP8 \
|
||||
--export_dir /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4-modelopt-fp8src \
|
||||
--qformat nvfp4_experts_only \
|
||||
--tp 8 \
|
||||
--calib_size 256
|
||||
```
|
||||
|
||||
## Low-memory options
|
||||
|
||||
If you hit OOM during calibration:
|
||||
|
||||
- `--use_seq_device_map` — sequential device mapping across GPUs
|
||||
- `--low_memory_mode` — compress weights before calibration (FP8/NVFP4 only)
|
||||
|
||||
## Output
|
||||
|
||||
Exports a **Unified HuggingFace checkpoint** compatible with:
|
||||
- TensorRT-LLM (PyTorch and C++ backends)
|
||||
- vLLM
|
||||
- SGLang
|
||||
|
||||
## Expected runtime
|
||||
|
||||
24-72 hours for full calibration on 8× B200 with 256 calibration samples.
|
||||
- **NEVER convert DeepSeek MoE experts to MXFP4.** Experts stay in NVFP4. Period.
|
||||
- **The checkpoint is UE4M3 (float8_e4m3fn), NOT UE8M0.** Never use shift-by-23 on these bytes.
|
||||
- **Target `sm_100a`, not `sm_100`.** The `a` suffix is required for mxf4nvf4 instructions.
|
||||
|
||||
35
docker-compose.yml
Normal file
35
docker-compose.yml
Normal file
@@ -0,0 +1,35 @@
|
||||
services:
|
||||
vllm:
|
||||
#image: atl.vultrcr.com/vllm/vllm-dsv4-nvfp4:latest
|
||||
build:
|
||||
context: .
|
||||
pull_policy: always
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
- OMP_NUM_THREADS=128
|
||||
#- VLLM_USE_FLASHINFER_MOE_FP4=1 # What the fuck is this!?
|
||||
command:
|
||||
- /model
|
||||
- --trust-remote-code
|
||||
#- --kv-cache-dtype=fp8 # maybe we just let it figure its own shit out
|
||||
#- --block-size=256
|
||||
- --enable-expert-parallel
|
||||
- --tensor-parallel-size=8
|
||||
- --enforce-eager
|
||||
#- --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
|
||||
#- --attention_config.use_fp4_indexer_cache=True
|
||||
- --tokenizer-mode=deepseek_v4
|
||||
#- --speculative_config={"method":"mtp","num_speculative_tokens":2}
|
||||
- --host=0.0.0.0
|
||||
- --port=8000
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
volumes:
|
||||
- /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
|
||||
|
||||
2364
patches/deepseek_v4.py
Normal file
2364
patches/deepseek_v4.py
Normal file
File diff suppressed because it is too large
Load Diff
270
patches/staging_kernel.py
Normal file
270
patches/staging_kernel.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""
|
||||
NVFP4 staging kernel — full FP4 (E2M1) activations + UE4M3 block16 scales.
|
||||
|
||||
The mxf4nvf4 PTX instruction requires BOTH A and B to be FP4 (E2M1 packed).
|
||||
This kernel quantizes BF16 activations → E2M1 packed uint8 with UE4M3 scales.
|
||||
"""
|
||||
import triton
|
||||
import triton.language as tl
|
||||
import torch
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _deepseek_v4_stage_mega_moe_inputs_kernel(
|
||||
hidden_states,
|
||||
x_fp4, # uint8, shape (M, K//2) — E2M1 packed, 2 values per byte
|
||||
x_sf, # int32, shape (M, K//64) — UE4M3 packed, 4 scales per int32
|
||||
topk_ids,
|
||||
topk_weights,
|
||||
topk_idx_out,
|
||||
topk_weights_out,
|
||||
hidden_stride_m: tl.constexpr,
|
||||
hidden_stride_k: tl.constexpr,
|
||||
x_stride_m: tl.constexpr,
|
||||
x_stride_k: tl.constexpr,
|
||||
x_sf_stride_m: tl.constexpr,
|
||||
x_sf_stride_k: tl.constexpr,
|
||||
topk_ids_stride_m: tl.constexpr,
|
||||
topk_ids_stride_k: tl.constexpr,
|
||||
topk_weights_stride_m: tl.constexpr,
|
||||
topk_weights_stride_k: tl.constexpr,
|
||||
topk_idx_stride_m: tl.constexpr,
|
||||
topk_idx_stride_k: tl.constexpr,
|
||||
topk_weights_out_stride_m: tl.constexpr,
|
||||
topk_weights_out_stride_k: tl.constexpr,
|
||||
hidden_size: tl.constexpr,
|
||||
top_k: tl.constexpr,
|
||||
BLOCK_K: tl.constexpr, # 128 elements (loaded from hidden)
|
||||
GROUP_K: tl.constexpr, # 16 (NVFP4 group_size)
|
||||
BLOCK_TOPK: tl.constexpr,
|
||||
) -> None:
|
||||
token_id = tl.program_id(0)
|
||||
k_block_id = tl.program_id(1)
|
||||
|
||||
k_offsets = k_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
|
||||
k_mask = k_offsets < hidden_size
|
||||
hidden = tl.load(
|
||||
hidden_states + token_id * hidden_stride_m + k_offsets * hidden_stride_k,
|
||||
mask=k_mask,
|
||||
other=0.0,
|
||||
).to(tl.float32)
|
||||
|
||||
num_groups: tl.constexpr = BLOCK_K // GROUP_K # 8
|
||||
hidden_groups = tl.reshape(hidden, [num_groups, GROUP_K])
|
||||
abs_groups = tl.reshape(tl.abs(hidden), [num_groups, GROUP_K])
|
||||
amax = tl.max(abs_groups, axis=1)
|
||||
amax = tl.maximum(amax, 1.0e-4)
|
||||
|
||||
# ---- UE4M3 scale computation ----
|
||||
# scale = amax / 6.0 (E2M1 max value = 6)
|
||||
# Then quantize scale to UE4M3 format
|
||||
scale = amax / 6.0
|
||||
scale_bits = scale.to(tl.uint32, bitcast=True)
|
||||
scale_exp = (scale_bits >> 23) & 0xFF
|
||||
scale_mant = scale_bits & 0x7FFFFF
|
||||
|
||||
# Convert FP32 → E4M3 manually (with subnormal support)
|
||||
# FP32 bias=127, E4M3 bias=7 → raw exp = scale_exp - 120
|
||||
e4m3_exp_raw = scale_exp - 120 # can be negative → subnormal
|
||||
|
||||
# Normal path: exp >= 1, just truncate mantissa top 3 bits
|
||||
# RNE rounding: need guard (bit 19), sticky (OR of bits 18:0), and LSB of result
|
||||
normal_mant = scale_mant >> 20
|
||||
guard_bit = (scale_mant >> 19) & 1
|
||||
sticky_bit = tl.where((scale_mant & 0x7FFFF) != 0, 1, 0) # OR of bits [18:0]
|
||||
result_lsb = normal_mant & 1
|
||||
# RNE: round up if (guard=1 and sticky=1) or (guard=1 and sticky=0 and lsb=1)
|
||||
round_up = guard_bit & (sticky_bit | result_lsb)
|
||||
normal_mant = normal_mant + round_up
|
||||
normal_exp = e4m3_exp_raw
|
||||
|
||||
# Subnormal path: exp_raw <= 0
|
||||
# Insert implicit leading 1 and right-shift by (1 - exp_raw)
|
||||
# E4M3 subnormal: value = (mant/8) * 2^(1-7) = (mant/8) * 2^-6
|
||||
# So we need: (1 + mant_fp32/2^23) * 2^(exp_raw - 7) = (shifted_mant/8) * 2^-6
|
||||
# shifted_mant = (implicit_1 | mant_fp32) >> (1 - exp_raw - 1) then take top 3 bits
|
||||
shift = 1 - e4m3_exp_raw # positive when subnormal
|
||||
mant_with_leading = (0x800000 | scale_mant) # insert implicit 1
|
||||
# Right-shift to get into the 3-bit E4M3 mantissa window
|
||||
# We want bits [shift+19 : shift+23) of mant_with_leading for 3 mantissa bits + 1 round bit
|
||||
subnormal_mant = (mant_with_leading >> (shift.to(tl.int32) + 20)) & 0x7
|
||||
sub_guard_bit = (mant_with_leading >> (shift.to(tl.int32) + 19)) & 1
|
||||
# Sticky: OR of all bits below the guard bit in the shifted result
|
||||
# shift ≤ 8 in practice (amax floor = 1e-4 → scale ≈ 2^-15 → exp_raw ≈ -7), so mask ≤ 2^27
|
||||
sub_sticky_mask = (1 << (shift.to(tl.int32) + 19)) - 1
|
||||
sub_sticky_bit = tl.where((mant_with_leading & sub_sticky_mask) != 0, 1, 0)
|
||||
sub_result_lsb = subnormal_mant & 1
|
||||
sub_round_up = sub_guard_bit & (sub_sticky_bit | sub_result_lsb)
|
||||
subnormal_mant = subnormal_mant + sub_round_up
|
||||
|
||||
is_normal = e4m3_exp_raw >= 1
|
||||
e4m3_mant = tl.where(is_normal, normal_mant, subnormal_mant)
|
||||
e4m3_exp = tl.where(is_normal, normal_exp, 0) # exp=0 for subnormals
|
||||
|
||||
# Handle mantissa overflow after rounding
|
||||
overflow = e4m3_mant >= 8
|
||||
e4m3_mant = tl.where(overflow, 0, e4m3_mant)
|
||||
e4m3_exp = tl.where(overflow, e4m3_exp + 1, e4m3_exp)
|
||||
e4m3_exp = tl.maximum(e4m3_exp, 0)
|
||||
e4m3_exp = tl.minimum(e4m3_exp, 15)
|
||||
scale_e4m3_bits = (e4m3_exp << 3) | e4m3_mant
|
||||
|
||||
# Reconstruct dequantized scale by decoding the STORED E4M3 bits.
|
||||
# This guarantees the E2M1 quantization divides by exactly the value
|
||||
# the CUDA kernel will multiply back — same bits, single decode, no
|
||||
# possibility of encode/decode disagreement.
|
||||
stored_exp = (scale_e4m3_bits >> 3) & 0xF
|
||||
stored_mant = scale_e4m3_bits & 0x7
|
||||
e4m3_exp_for_recon = tl.maximum(stored_exp.to(tl.int32) - 7, -126)
|
||||
two_pow_exp_bits = (e4m3_exp_for_recon + 127).to(tl.uint32) << 23
|
||||
two_pow_exp = two_pow_exp_bits.to(tl.float32, bitcast=True)
|
||||
normal_value = (1.0 + stored_mant.to(tl.float32) / 8.0) * two_pow_exp
|
||||
subnormal_value = (stored_mant.to(tl.float32) / 8.0) * 0.015625
|
||||
e4m3_value = tl.where(stored_exp == 0, subnormal_value, normal_value)
|
||||
|
||||
# ---- E2M1 FP4 quantization (unpacked, 1 byte/element) ----
|
||||
# E2M1 LUT (unsigned): [0, 0.5, 1, 1.5, 2, 3, 4, 6]
|
||||
# Nearest-neighbor using thresholds (midpoints between consecutive values)
|
||||
scaled = hidden_groups * (1.0 / tl.maximum(e4m3_value, 1e-6))[:, None]
|
||||
# Clamp to E2M1 range [-6, 6]
|
||||
scaled = tl.maximum(scaled, -6.0)
|
||||
scaled = tl.minimum(scaled, 6.0)
|
||||
|
||||
abs_s = tl.abs(scaled)
|
||||
# Thresholds: midpoints between [0, 0.5, 1, 1.5, 2, 3, 4, 6]
|
||||
# [0, 0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0, INF]
|
||||
e2m1_idx = tl.where(abs_s < 0.25, 0,
|
||||
tl.where(abs_s < 0.75, 1,
|
||||
tl.where(abs_s < 1.25, 2,
|
||||
tl.where(abs_s < 1.75, 3,
|
||||
tl.where(abs_s < 2.5, 4,
|
||||
tl.where(abs_s < 3.5, 5,
|
||||
tl.where(abs_s < 5.0, 6, 7)))))))
|
||||
sign_bit = (scaled < 0).to(tl.int32)
|
||||
e2m1_4bit = (sign_bit << 3) | e2m1_idx # 4-bit: (sign << 3) | index
|
||||
|
||||
# Pack E2M1 pairs into single bytes (2 per byte, low nibble first)
|
||||
# mxf4nvf4 reads FP4 packed from SMEM — must match kernel's TMA layout
|
||||
e2m1_flat = tl.reshape(e2m1_4bit, [BLOCK_K])
|
||||
e2m1_lo = e2m1_flat[0::2] # even indices → low nibble
|
||||
e2m1_hi = e2m1_flat[1::2] # odd indices → high nibble
|
||||
e2m1_packed = (e2m1_hi << 4 | e2m1_lo).to(tl.uint8) # [BLOCK_K // 2]
|
||||
|
||||
k_offsets_out = k_block_id * (BLOCK_K // 2) + tl.arange(0, BLOCK_K // 2)
|
||||
k_mask_out = k_offsets_out < (hidden_size // 2)
|
||||
tl.store(
|
||||
x_fp4 + token_id * x_stride_m + k_offsets_out * x_stride_k,
|
||||
e2m1_packed,
|
||||
mask=k_mask_out,
|
||||
)
|
||||
|
||||
# Pack UE4M3 bytes into int32 (NVFP4: group_size=16, 4 groups per 64 elements)
|
||||
# 8 groups per k_block of 128 → 2 int32s per k_block
|
||||
# int32 can only pack 4 bytes (shifts >= 32 are UB), so split into two packs
|
||||
scale_offsets = tl.arange(0, num_groups) # [0..7]
|
||||
first_half = scale_offsets < 4 # groups 0-3 → int32[0]
|
||||
second_half = scale_offsets >= 4 # groups 4-7 → int32[1]
|
||||
|
||||
packed_lo = tl.sum(
|
||||
tl.where(first_half, scale_e4m3_bits.to(tl.int32) << (scale_offsets * 8), 0),
|
||||
axis=0,
|
||||
).to(tl.int32)
|
||||
packed_hi = tl.sum(
|
||||
tl.where(second_half, scale_e4m3_bits.to(tl.int32) << ((scale_offsets - 4) * 8), 0),
|
||||
axis=0,
|
||||
).to(tl.int32)
|
||||
|
||||
# Write 2 int32s per k_block: x_sf shape is (M, K//64) = (M, num_k_blocks * 2)
|
||||
sf_base = token_id * x_sf_stride_m + k_block_id * 2 * x_sf_stride_k
|
||||
tl.store(x_sf + sf_base, packed_lo)
|
||||
tl.store(x_sf + sf_base + x_sf_stride_k, packed_hi)
|
||||
|
||||
if k_block_id == 0:
|
||||
topk_offsets = tl.arange(0, BLOCK_TOPK)
|
||||
topk_mask = topk_offsets < top_k
|
||||
|
||||
ids = tl.load(
|
||||
topk_ids + token_id * topk_ids_stride_m + topk_offsets * topk_ids_stride_k,
|
||||
mask=topk_mask,
|
||||
other=0,
|
||||
).to(tl.int64)
|
||||
tl.store(
|
||||
topk_idx_out
|
||||
+ token_id * topk_idx_stride_m
|
||||
+ topk_offsets * topk_idx_stride_k,
|
||||
ids,
|
||||
mask=topk_mask,
|
||||
)
|
||||
|
||||
weights = tl.load(
|
||||
topk_weights
|
||||
+ token_id * topk_weights_stride_m
|
||||
+ topk_offsets * topk_weights_stride_k,
|
||||
mask=topk_mask,
|
||||
other=0.0,
|
||||
)
|
||||
tl.store(
|
||||
topk_weights_out
|
||||
+ token_id * topk_weights_out_stride_m
|
||||
+ topk_offsets * topk_weights_out_stride_k,
|
||||
weights,
|
||||
mask=topk_mask,
|
||||
)
|
||||
|
||||
|
||||
def _stage_deepseek_v4_mega_moe_inputs(
|
||||
hidden_states: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
x_fp4: torch.Tensor, # uint8, shape (M, K//2)
|
||||
x_sf: torch.Tensor, # int32, shape (M, K//64)
|
||||
topk_idx_out: torch.Tensor,
|
||||
topk_weights_out: torch.Tensor,
|
||||
) -> None:
|
||||
num_tokens, hidden_size = hidden_states.shape
|
||||
if num_tokens == 0:
|
||||
return
|
||||
if hidden_size % 128 != 0:
|
||||
raise ValueError(
|
||||
"DeepSeek V4 MegaMoE input staging requires hidden_size to be "
|
||||
"a multiple of 128."
|
||||
)
|
||||
top_k = topk_ids.shape[1]
|
||||
if topk_weights.shape != topk_ids.shape:
|
||||
raise ValueError(
|
||||
"DeepSeek V4 MegaMoE input staging requires topk_weights and "
|
||||
"topk_ids to have the same shape."
|
||||
)
|
||||
|
||||
block_k = 128
|
||||
grid = (num_tokens, triton.cdiv(hidden_size, block_k))
|
||||
block_topk = triton.next_power_of_2(top_k)
|
||||
_deepseek_v4_stage_mega_moe_inputs_kernel[grid](
|
||||
hidden_states,
|
||||
x_fp4,
|
||||
x_sf,
|
||||
topk_ids,
|
||||
topk_weights,
|
||||
topk_idx_out,
|
||||
topk_weights_out,
|
||||
hidden_states.stride(0),
|
||||
hidden_states.stride(1),
|
||||
x_fp4.stride(0),
|
||||
x_fp4.stride(1),
|
||||
x_sf.stride(0),
|
||||
x_sf.stride(1),
|
||||
topk_ids.stride(0),
|
||||
topk_ids.stride(1),
|
||||
topk_weights.stride(0),
|
||||
topk_weights.stride(1),
|
||||
topk_idx_out.stride(0),
|
||||
topk_idx_out.stride(1),
|
||||
topk_weights_out.stride(0),
|
||||
topk_weights_out.stride(1),
|
||||
hidden_size,
|
||||
top_k,
|
||||
BLOCK_K=block_k,
|
||||
GROUP_K=16, # NVFP4: group_size=16 (scale_vec::4X)
|
||||
BLOCK_TOPK=block_topk,
|
||||
num_warps=4,
|
||||
)
|
||||
@@ -1,166 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""NVIDIA Model Optimizer PTQ for DeepSeek V4 Pro → NVFP4.
|
||||
|
||||
Uses nvidia-modelopt's official PTQ pipeline with NVFP4Experts-Only config,
|
||||
which quantizes only MoE expert layers while keeping attention QKV in higher
|
||||
precision — the recommended approach for DeepSeek MoE models.
|
||||
|
||||
Output is a Unified HuggingFace checkpoint deployable on TRT-LLM / vLLM / SGLang.
|
||||
|
||||
Usage:
|
||||
python quantize_modelopt.py \
|
||||
--model /root/nvidia-meeting/DeepSeek-V4-Pro \
|
||||
--export_dir /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4-modelopt \
|
||||
--qformat nvfp4_experts_only \
|
||||
--tp 8 \
|
||||
--calib_size 256
|
||||
|
||||
For the FP8 source variant, just change --model path. modelopt handles
|
||||
dequantization internally.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
import modelopt.torch.opt as mto
|
||||
import modelopt.torch.quantization as mtq
|
||||
from modelopt.torch.export import export_hf_checkpoint
|
||||
from modelopt.torch.utils.dataset_utils import create_forward_loop
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
mto.enable_huggingface_checkpointing()
|
||||
|
||||
|
||||
QUANT_CONFIGS = {
|
||||
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
|
||||
"nvfp4_experts_only": mtq.NVFP4_EXPERTS_ONLY_CFG,
|
||||
"nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG,
|
||||
"nvfp4_omlp_only": mtq.NVFP4_OMLP_ONLY_CFG,
|
||||
"fp8": mtq.FP8_DEFAULT_CFG,
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(description="Model Optimizer PTQ for DeepSeek V4 Pro")
|
||||
ap.add_argument("--model", required=True, help="Path to HF model (BF16 or FP8)")
|
||||
ap.add_argument("--export_dir", required=True, help="Output directory for quantized checkpoint")
|
||||
ap.add_argument("--qformat", default="nvfp4_experts_only",
|
||||
choices=list(QUANT_CONFIGS.keys()),
|
||||
help="Quantization format (default: nvfp4_experts_only for MoE)")
|
||||
ap.add_argument("--kv_cache_qformat", default="fp8_cast",
|
||||
help="KV cache quantization (default: fp8_cast, fast no-calib)")
|
||||
ap.add_argument("--tp", type=int, default=8, help="Tensor parallelism for export")
|
||||
ap.add_argument("--calib_size", type=int, nargs="+", default=[256],
|
||||
help="Calibration dataset size (per dataset)")
|
||||
ap.add_argument("--batch_size", type=int, default=1, help="Calibration batch size")
|
||||
ap.add_argument("--calib_seq", type=int, default=4096, help="Max calibration sequence length")
|
||||
ap.add_argument("--trust_remote_code", action="store_true", default=True,
|
||||
help="Trust remote code (required for V4)")
|
||||
ap.add_argument("--use_seq_device_map", action="store_true",
|
||||
help="Use sequential device map for low-memory calibration")
|
||||
ap.add_argument("--low_memory_mode", action="store_true",
|
||||
help="Compress weights before calibration (FP8/NVFP4 only)")
|
||||
args = ap.parse_args()
|
||||
|
||||
print(f"=== Model Optimizer PTQ ===")
|
||||
print(f" Model: {args.model}")
|
||||
print(f" QFormat: {args.qformat}")
|
||||
print(f" KV Cache: {args.kv_cache_qformat}")
|
||||
print(f" TP: {args.tp}")
|
||||
print(f" Calib: {args.calib_size} samples, seq_len={args.calib_seq}")
|
||||
print()
|
||||
|
||||
# Seed everything
|
||||
random.seed(1234)
|
||||
np.random.seed(1234)
|
||||
torch.manual_seed(1234)
|
||||
|
||||
# Load tokenizer
|
||||
print("Loading tokenizer...")
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.model,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
padding_side="left",
|
||||
)
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
# Load model
|
||||
print("Loading model...")
|
||||
model_kwargs = {
|
||||
"trust_remote_code": args.trust_remote_code,
|
||||
"torch_dtype": torch.bfloat16,
|
||||
}
|
||||
if args.use_seq_device_map:
|
||||
model_kwargs["device_map"] = "auto"
|
||||
model_kwargs["offload_folder"] = "offload"
|
||||
model_kwargs["offload_state_dict"] = True
|
||||
model_kwargs["max_memory"] = {i: "100GiB" for i in range(8)}
|
||||
model_kwargs["max_memory"]["cpu"] = "2500GiB"
|
||||
elif args.low_memory_mode:
|
||||
# Load entirely on CPU, modelopt will handle placement
|
||||
model_kwargs["device_map"] = {"": "cpu"}
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(args.model, **model_kwargs)
|
||||
|
||||
if not args.use_seq_device_map and not args.low_memory_mode:
|
||||
model = model.cuda()
|
||||
|
||||
# Build calibration dataloader
|
||||
print("Building calibration dataset...")
|
||||
calib_dataloader = get_dataloader(
|
||||
tokenizer=tokenizer,
|
||||
calib_size=args.calib_size,
|
||||
batch_size=args.batch_size,
|
||||
calib_seq=args.calib_seq,
|
||||
)
|
||||
|
||||
# Build forward loop for calibration
|
||||
def forward_loop(model):
|
||||
for batch in calib_dataloader:
|
||||
model(**batch)
|
||||
|
||||
# Quantize
|
||||
quant_cfg = QUANT_CONFIGS[args.qformat]
|
||||
print(f"Running PTQ with {args.qformat}...")
|
||||
t0 = time.time()
|
||||
|
||||
model = mtq.quantize(model, quant_cfg, forward_loop)
|
||||
|
||||
elapsed = time.time() - t0
|
||||
print(f"Quantization complete in {elapsed/60:.1f} min")
|
||||
|
||||
# Export
|
||||
print(f"Exporting to {args.export_dir} ...")
|
||||
with torch.inference_mode():
|
||||
export_hf_checkpoint(
|
||||
model,
|
||||
args.export_dir,
|
||||
tokenizer=tokenizer,
|
||||
export_tensorrt_llm_plugins=True,
|
||||
)
|
||||
|
||||
print(f"Done. Output at {args.export_dir}")
|
||||
|
||||
|
||||
def get_dataloader(tokenizer, calib_size, batch_size, calib_seq):
|
||||
"""Create calibration dataloader using modelopt's built-in dataset utils."""
|
||||
from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
|
||||
|
||||
return get_dataset_dataloader(
|
||||
tokenizer=tokenizer,
|
||||
num_samples=calib_size[0],
|
||||
batch_size=batch_size,
|
||||
max_sample_length=calib_seq,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
276
scripts/dequant_fp8_to_bf16.py
Normal file
276
scripts/dequant_fp8_to_bf16.py
Normal file
@@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete dequantization of DeepSeek V4 Pro mixed-precision to pure BF16.
|
||||
|
||||
Handles ALL compressed tensor types found in the mixed-precision model:
|
||||
|
||||
1. FP8 attention weights (float8_e4m3fn + float8_e8m0fnu block scales)
|
||||
- weight × scale_expanded → BF16
|
||||
- 128×128 block quantization
|
||||
|
||||
2. FP4 (E2M1) expert weights (int8 packed + float8_e8m0fnu block scales)
|
||||
- Unpack 2 FP4 values per int8 byte (lower nibble first, upper second)
|
||||
- Dequantize via E2M1 LUT lookup × scale_expanded → BF16
|
||||
- Per-row, 32-column block scaling (MXFP4 microscaling format)
|
||||
- Output dimensions are 2× the stored dimensions
|
||||
- Verified: nibble index 0 vs 8 ratio = 0.996 (FP4 -0.0 vs +0.0),
|
||||
NOT INT4 where index 8 = -8 would be rare
|
||||
|
||||
3. FP8 shared expert weights (float8_e4m3fn + float8_e8m0fnu block scales)
|
||||
- Same as FP8 attention dequantization
|
||||
|
||||
After dequantization, all weights are pure BF16. FP8Linear.forward() sees
|
||||
element_size() > 1 and falls back to F.linear(), avoiding broken FP8 kernels
|
||||
on Blackwell GPUs. The model can then be loaded by modelopt without shape
|
||||
mismatches.
|
||||
"""
|
||||
|
||||
import os, glob, json, shutil, sys, time
|
||||
from safetensors import safe_open
|
||||
from safetensors.torch import save_file
|
||||
import torch
|
||||
|
||||
FP8_WEIGHT_DTYPE = torch.float8_e4m3fn
|
||||
FP8_SCALE_DTYPE = torch.float8_e8m0fnu
|
||||
BLOCK_SIZE_FP8 = (128, 128)
|
||||
FP4_BLOCK_SIZE = 32 # columns per scale value for MXFP4 expert weights
|
||||
|
||||
# E2M1 FP4 lookup table (MXFP4 microscaling format)
|
||||
# Index 0-7: positive values (sign=0, 2-bit exp, 1-bit mantissa)
|
||||
# Index 8-15: negative values (sign=1)
|
||||
# Mapping: 0→0, 1→0.5, 2→1, 3→1.5, 4→2, 5→3, 6→4, 7→6
|
||||
FP4_E2M1_LUT = torch.tensor([
|
||||
0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
|
||||
-0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
|
||||
], dtype=torch.float32)
|
||||
|
||||
|
||||
def dequantize_fp8_weight(fp8_weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
|
||||
"""Dequantize block-wise FP8 weight to BF16.
|
||||
|
||||
fp8_weight: (out_features, in_features) float8_e4m3fn
|
||||
scale: (out_features//128, in_features//128) float8_e8m0fnu
|
||||
"""
|
||||
scale_f32 = scale.float()
|
||||
out_features, in_features = fp8_weight.shape
|
||||
scale_expanded = scale_f32.repeat_interleave(BLOCK_SIZE_FP8[0], dim=0).repeat_interleave(BLOCK_SIZE_FP8[1], dim=1)
|
||||
scale_expanded = scale_expanded[:out_features, :in_features]
|
||||
weight_bf16 = fp8_weight.float() * scale_expanded
|
||||
return weight_bf16.to(torch.bfloat16)
|
||||
|
||||
|
||||
def dequantize_fp4_weight(int8_packed: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
|
||||
"""Dequantize MXFP4 (E2M1) expert weight to BF16.
|
||||
|
||||
FP4 values are packed 2-per-byte into int8 tensors.
|
||||
Lower nibble (bits 0-3) is the first value, upper nibble (bits 4-7) is the second.
|
||||
E2M1 format: 1 sign + 2 exponent + 1 mantissa bit.
|
||||
|
||||
Scale is per-row with 32-column blocks (float8_e8m0fnu, MX microscaling).
|
||||
Output dimensions are 2× the stored dimensions.
|
||||
|
||||
int8_packed: (out_features, in_features//2) int8
|
||||
scale: (out_features, in_features//32) float8_e8m0fnu
|
||||
returns: (out_features, in_features) bfloat16
|
||||
"""
|
||||
lut = FP4_E2M1_LUT.to(int8_packed.device)
|
||||
|
||||
# Unpack nibble indices
|
||||
lower_idx = (int8_packed & 0x0F).long() # 0-15
|
||||
upper_idx = ((int8_packed >> 4) & 0x0F).long() # 0-15
|
||||
|
||||
# LUT lookup
|
||||
lower = lut[lower_idx] # float32
|
||||
upper = lut[upper_idx] # float32
|
||||
|
||||
out_features = int8_packed.shape[0]
|
||||
in_features_full = int8_packed.shape[1] * 2 # 2× expansion
|
||||
|
||||
# Expand scale: (out_features, in_features//32) → (out_features, in_features)
|
||||
scale_f32 = scale.float()
|
||||
scale_expanded = scale_f32.repeat_interleave(FP4_BLOCK_SIZE, dim=1)
|
||||
scale_expanded = scale_expanded[:, :in_features_full]
|
||||
|
||||
# Interleave lower and upper nibbles
|
||||
unpacked = torch.empty(out_features, in_features_full, dtype=torch.float32, device=int8_packed.device)
|
||||
unpacked[:, 0::2] = lower
|
||||
unpacked[:, 1::2] = upper
|
||||
|
||||
# Dequantize: FP4 value × E8M0 scale
|
||||
bf16_weight = (unpacked * scale_expanded).to(torch.bfloat16)
|
||||
return bf16_weight
|
||||
|
||||
|
||||
def dequantize_model(model_dir: str, out_dir: str):
|
||||
os.makedirs(out_dir, exist_ok=True)
|
||||
|
||||
# Copy non-safetensor files
|
||||
print("Copying metadata files...")
|
||||
for f in os.listdir(model_dir):
|
||||
fp = os.path.join(model_dir, f)
|
||||
if not f.endswith(".safetensors") and os.path.isfile(fp):
|
||||
shutil.copy2(fp, os.path.join(out_dir, f))
|
||||
print(f" Copied {f}")
|
||||
|
||||
safetensor_files = sorted(glob.glob(os.path.join(model_dir, "*.safetensors")))
|
||||
total_shards = len(safetensor_files)
|
||||
print(f"Found {total_shards} shards")
|
||||
|
||||
# First pass: build scale-key → weight-key mapping
|
||||
print("\nScanning for weight+scale pairs...")
|
||||
scale_to_weight = {}
|
||||
for f in safetensor_files:
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
for key in sf.keys():
|
||||
if key.endswith(".scale"):
|
||||
weight_key = key[:-len(".scale")] + ".weight"
|
||||
scale_to_weight[key] = weight_key
|
||||
|
||||
weight_to_scale = {v: k for k, v in scale_to_weight.items()}
|
||||
print(f"Found {len(scale_to_weight)} weight+scale pairs")
|
||||
|
||||
# Classify weights by type (sample first 2 shards)
|
||||
fp4_weight_keys = set()
|
||||
fp8_weight_keys = set()
|
||||
scale_keys = set(scale_to_weight.keys())
|
||||
|
||||
for f in safetensor_files[:2]:
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
for key in sf.keys():
|
||||
if key in weight_to_scale:
|
||||
t = sf.get_tensor(key)
|
||||
if t.dtype == torch.int8:
|
||||
fp4_weight_keys.add(key)
|
||||
elif t.dtype == FP8_WEIGHT_DTYPE:
|
||||
fp8_weight_keys.add(key)
|
||||
|
||||
print(f" FP4 (E2M1) expert weights (packed): ~{len(fp4_weight_keys)} per shard")
|
||||
print(f" FP8 attention/shared-expert weights: ~{len(fp8_weight_keys)} per shard")
|
||||
|
||||
# Second pass: dequantize and save
|
||||
stats = {"fp4_dequantized": 0, "fp8_dequantized": 0, "scales_removed": 0, "unchanged": 0}
|
||||
start_time = time.time()
|
||||
|
||||
for i, f in enumerate(safetensor_files):
|
||||
shard_start = time.time()
|
||||
tensors = {}
|
||||
scales_in_shard = {}
|
||||
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
keys = list(sf.keys())
|
||||
|
||||
# First: collect scales
|
||||
for key in keys:
|
||||
if key in scale_keys:
|
||||
t = sf.get_tensor(key)
|
||||
scales_in_shard[key] = t
|
||||
|
||||
# Second: process weights and other tensors
|
||||
for key in keys:
|
||||
if key in scale_keys:
|
||||
continue # handled separately
|
||||
|
||||
t = sf.get_tensor(key)
|
||||
|
||||
if key in weight_to_scale and t.dtype == torch.int8:
|
||||
# FP4 (E2M1) packed expert weight (MXFP4 microscaling)
|
||||
scale_key = weight_to_scale[key]
|
||||
scale = scales_in_shard.get(scale_key)
|
||||
if scale is None:
|
||||
print(f" WARNING: scale {scale_key} not in same shard as {key}")
|
||||
tensors[key] = t # keep as-is
|
||||
continue
|
||||
bf16 = dequantize_fp4_weight(t, scale)
|
||||
tensors[key] = bf16
|
||||
stats["fp4_dequantized"] += 1
|
||||
del scales_in_shard[scale_key]
|
||||
stats["scales_removed"] += 1
|
||||
|
||||
elif key in weight_to_scale and t.dtype == FP8_WEIGHT_DTYPE:
|
||||
# FP8 weight (attention or shared expert)
|
||||
scale_key = weight_to_scale[key]
|
||||
scale = scales_in_shard.get(scale_key)
|
||||
if scale is None:
|
||||
print(f" WARNING: scale {scale_key} not in same shard as {key}")
|
||||
tensors[key] = t
|
||||
continue
|
||||
bf16 = dequantize_fp8_weight(t, scale)
|
||||
tensors[key] = bf16
|
||||
stats["fp8_dequantized"] += 1
|
||||
del scales_in_shard[scale_key]
|
||||
stats["scales_removed"] += 1
|
||||
|
||||
else:
|
||||
# Regular tensor (BF16, FP32, int64, etc.) - keep as-is
|
||||
tensors[key] = t
|
||||
stats["unchanged"] += 1
|
||||
|
||||
# Remove unused scales
|
||||
for sk in scales_in_shard:
|
||||
stats["scales_removed"] += 1
|
||||
|
||||
out_path = os.path.join(out_dir, os.path.basename(f))
|
||||
if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
|
||||
# Resume: skip already-dequantized shards
|
||||
print(f"[{i+1}/{total_shards}] Skipping (already done): {os.path.basename(f)}")
|
||||
del tensors, scales_in_shard
|
||||
continue
|
||||
save_file(tensors, out_path)
|
||||
|
||||
shard_time = time.time() - shard_start
|
||||
elapsed = time.time() - start_time
|
||||
rate = (i + 1) / elapsed if elapsed > 0 else 0
|
||||
eta = (total_shards - i - 1) / rate if rate > 0 else 0
|
||||
|
||||
print(f"[{i+1}/{total_shards}] {os.path.basename(f)} "
|
||||
f"({stats['fp4_dequantized']} fp4, {stats['fp8_dequantized']} fp8, "
|
||||
f"{stats['scales_removed']} scales rm) "
|
||||
f"[{shard_time:.1f}s, ETA: {eta/60:.0f}min]")
|
||||
|
||||
del tensors, scales_in_shard
|
||||
|
||||
# Update config
|
||||
cfg_path = os.path.join(out_dir, "config.json")
|
||||
if os.path.exists(cfg_path):
|
||||
cfg = json.load(open(cfg_path))
|
||||
cfg["torch_dtype"] = "bfloat16"
|
||||
cfg["_experts_implementation"] = "eager"
|
||||
if "quantization_config" in cfg:
|
||||
del cfg["quantization_config"]
|
||||
json.dump(cfg, open(cfg_path, "w"), indent=2)
|
||||
print(f"\nUpdated config.json: torch_dtype=bfloat16, _experts_implementation=eager")
|
||||
|
||||
total_time = time.time() - start_time
|
||||
print(f"\nDone in {total_time/60:.1f} minutes!")
|
||||
print(f" FP4 expert weights dequantized: {stats['fp4_dequantized']}")
|
||||
print(f" FP8 weights dequantized: {stats['fp8_dequantized']}")
|
||||
print(f" Scale tensors removed: {stats['scales_removed']}")
|
||||
print(f" Unchanged tensors: {stats['unchanged']}")
|
||||
|
||||
# Verify no FP8/INT8 remaining
|
||||
print("\nVerifying...")
|
||||
remaining_compressed = 0
|
||||
for f in sorted(glob.glob(os.path.join(out_dir, "*.safetensors")))[:5]:
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
for key in sf.keys():
|
||||
t = sf.get_tensor(key)
|
||||
if t.dtype in (torch.float8_e8m0fnu, torch.float8_e4m3fn, torch.int8):
|
||||
remaining_compressed += 1
|
||||
if remaining_compressed <= 5:
|
||||
print(f" REMAINING: {key} {t.dtype} {t.shape}")
|
||||
if remaining_compressed == 0:
|
||||
print(" ✅ No compressed tensors remaining — model is pure BF16!")
|
||||
else:
|
||||
print(f" ⚠️ {remaining_compressed} compressed tensors still present")
|
||||
|
||||
out_size = sum(os.path.getsize(os.path.join(out_dir, f)) for f in os.listdir(out_dir) if f.endswith(".safetensors"))
|
||||
print(f"Output size: {out_size / 1e12:.2f} TB")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Complete dequantization of DeepSeek V4 Pro to BF16")
|
||||
parser.add_argument("model_dir", help="Path to mixed-precision model")
|
||||
parser.add_argument("out_dir", help="Path to write dequantized BF16 model")
|
||||
args = parser.parse_args()
|
||||
dequantize_model(args.model_dir, args.out_dir)
|
||||
587
scripts/quantize_nvfp4.py
Normal file
587
scripts/quantize_nvfp4.py
Normal file
@@ -0,0 +1,587 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DeepSeek V4 Pro → NVFP4 quantization — defensive edition.
|
||||
|
||||
This script:
|
||||
1. Applies runtime patches for GPU tensor safety (before modelopt runs)
|
||||
2. Calls the SAME hf_ptq.py pipeline that the shell script uses
|
||||
3. After calibration, snapshots amax to CPU and saves model state
|
||||
|
||||
The key insight: we don't rewrite the pipeline. We let hf_ptq do its thing
|
||||
with all its args, defaults, and edge cases handled correctly. We just add
|
||||
our defensive patches and post-calibration saves.
|
||||
|
||||
Must be run from the modelopt example directory:
|
||||
cd /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py
|
||||
|
||||
Usage:
|
||||
# Full run (calibrate + export):
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py
|
||||
|
||||
# Re-run export only (after a calibration save exists):
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --export-only
|
||||
|
||||
# Validate saved calibration state (check amax values):
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --validate-only
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
|
||||
QUANT = "nvfp4"
|
||||
TP = 8
|
||||
CALIB_SIZE = 128
|
||||
CALIB_SEQ = 512
|
||||
KV_CACHE_QUANT = "fp8_cast"
|
||||
GPU_MEM_PCT = 0.7
|
||||
|
||||
HF_TOKEN = "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"
|
||||
|
||||
# Paths
|
||||
EXAMPLE_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
|
||||
EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
|
||||
CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"
|
||||
AMAX_SNAPSHOT_PATH = "/root/nvidia-meeting/v4_nvfp4_amax_snapshots.pt"
|
||||
|
||||
|
||||
def apply_patches():
|
||||
"""Apply runtime patches for V4 compatibility and GPU tensor safety.
|
||||
|
||||
Root cause of all export crashes: use_seq_device_map keeps model weights on GPU
|
||||
for 5+ hours during calibration. By export time, CUDA's memory allocator has
|
||||
recycled the underlying memory, so any read of those GPU tensors triggers
|
||||
cudaErrorIllegalAddress.
|
||||
|
||||
Fix strategy: patch at the EARLIEST possible entry points to force stale GPU
|
||||
tensors to CPU before any downstream code reads them. This covers the full
|
||||
chain of execution we traced through the export path:
|
||||
|
||||
_process_quantized_modules
|
||||
→ _export_quantized_weight (or _export_fused_experts)
|
||||
→ get_weight_scaling_factor
|
||||
→ get_weights_scaling_factor_from_quantizer (reads weight, _amax, global_amax)
|
||||
→ NVFP4QTensor.get_weights_scaling_factor (dynamic: reduce_block_amax on weight)
|
||||
→ get_weight_scaling_factor_2 (reads _amax, global_amax)
|
||||
→ get_activation_scaling_factor (reads _amax) [already patched]
|
||||
→ to_quantized_weight (reads weight, does .to(weight.device) on scaling factors)
|
||||
→ weight.to(dtype) (reads weight)
|
||||
|
||||
By forcing weight to CPU in Patch 4 (_export_quantized_weight), ALL downstream
|
||||
.to(weight.device) calls resolve to CPU. Patches 5-8 are belt-and-suspenders.
|
||||
"""
|
||||
|
||||
from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module
|
||||
from modelopt.torch.quantization.qtensor import nvfp4_tensor
|
||||
from modelopt.torch.export import quant_utils
|
||||
from modelopt.torch.quantization.utils import quantizer_attr_names as _quantizer_attr_names
|
||||
import modelopt.torch.export.unified_export_hf as uehf
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 1: load_calib_amax — force _amax to CPU immediately after calibration
|
||||
# This runs during calibration, right after each quantizer finishes.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_load_calib_amax = tq_module.TensorQuantizer.load_calib_amax
|
||||
|
||||
def patched_load_calib_amax(self, *args, **kwargs):
|
||||
orig_load_calib_amax(self, *args, **kwargs)
|
||||
if hasattr(self, '_amax') and self._amax is not None:
|
||||
self._amax = self._amax.cpu()
|
||||
|
||||
tq_module.TensorQuantizer.load_calib_amax = patched_load_calib_amax
|
||||
print("✓ Patch 1: TensorQuantizer.load_calib_amax → force _amax to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 2: export_amax — CPU safety net at export time
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_export_amax = tq_module.TensorQuantizer.export_amax
|
||||
|
||||
def patched_export_amax(self):
|
||||
if hasattr(self, '_amax') and self._amax is not None and self._amax.is_cuda:
|
||||
self._amax = self._amax.cpu()
|
||||
return orig_export_amax(self)
|
||||
|
||||
tq_module.TensorQuantizer.export_amax = patched_export_amax
|
||||
print("✓ Patch 2: TensorQuantizer.export_amax → CPU fallback")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 3: get_activation_scaling_factor — CPU + clamp
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
@classmethod
|
||||
def patched_get_activation_scaling_factor(cls, quantizer):
|
||||
if not quantizer.is_enabled:
|
||||
return None
|
||||
try:
|
||||
amax = quantizer.export_amax()
|
||||
except (torch.cuda.CudaError, RuntimeError) as e:
|
||||
print(f" WARNING: export_amax() failed ({e}), attempting CPU recovery...")
|
||||
if hasattr(quantizer, '_amax') and quantizer._amax is not None:
|
||||
quantizer._amax = quantizer._amax.cpu()
|
||||
amax = quantizer.export_amax()
|
||||
|
||||
if amax is None:
|
||||
return None
|
||||
amax = amax.cpu()
|
||||
activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0)
|
||||
|
||||
if not torch.all(activation_scaling_factor > 0):
|
||||
n_bad = (activation_scaling_factor <= 0).sum().item()
|
||||
n_total = activation_scaling_factor.numel()
|
||||
print(f" WARNING: {n_bad}/{n_total} activation scaling factors <= 0, clamping")
|
||||
activation_scaling_factor = activation_scaling_factor.clamp(min=torch.finfo(torch.float32).tiny)
|
||||
|
||||
return activation_scaling_factor
|
||||
|
||||
nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor = patched_get_activation_scaling_factor
|
||||
print("✓ Patch 3: NVFP4QTensor.get_activation_scaling_factor → CPU + clamp")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 4: _export_quantized_weight — THE KEY PATCH
|
||||
#
|
||||
# This is the entry point for exporting each quantized module. It reads
|
||||
# `weight = getattr(sub_module, weight_name)` which is on a stale GPU.
|
||||
# By moving weight to CPU right here, ALL downstream functions are safe:
|
||||
# - get_weight_scaling_factor: weight.device is now CPU
|
||||
# - get_weights_scaling_factor: operates on CPU weight
|
||||
# - to_quantized_weight: .to(weight.device) stays on CPU
|
||||
# - weight.to(dtype): CPU cast
|
||||
# We also force all quantizer state to CPU for the same reason.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_export_quantized_weight = uehf._export_quantized_weight
|
||||
|
||||
def patched_export_quantized_weight(sub_module, dtype, weight_name="weight"):
|
||||
# Move weight to CPU (stale GPU → safe CPU)
|
||||
weight = getattr(sub_module, weight_name, None)
|
||||
if weight is not None and isinstance(weight, torch.Tensor) and weight.is_cuda:
|
||||
try:
|
||||
weight_cpu = weight.cpu()
|
||||
with torch.no_grad():
|
||||
setattr(sub_module, weight_name, torch.nn.Parameter(weight_cpu))
|
||||
except (torch.cuda.CudaError, RuntimeError) as e:
|
||||
print(f" WARNING: weight.cpu() failed for {weight_name} ({e})")
|
||||
raise
|
||||
|
||||
# Force all quantizer state to CPU
|
||||
qattrs = _quantizer_attr_names(weight_name)
|
||||
for qattr in [qattrs.weight_quantizer, qattrs.input_quantizer, qattrs.output_quantizer]:
|
||||
if not qattr:
|
||||
continue
|
||||
quantizer = getattr(sub_module, qattr, None)
|
||||
if quantizer is None:
|
||||
continue
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(quantizer, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(quantizer, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
# Handle SequentialQuantizer (W4A8 path)
|
||||
if hasattr(quantizer, 'quantizers'):
|
||||
for sub_q in quantizer.quantizers:
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(sub_q, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(sub_q, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
|
||||
return orig_export_quantized_weight(sub_module, dtype, weight_name)
|
||||
|
||||
uehf._export_quantized_weight = patched_export_quantized_weight
|
||||
print("✓ Patch 4: _export_quantized_weight → force weight + quantizer state to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 5: _export_fused_experts — same treatment for MoE expert weights
|
||||
# DeepseekV4Experts go through this different code path.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_export_fused_experts = uehf._export_fused_experts
|
||||
|
||||
def patched_export_fused_experts(sub_module, dtype):
|
||||
# Force all expert weights to CPU
|
||||
for name, param in list(sub_module.named_parameters()):
|
||||
if isinstance(param, torch.Tensor) and param.is_cuda:
|
||||
try:
|
||||
with torch.no_grad():
|
||||
setattr(sub_module, name, torch.nn.Parameter(param.cpu()))
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
# Force all buffers to CPU
|
||||
for name, buf in list(sub_module.named_buffers()):
|
||||
if isinstance(buf, torch.Tensor) and buf.is_cuda:
|
||||
try:
|
||||
sub_module.register_buffer(name, buf.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
# Force all quantizer state to CPU
|
||||
for mod in sub_module.modules():
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(mod, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(mod, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
return orig_export_fused_experts(sub_module, dtype)
|
||||
|
||||
uehf._export_fused_experts = patched_export_fused_experts
|
||||
print("✓ Patch 5: _export_fused_experts → force expert weights + quantizer state to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 6: to_quantized_weight — force scaling factors to CPU
|
||||
# This does .to(weight.device) on scaling factors. With weight now on
|
||||
# CPU (Patch 4), this should be a no-op, but belt-and-suspenders.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_to_quantized_weight = quant_utils.to_quantized_weight
|
||||
|
||||
def patched_to_quantized_weight(weight, weights_scaling_factor, quantization,
|
||||
weights_scaling_factor2=None, block_size=None):
|
||||
if isinstance(weight, torch.Tensor) and weight.is_cuda:
|
||||
weight = weight.cpu()
|
||||
if weights_scaling_factor is not None and isinstance(weights_scaling_factor, torch.Tensor) and weights_scaling_factor.is_cuda:
|
||||
weights_scaling_factor = weights_scaling_factor.cpu()
|
||||
if weights_scaling_factor2 is not None and isinstance(weights_scaling_factor2, torch.Tensor) and weights_scaling_factor2.is_cuda:
|
||||
weights_scaling_factor2 = weights_scaling_factor2.cpu()
|
||||
return orig_to_quantized_weight(weight, weights_scaling_factor, quantization,
|
||||
weights_scaling_factor2, block_size)
|
||||
|
||||
quant_utils.to_quantized_weight = patched_to_quantized_weight
|
||||
print("✓ Patch 6: to_quantized_weight → force all tensors to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 7: get_weight_scaling_factor — force weight + quantizer to CPU
|
||||
# Belt and suspenders: Patch 4 should handle this, but this is also
|
||||
# called from other code paths.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_get_weight_scaling_factor = quant_utils.get_weight_scaling_factor
|
||||
|
||||
def patched_get_weight_scaling_factor(module, weight_name="weight"):
|
||||
weight = getattr(module, weight_name, None)
|
||||
if weight is not None and isinstance(weight, torch.Tensor) and weight.is_cuda:
|
||||
try:
|
||||
with torch.no_grad():
|
||||
setattr(module, weight_name, torch.nn.Parameter(weight.cpu()))
|
||||
except (torch.cuda.CudaError, RuntimeError) as e:
|
||||
print(f" WARNING: weight.cpu() failed in get_weight_scaling_factor ({e})")
|
||||
raise
|
||||
weight_quantizer = getattr(module, _quantizer_attr_names(weight_name).weight_quantizer, None)
|
||||
if weight_quantizer is not None:
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(weight_quantizer, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(weight_quantizer, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
return orig_get_weight_scaling_factor(module, weight_name)
|
||||
|
||||
quant_utils.get_weight_scaling_factor = patched_get_weight_scaling_factor
|
||||
print("✓ Patch 7: get_weight_scaling_factor → force weight + quantizer to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 8: get_weight_scaling_factor_2 — force quantizer to CPU
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_get_weight_scaling_factor_2 = quant_utils.get_weight_scaling_factor_2
|
||||
|
||||
def patched_get_weight_scaling_factor_2(module, weight_name="weight"):
|
||||
weight_quantizer = getattr(module, _quantizer_attr_names(weight_name).weight_quantizer, None)
|
||||
if weight_quantizer is not None:
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(weight_quantizer, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(weight_quantizer, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
return orig_get_weight_scaling_factor_2(module, weight_name)
|
||||
|
||||
quant_utils.get_weight_scaling_factor_2 = patched_get_weight_scaling_factor_2
|
||||
print("✓ Patch 8: get_weight_scaling_factor_2 → force quantizer to CPU")
|
||||
|
||||
|
||||
def snapshot_amax_to_cpu(model, snapshot_path):
|
||||
"""Walk all quantizers, copy _amax to CPU, save to disk."""
|
||||
from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
|
||||
|
||||
print(f"\nSnapshotting quantizer _amax to CPU...")
|
||||
t0 = time.time()
|
||||
snapshots = {}
|
||||
n_moved = 0
|
||||
|
||||
for name, module in model.named_modules():
|
||||
if not isinstance(module, TensorQuantizer):
|
||||
continue
|
||||
if hasattr(module, '_amax') and module._amax is not None:
|
||||
amax_cpu = module._amax.detach().cpu().clone()
|
||||
snapshots[name] = amax_cpu
|
||||
module._amax.data.copy_(amax_cpu)
|
||||
n_moved += 1
|
||||
|
||||
torch.save(snapshots, snapshot_path)
|
||||
size_mb = os.path.getsize(snapshot_path) / (1024**2)
|
||||
print(f"✓ Snapshotted {n_moved} quantizer _amax tensors to CPU ({time.time()-t0:.1f}s)")
|
||||
print(f" Saved to: {snapshot_path} ({size_mb:.1f} MB)")
|
||||
return snapshots
|
||||
|
||||
|
||||
def restore_amax_from_snapshot(model, snapshot_path):
|
||||
"""Restore _amax from a previously saved CPU snapshot."""
|
||||
from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
|
||||
|
||||
print(f"Restoring _amax from snapshot: {snapshot_path}")
|
||||
snapshots = torch.load(snapshot_path, map_location='cpu')
|
||||
n_restored = 0
|
||||
|
||||
for name, module in model.named_modules():
|
||||
if not isinstance(module, TensorQuantizer):
|
||||
continue
|
||||
if name in snapshots and hasattr(module, '_amax'):
|
||||
module._amax.data.copy_(snapshots[name].to(module._amax.device))
|
||||
n_restored += 1
|
||||
|
||||
print(f"✓ Restored {n_restored} _amax tensors from snapshot")
|
||||
|
||||
|
||||
def force_all_amax_to_cpu(model):
|
||||
"""Force ALL quantizer tensors to CPU."""
|
||||
from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
|
||||
|
||||
count = 0
|
||||
for name, module in model.named_modules():
|
||||
if not isinstance(module, TensorQuantizer):
|
||||
continue
|
||||
for attr in ['_amax', '_pre_quant_scale', '_global_amax']:
|
||||
if hasattr(module, attr):
|
||||
val = getattr(module, attr)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
setattr(module, attr, val.cpu())
|
||||
count += 1
|
||||
print(f"✓ Forced {count} quantizer tensors to CPU")
|
||||
|
||||
|
||||
def save_calibrated_state(model, path):
|
||||
"""Save model state dict after calibration."""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"SAVING CALIBRATED STATE → {path}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
start = time.time()
|
||||
state = {
|
||||
'model_state_dict': model.state_dict(),
|
||||
'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
}
|
||||
torch.save(state, path)
|
||||
size_gb = os.path.getsize(path) / (1024**3)
|
||||
print(f"✓ Saved calibrated state: {size_gb:.1f} GB ({time.time()-start:.0f}s)")
|
||||
print(f" Path: {path}")
|
||||
print(f" Re-run with --export-only to retry export.\n")
|
||||
|
||||
|
||||
def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, calib_size, calib_seq):
|
||||
"""Full pipeline: parse args via hf_ptq → load → quantize → snapshot → save → export."""
|
||||
|
||||
os.chdir(EXAMPLE_DIR)
|
||||
sys.path.insert(0, EXAMPLE_DIR)
|
||||
|
||||
os.environ["HF_TOKEN"] = HF_TOKEN
|
||||
os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
|
||||
|
||||
from hf_ptq import parse_args, main as hf_main
|
||||
|
||||
apply_patches()
|
||||
|
||||
# ── Build args using hf_ptq's own parser ──
|
||||
# This guarantees ALL attributes exist with correct defaults.
|
||||
# We temporarily replace sys.argv so parse_args() sees our config.
|
||||
saved_argv = sys.argv
|
||||
sys.argv = [
|
||||
"hf_ptq.py",
|
||||
"--pyt_ckpt_path", model_path,
|
||||
"--qformat", QUANT,
|
||||
"--calib_size", str(calib_size),
|
||||
"--calib_seq", str(calib_seq),
|
||||
"--kv_cache_qformat", KV_CACHE_QUANT,
|
||||
"--inference_tensor_parallel", str(TP),
|
||||
"--export_path", export_dir,
|
||||
"--trust_remote_code",
|
||||
"--use_seq_device_map",
|
||||
"--gpu_max_mem_percentage", str(GPU_MEM_PCT),
|
||||
"--batch_size", "0",
|
||||
]
|
||||
args = parse_args()
|
||||
sys.argv = saved_argv
|
||||
|
||||
# Apply the same post-parse conversions that hf_ptq's __main__ block does
|
||||
# (these normally run between parse_args() and main() in the original script,
|
||||
# but since we call main() directly, we have to do them ourselves)
|
||||
args.dataset = args.dataset.split(",") if isinstance(args.dataset, str) else args.dataset
|
||||
args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
|
||||
|
||||
# ── Post-calibration hook ──
|
||||
# We monkey-patch export_quantized to add our defensive saves before export.
|
||||
import hf_ptq
|
||||
|
||||
orig_export_quantized = hf_ptq.export_quantized
|
||||
|
||||
def patched_export_quantized(exp_args, full_model, language_model, model_type,
|
||||
tokenizer, default_padding_side, default_pad_token):
|
||||
"""Wrapper that snapshots amax and saves state before calling the real export."""
|
||||
print("\n" + "="*60)
|
||||
print("POST-CALIBRATION: Snapshotting amax and saving state")
|
||||
print("="*60)
|
||||
|
||||
# Snapshot amax to CPU
|
||||
snapshot_amax_to_cpu(language_model, amax_snapshot_path)
|
||||
|
||||
# Force all quantizer state to CPU
|
||||
force_all_amax_to_cpu(language_model)
|
||||
|
||||
# Free GPU memory
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
# Save calibrated state
|
||||
save_calibrated_state(language_model, calib_save_path)
|
||||
|
||||
# Now run the real export
|
||||
orig_export_quantized(exp_args, full_model, language_model, model_type,
|
||||
tokenizer, default_padding_side, default_pad_token)
|
||||
|
||||
hf_ptq.export_quantized = patched_export_quantized
|
||||
print("✓ Hooked export_quantized with amax snapshot + state save")
|
||||
|
||||
# ── Run hf_ptq's full pipeline ──
|
||||
# This handles model loading, quantization, calibration, and export
|
||||
# using the exact same code path as the shell script.
|
||||
hf_main(args)
|
||||
|
||||
|
||||
def run_export_only(calib_save_path, amax_snapshot_path, model_path, export_dir):
|
||||
"""Load saved calibration state and run export only."""
|
||||
|
||||
os.chdir(EXAMPLE_DIR)
|
||||
sys.path.insert(0, EXAMPLE_DIR)
|
||||
|
||||
os.environ["HF_TOKEN"] = HF_TOKEN
|
||||
os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
|
||||
|
||||
apply_patches()
|
||||
|
||||
from example_utils import get_model, get_tokenizer
|
||||
|
||||
print(f"Loading model from {model_path}...")
|
||||
model = get_model(
|
||||
model_path,
|
||||
device="cpu",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
tokenizer = get_tokenizer(model_path, trust_remote_code=True)
|
||||
|
||||
print(f"Loading calibrated state from {calib_save_path}...")
|
||||
state = torch.load(calib_save_path, map_location='cpu')
|
||||
model.load_state_dict(state['model_state_dict'])
|
||||
print(f"✓ Loaded calibrated state (saved at {state['timestamp']})")
|
||||
|
||||
force_all_amax_to_cpu(model)
|
||||
if amax_snapshot_path and os.path.exists(amax_snapshot_path):
|
||||
restore_amax_from_snapshot(model, amax_snapshot_path)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
from modelopt.torch.export import export_hf_checkpoint
|
||||
from hf_ptq import load_mtp_weights, copy_custom_model_files
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"EXPORTING → {export_dir}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
t0 = time.time()
|
||||
try:
|
||||
mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(model, model_path)
|
||||
if mtp_layer_prefixes:
|
||||
model._mtp_layer_prefixes = mtp_layer_prefixes
|
||||
|
||||
export_hf_checkpoint(model, export_dir=export_dir, extra_state_dict=mtp_state_dict)
|
||||
tokenizer.save_pretrained(export_dir)
|
||||
copy_custom_model_files(model_path, export_dir, True)
|
||||
print(f"\n✓ Export complete in {time.time()-t0:.0f}s → {export_dir}")
|
||||
except Exception as e:
|
||||
print(f"\n✗ EXPORT FAILED: {e}")
|
||||
print(f" Calibrated state: {CALIB_SAVE_PATH}")
|
||||
print(f" Amax snapshots: {AMAX_SNAPSHOT_PATH}")
|
||||
raise
|
||||
|
||||
|
||||
def run_validate(calib_save_path, amax_snapshot_path):
|
||||
"""Validate saved calibration state — check amax values are valid."""
|
||||
print(f"\nValidating calibration state...")
|
||||
|
||||
if os.path.exists(amax_snapshot_path):
|
||||
snapshots = torch.load(amax_snapshot_path, map_location='cpu')
|
||||
n_total = len(snapshots)
|
||||
n_valid = n_zero = n_nan = n_neg = 0
|
||||
|
||||
for name, amax in snapshots.items():
|
||||
if torch.any(torch.isnan(amax)):
|
||||
n_nan += 1
|
||||
elif torch.any(amax < 0):
|
||||
n_neg += 1
|
||||
elif torch.all(amax == 0):
|
||||
n_zero += 1
|
||||
else:
|
||||
n_valid += 1
|
||||
|
||||
print(f"\nAmax snapshot validation:")
|
||||
print(f" Total: {n_total} Valid: {n_valid} Zero: {n_zero} Neg: {n_neg} NaN: {n_nan}")
|
||||
if n_valid == n_total:
|
||||
print(f"\n✓ All {n_total} amax snapshots are valid!")
|
||||
else:
|
||||
print(f"\n✗ {n_total - n_valid} quantizers have invalid amax!")
|
||||
else:
|
||||
print(f"✗ No amax snapshot found at {amax_snapshot_path}")
|
||||
|
||||
if os.path.exists(calib_save_path):
|
||||
size_gb = os.path.getsize(calib_save_path) / (1024**3)
|
||||
print(f"\nCalibrated state: {calib_save_path} ({size_gb:.1f} GB)")
|
||||
else:
|
||||
print(f"\n✗ No calibrated state found at {calib_save_path}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="DeepSeek V4 Pro NVFP4 Quantization")
|
||||
parser.add_argument("--export-only", action="store_true",
|
||||
help="Skip calibration, load saved state and run export only")
|
||||
parser.add_argument("--validate-only", action="store_true",
|
||||
help="Validate saved calibration state without running anything")
|
||||
parser.add_argument("--model", default=MODEL, help="Path to BF16 model")
|
||||
parser.add_argument("--export-dir", default=EXPORT_DIR, help="Export output directory")
|
||||
parser.add_argument("--calib-save", default=CALIB_SAVE_PATH, help="Calibration state save path")
|
||||
parser.add_argument("--amax-snapshot", default=AMAX_SNAPSHOT_PATH, help="Amax snapshot path")
|
||||
parser.add_argument("--calib-size", type=int, default=CALIB_SIZE, help="Calibration samples")
|
||||
parser.add_argument("--calib-seq", type=int, default=CALIB_SEQ, help="Calibration sequence length")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.validate_only:
|
||||
run_validate(args.calib_save, args.amax_snapshot)
|
||||
elif args.export_only:
|
||||
if not os.path.exists(args.calib_save):
|
||||
print(f"ERROR: No calibration state found at {args.calib_save}")
|
||||
sys.exit(1)
|
||||
run_export_only(args.calib_save, args.amax_snapshot, args.model, args.export_dir)
|
||||
else:
|
||||
run_calibration(args.model, args.export_dir, args.calib_save,
|
||||
args.amax_snapshot, args.calib_size, args.calib_seq)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
100
scripts/serve_vllm.py
Normal file
100
scripts/serve_vllm.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DeepSeek V4 Pro NVFP4 — vLLM OpenAI-compatible server.
|
||||
|
||||
Run from the venv on the B200 node:
|
||||
source /root/nvidia-meeting/venv/bin/activate
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py
|
||||
|
||||
Or in the background:
|
||||
nohup python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py \
|
||||
> /root/nvidia-meeting/vllm_serve.log 2>&1 &
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# ── Patch: Add compress_ratios to DeepseekV4Config ──────────────────────────
|
||||
# transformers 5.8.0 renamed compress_ratios → compress_rates (dict format).
|
||||
# vllm 0.20.2 still expects compress_ratios as a list indexed by layer_id.
|
||||
# We patch the Config class to expose compress_ratios as a property that
|
||||
# converts the new dict format back to the list format vllm expects.
|
||||
import transformers
|
||||
try:
|
||||
from transformers import DeepseekV4Config
|
||||
|
||||
_orig_init = DeepseekV4Config.__init__
|
||||
|
||||
def _patched_init(self, *args, **kwargs):
|
||||
_orig_init(self, *args, **kwargs)
|
||||
# If compress_ratios already exists as a list, leave it alone
|
||||
if hasattr(self, 'compress_ratios') and isinstance(self.compress_ratios, list):
|
||||
return
|
||||
# Convert compress_rates dict → compress_ratios list
|
||||
if hasattr(self, 'compress_rates') and isinstance(self.compress_rates, dict):
|
||||
rates = self.compress_rates
|
||||
# Build per-layer list from the dict schema
|
||||
# V4 pattern: layers 0-1=128, then alternating 4/128, last=0
|
||||
n_layers = getattr(self, 'num_hidden_layers', 61)
|
||||
cr = rates.get('compressed_sparse_attention', 4)
|
||||
hr = rates.get('heavily_compressed_attention', 128)
|
||||
ratios = []
|
||||
for i in range(n_layers):
|
||||
if i < 2:
|
||||
ratios.append(hr)
|
||||
elif i == n_layers - 1:
|
||||
ratios.append(0)
|
||||
else:
|
||||
ratios.append(cr if i % 2 == 0 else hr)
|
||||
self.compress_ratios = ratios
|
||||
elif hasattr(self, 'compress_rates') and isinstance(self.compress_rates, list):
|
||||
self.compress_ratios = self.compress_rates
|
||||
|
||||
DeepseekV4Config.__init__ = _patched_init
|
||||
print("✓ Patched DeepseekV4Config.__init__ to add compress_ratios")
|
||||
except ImportError:
|
||||
print("⚠ DeepseekV4Config not found, skipping compress_ratios patch")
|
||||
|
||||
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
|
||||
|
||||
# These flags are critical for V4 — do not change without understanding why:
|
||||
# --trust-remote-code V4 needs custom modeling code
|
||||
# --kv-cache-dtype fp8 Match our kv_cache_qformat=fp8_cast quantization
|
||||
# --block-size 256 V4 recommended block size
|
||||
# --enable-expert-parallel Distribute expert computation across GPUs (critical for 256-expert MoE)
|
||||
# --tensor-parallel-size 8 8× B200
|
||||
# --compilation-config CUDA graphs for throughput — FULL_AND_PIECEWISE + all custom ops
|
||||
# --attention_config FP4 indexer cache for V4 MLA attention
|
||||
# --moe-backend deep_gemm_mega_moe — optimized MoE kernel for Blackwell
|
||||
# --tokenizer-mode deepseek_v4 — V4-specific tokenizer
|
||||
# --tool-call-parser deepseek_v4 — native tool calling
|
||||
# --enable-auto-tool-choice Auto tool choice for function calling
|
||||
# --reasoning-parser deepseek_v4 — reasoning/thinking output parsing
|
||||
# --speculative_config MTP speculative decoding (2 speculative tokens)
|
||||
|
||||
cmd = [
|
||||
sys.executable, "-m", "vllm.entrypoints.openai.api_server",
|
||||
"--model", MODEL,
|
||||
"--trust-remote-code",
|
||||
"--kv-cache-dtype", "fp8",
|
||||
"--block-size", "256",
|
||||
"--enable-expert-parallel",
|
||||
"--tensor-parallel-size", "8",
|
||||
"--compilation-config", '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}',
|
||||
"--attention_config.use_fp4_indexer_cache=True",
|
||||
"--moe-backend", "deep_gemm_mega_moe", # WARN: No NVFP4 mega_moe kernel. Use docker-compose (omits this flag) instead.
|
||||
"--tokenizer-mode", "deepseek_v4",
|
||||
"--tool-call-parser", "deepseek_v4",
|
||||
"--enable-auto-tool-choice",
|
||||
"--reasoning-parser", "deepseek_v4",
|
||||
"--speculative_config", '{"method":"mtp","num_speculative_tokens":2}',
|
||||
"--host", "0.0.0.0",
|
||||
"--port", "8000",
|
||||
]
|
||||
|
||||
print(f"Starting vLLM server for {MODEL}")
|
||||
print(f"Command: {' '.join(cmd)}")
|
||||
print(f"Log: /root/nvidia-meeting/vllm_serve.log")
|
||||
print()
|
||||
|
||||
sys.exit(subprocess.call(cmd))
|
||||
Reference in New Issue
Block a user