Compare commits
69 Commits
master
...
modelopt-n
| Author | SHA1 | Date | |
|---|---|---|---|
| 0c77a88757 | |||
| f2656dcf6d | |||
| 7e2f219259 | |||
| cf54b4755a | |||
| 7febeaeb71 | |||
| 26aaaba4a2 | |||
| 67f9086a26 | |||
| 02b8ea536f | |||
| 653e2d7a50 | |||
| db16be8e5d | |||
| 6fd03a0aa0 | |||
| d88793dee6 | |||
| 30608e3834 | |||
| 0d74b97fb2 | |||
| f65d4ab99f | |||
| eb80bd6f80 | |||
| 07cd50e823 | |||
| efc111a11f | |||
| ce9056d259 | |||
| 5a72da7193 | |||
| 8612914169 | |||
| a300302486 | |||
| 1a36a655ea | |||
| b2849a8944 | |||
| a70593d886 | |||
| 25b4d8da06 | |||
| d1e15178b2 | |||
| 6c1bff6997 | |||
| 86dd8df302 | |||
| 99f861f48a | |||
| f9bbef8e91 | |||
| 94179ed9d0 | |||
| 03c10ab3b6 | |||
| 9438af5a8c | |||
| d7593fc1dd | |||
| 6eaba26914 | |||
| 3907838409 | |||
| 382c1d872f | |||
| 9291165ba0 | |||
| a0bacb3cf6 | |||
| 04304fdae6 | |||
| 50348989b2 | |||
| 24e3b3745d | |||
| b08afea425 | |||
| a2370006f7 | |||
| f1d21900ea | |||
| ca9a4f5eaa | |||
| eeba101cc4 | |||
| 075da675dc | |||
| 36e1342270 | |||
| 3d38e1d5cd | |||
| d0fc5338fe | |||
| b70a04696e | |||
| f63eed5cfd | |||
| f8533197f2 | |||
| b5d569218c | |||
| db6beb5b76 | |||
| cbfc5a9afb | |||
| b5d14aa8b8 | |||
| 6008cf128d | |||
| a7664aee7d | |||
| 7a3b81e833 | |||
| ef89ceffbd | |||
| 116933dcf6 | |||
| b8bdd00d19 | |||
| 717151b98c | |||
| aff12c6951 | |||
| 492e44c0f6 | |||
| b32bb2e84d |
7
.env
Normal file
7
.env
Normal file
@@ -0,0 +1,7 @@
|
||||
B200 Node
|
||||
IP: `45.76.247.107`
|
||||
user: `root`
|
||||
password: `6)Jr)B@dcX[mN?dx`
|
||||
folder with the weights: `/root/nvidia-meeting`
|
||||
|
||||
This repo: `https://sweetapi.com/biondizzle/deepseek-v4-quant.git` (always use this http to pull since its a public repo)
|
||||
11
.gitignore
vendored
11
.gitignore
vendored
@@ -1 +1,10 @@
|
||||
.env
|
||||
# Dequantized BF16 weights (3TB)
|
||||
DeepSeek-V4-Pro-BF16/
|
||||
|
||||
# Calibration state (huge, not for git)
|
||||
*.pt
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.venv/
|
||||
|
||||
54
Dockerfile
Normal file
54
Dockerfile
Normal file
@@ -0,0 +1,54 @@
|
||||
# DeepSeek V4 NVFP4 vLLM + CUTLASS NVFP4 Mega MoE Kernel
|
||||
FROM vllm/vllm-openai:nightly-x86_64
|
||||
|
||||
# Remove broken nixl_ep (built against CUDA 12, image is CUDA 13)
|
||||
RUN pip uninstall -y nixl-ep; rm -rf /usr/local/lib/python3.12/dist-packages/nixl_ep
|
||||
|
||||
RUN apt-get update && apt-get install -y git screen cmake libcusolver-dev-13-0 libcusparse-dev-13-0 libcublas-dev-13-0 libcurand-dev-13-0 libcufft-dev-13-0 libnvjitlink-dev-13-0 && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Remove the broken symlink if it exists
|
||||
RUN rm -f /usr/local/cuda/lib64/libcudart.so.12
|
||||
|
||||
ENV CUDA_HOME=/usr/local/cuda
|
||||
ENV TORCH_CUDA_ARCH_LIST="10.0"
|
||||
|
||||
# Clone latest CUTLASS (has NVFP4 block-scaled MMA support)
|
||||
ARG CUTLASS_CACHE_BUSTER=1
|
||||
RUN git clone --depth 1 https://github.com/NVIDIA/cutlass.git /root/cutlass
|
||||
|
||||
# Clone our NVFP4 mega_moe kernel
|
||||
ARG KERNEL_CACHE_BUSTER=40
|
||||
RUN git clone https://sweetapi.com/biondizzle/nvfp4-megamoe-kernel.git /root/nvfp4-megamoe-kernel && \
|
||||
cd /root/nvfp4-megamoe-kernel && \
|
||||
pip install -e .
|
||||
|
||||
# Build the CUTLASS NVFP4 block-scaled GEMM extension
|
||||
RUN cd /root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm && \
|
||||
mkdir -p cutlass_nvfp4_gemm && \
|
||||
CUTLASS_INCLUDE_DIR=/root/cutlass/include \
|
||||
TORCH_CUDA_ARCH_LIST=10.0 \
|
||||
python3 setup.py build_ext --inplace
|
||||
|
||||
# Install TileLang (for potential future use)
|
||||
RUN pip install tilelang
|
||||
|
||||
ENV PYTHONPATH="/root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm:/root/nvfp4-megamoe-kernel:${PYTHONPATH}"
|
||||
|
||||
# Copy patches
|
||||
ARG PATCH_CACHE_BUSTER=82
|
||||
COPY patches/deepseek_v4.py /tmp/patches/deepseek_v4.py
|
||||
COPY patches/staging_kernel.py /tmp/patches/staging_kernel.py
|
||||
COPY patches/deepseek_v4_attention.py /tmp/patches/deepseek_v4_attention.py
|
||||
|
||||
# Apply patches
|
||||
RUN VLLM_MODELS_DIR=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \
|
||||
VLLM_LAYERS_DIR=$(python3 -c "import vllm.model_executor.layers; import os; print(os.path.dirname(vllm.model_executor.layers.__file__))") && \
|
||||
cp /tmp/patches/deepseek_v4.py "$VLLM_MODELS_DIR/deepseek_v4.py" && \
|
||||
cp /tmp/patches/staging_kernel.py "$VLLM_MODELS_DIR/staging_kernel.py" && \
|
||||
cp /tmp/patches/deepseek_v4_attention.py "$VLLM_LAYERS_DIR/deepseek_v4_attention.py" && \
|
||||
rm -rf /tmp/patches
|
||||
|
||||
# Verify
|
||||
RUN python3 -c "import torch; import cutlass_nvfp4_gemm._C; print('CUTLASS NVFP4 OK')" && \
|
||||
python3 -c "import vllm; print('vLLM OK')" && \
|
||||
python3 -c "import nvfp4_megamoe_kernel; print('NVFP4 kernel OK')"
|
||||
29
MEMORY.md
Normal file
29
MEMORY.md
Normal file
@@ -0,0 +1,29 @@
|
||||
# MEMORY.md — Long-Term Memory
|
||||
|
||||
## Mike
|
||||
- Working on DeepSeek V4 Pro NVFP4 quantization + vLLM serving on B200 node
|
||||
- B200 node: 45.76.247.107, root, password in project .env
|
||||
- Repo: https://sweetapi.com/biondizzle/deepseek-v4-quant.git (modelopt-nvfp4 branch)
|
||||
|
||||
## DeepSeek V4 NVFP4 Project
|
||||
- Successfully quantized: 881GB NVFP4 (Run 11), 8× B200, $161/run
|
||||
- modelopt 0.45.0.dev64 + transformers 5.8.0.dev0
|
||||
- **vLLM server running on B200 port 8000** as of May 11, 2026 🎉
|
||||
- We built the entire NVFP4→vLLM bridge from scratch (NVIDIA hasn't done this)
|
||||
- Abandoned mega_moe (no kernel, format mismatch), using standard FusedMoE instead
|
||||
|
||||
### Key Technical Decisions
|
||||
- **wo_a**: NVFP4→BF16→FP8 with DeepGEMM block-scale format for BMM einsum
|
||||
- **Attention layers**: NVFP4→BF16 dequantization, UnquantizedLinearMethod
|
||||
- **Compressor**: Reconstructed fused_wkv_wgate from separate kv_proj+gate_proj in checkpoint
|
||||
- **MoE experts**: Stay NVFP4, use FLASHINFER_TRTLLM FusedMoE backend
|
||||
|
||||
### Critical Bugs Fixed (May 11)
|
||||
1. DeepGEMM `sf.dim()` crash: weight_scale_inv must be DeepGEMM-formatted block scale tensor
|
||||
2. Compressor indexer shape mismatch: checkpoint keys have `.indexer.` sub-path
|
||||
3. All-ones block scale → garbage output: must use `torch.full(..., fp8_scale)` not `torch.ones`
|
||||
4. Block scale dtype: must be float32, not float8_e4m3fn
|
||||
|
||||
### Outstanding
|
||||
- Output quality under investigation — FP4 is aggressive quantization
|
||||
- All code in patches/deepseek_v4.py on modelopt-nvfp4 branch
|
||||
328
README.md
328
README.md
@@ -1,106 +1,258 @@
|
||||
# DeepSeek V4 Pro → NVFP4 conversion kit
|
||||
# DeepSeek V4 Pro → NVFP4 Quantization + vLLM Serving
|
||||
|
||||
Two paths for converting `sgl-project/DeepSeek-V4-Pro-FP8` (the uniform-FP8 repackage of the original mixed-precision V4 Pro) into NVFP4 for Blackwell inference.
|
||||
Full NVFP4 quantization of DeepSeek V4 Pro and vLLM serving on 8× NVIDIA B200 GPUs.
|
||||
|
||||
## What's here
|
||||
## Quick Status
|
||||
|
||||
| Component | Status |
|
||||
|-----------|--------|
|
||||
| NVFP4 Quantization | ✅ 881GB (Run 11), modelopt 0.45.0.dev64 |
|
||||
| Weight Loading | ✅ 95 safetensors shards, all 8 TP ranks |
|
||||
| NVFP4→FP8 Conversion (wo_a) | ✅ DeepGEMM block-scale format |
|
||||
| NVFP4→BF16 Dequantization | ✅ 305 attn/shared, 91 compressor layers |
|
||||
| Compressor Reconstruction | ✅ Separate kv_proj/gate_proj → fused_wkv_wgate |
|
||||
| MoE Expert Serving | ✅ FusedMoE NVFP4 (FLASHINFER_TRTLLM backend) |
|
||||
| Profile/Warmup Run | ✅ Passes |
|
||||
| API Server | ✅ Running on port 8000 |
|
||||
| Output Quality | 🔧 Garbled — likely remaining dequant/scale bug |
|
||||
|
||||
## B200 Node
|
||||
|
||||
- **IP**: `45.76.247.107`
|
||||
- **User**: `root`
|
||||
- **Password**: see `.env`
|
||||
- **GPUs**: 8× NVIDIA B200 (SM100)
|
||||
- **RAM**: ~2.7 TB
|
||||
- **Model weights**: `/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4/`
|
||||
- **BF16 reference**: `/root/nvidia-meeting/DeepSeek-V4-Pro-BF16/`
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
DeepSeek V4 Pro (1.2T params, 61 layers)
|
||||
├── MLA Attention (61 layers)
|
||||
│ ├── fused_wqa_wkv → BF16 (UnquantizedLinearMethod)
|
||||
│ ├── wo_a → FP8 (DeepGEMM block-scale, BMM einsum)
|
||||
│ ├── wo_b → BF16 (UnquantizedLinearMethod)
|
||||
│ └── compressor.fused_wkv_wgate → BF16 (reconstructed from NVFP4)
|
||||
├── MoE Experts (384 experts, 61 layers)
|
||||
│ ├── w13_weight → NVFP4 (FusedMoE, FLASHINFER_TRTLLM backend)
|
||||
│ └── w2_weight → NVFP4 (FusedMoE, FLASHINFER_TRTLLM backend)
|
||||
└── Shared Expert → FP8 (Fp8LinearMethod, DeepGEMM)
|
||||
```
|
||||
|
||||
## The NVFP4 → vLLM Gap
|
||||
|
||||
ModelOpt quantizes to NVFP4 (4-bit FP4 with block scales). vLLM's DeepSeek V4
|
||||
attention code expects FP8 with DeepGEMM block-scale einsum. These formats were
|
||||
**never integrated** — we're ahead of NVIDIA on this. Key gaps we had to bridge:
|
||||
|
||||
### 1. wo_a: NVFP4 → FP8 + DeepGEMM Block Scale
|
||||
|
||||
**Problem**: `wo_a` uses `deepseek_v4_fp8_einsum` (BMM with DeepGEMM), which expects:
|
||||
- Weight: `float8_e4m3fn` in 3D shape `(g, r, d)` for batched matmul
|
||||
- Scale: DeepGEMM-formatted block scale tensor (not a per-tensor scalar)
|
||||
|
||||
Our NVFP4 weights are uint8 packed FP4 with separate block/global scales.
|
||||
|
||||
**Solution** (`_convert_nvfp4_to_fp8`):
|
||||
1. Unpack NVFP4 uint8 → BF16 using E2M1 lookup table
|
||||
2. Dequantize: `weight_bf16 * block_scale * global_scale` (NO input_scale — it's for activations)
|
||||
3. Re-quantize BF16 → FP8 e4m3 with per-tensor scale (`w_amax / fp8_max`)
|
||||
4. Create block scale tensor filled with `fp8_scale` (same scale for every 128×128 block)
|
||||
5. Call `deepgemm_post_process_fp8_weight_block(wq, ws, quant_block_shape=(128,128), use_e8m0=True, is_bmm=True, bmm_batch_size=N)`
|
||||
6. Store: `weight_scale_inv = dg_ws` (DeepGEMM-formatted scale), `weight = w_fp8` (3D BMM shape)
|
||||
|
||||
**Why `weight_scale_inv`?** The attention forward reads `self.wo_a.weight_scale_inv` as
|
||||
`b_scale` for `deepseek_v4_fp8_einsum` → DeepGEMM `fp8_einsum`. This must be the
|
||||
DeepGEMM block-scale tensor, not a per-tensor scalar.
|
||||
|
||||
**Why `fp8_scale` in the block scale (not all-ones)?** DeepGEMM divides by the block
|
||||
scale at runtime. If the block scale is all-ones, it divides by 1.0, producing garbage.
|
||||
Each block needs the actual per-tensor scale value.
|
||||
|
||||
### 2. Attention Layers: NVFP4 → BF16
|
||||
|
||||
**Problem**: `fused_wqa_wkv`, `wo_b` use standard `torch.nn.functional.linear`.
|
||||
NVFP4 weights (uint8) can't be used directly.
|
||||
|
||||
**Solution** (`_convert_nvfp4_to_bf16`):
|
||||
1. Unpack NVFP4 → BF16
|
||||
2. Dequantize with block/global scales (input_scale is for activations, not weights)
|
||||
3. Replace `mod.weight` with BF16 parameter
|
||||
4. Set `quant_method = UnquantizedLinearMethod()`
|
||||
5. Remove NVFP4 scale attributes (`weight_scale`, `weight_scale_2`, `input_scale`)
|
||||
|
||||
### 3. Compressor: Reconstructing fused_wkv_wgate from NVFP4
|
||||
|
||||
**Problem**: The compressor's `fused_wkv_wgate` is a `MergedColumnParallelLinear`
|
||||
with `disable_tp=True`. NVFP4 uint8 data can't be loaded into the BF16 parameter
|
||||
(shape mismatch: uint8 is half the input dim). The default weight loader silently
|
||||
skips these weights, leaving the parameter uninitialized.
|
||||
|
||||
**Solution** (`_reconstruct_compressor_weight`):
|
||||
1. Read original `kv_proj.weight` and `gate_proj.weight` directly from safetensors
|
||||
2. Unpack NVFP4 → BF16, dequantize with scales
|
||||
3. Concatenate: `fused = cat([wkv, wgate], dim=0)`
|
||||
4. Replace the uninitialized parameter
|
||||
|
||||
**Critical detail**: The **indexer** compressor is at a different checkpoint path:
|
||||
- Main: `model.layers.N.self_attn.compressor.{kv_proj,gate_proj}.weight`
|
||||
- Indexer: `model.layers.N.self_attn.compressor.indexer.{kv_proj,gate_proj}.weight`
|
||||
|
||||
Using the wrong prefix loads the main compressor weight into the indexer's
|
||||
`fused_wkv_wgate`, causing a 4× shape mismatch and `split_with_sizes` crash.
|
||||
|
||||
### 4. MoE Experts: NVFP4 FusedMoE
|
||||
|
||||
**Problem**: vLLM's DeepSeek V4 uses `DeepseekV4MegaMoEExperts` with DeepGEMM
|
||||
grouped GEMM. NVFP4 experts need a different kernel path.
|
||||
|
||||
**Solution**: The existing `ModelOptNvFp4LinearMethod` + `FusedMoE` infrastructure
|
||||
handles NVFP4 experts natively. We just need to:
|
||||
- Keep expert weights as NVFP4 uint8 + block/global scales
|
||||
- Use `FLASHINFER_TRTLLM` MoE backend (auto-selected)
|
||||
- Skip any conversion in `process_weights_after_loading`
|
||||
|
||||
### 5. BF16 wo_a Layers: BF16 → FP8
|
||||
|
||||
**Problem**: Some `wo_a` layers were NOT quantized by modelopt (BF16 in checkpoint).
|
||||
The attention forward still reads them as FP8 for the einsum path.
|
||||
|
||||
**Solution** (`_convert_bf16_to_fp8`): Same as #1 but skip the NVFP4 unpack step.
|
||||
Directly quantize BF16 → FP8 with block scale.
|
||||
|
||||
## Bugs Found and Fixed
|
||||
|
||||
### DeepGEMM `sf.dim()` Assertion (layout.hpp:94)
|
||||
- **Root cause**: `weight_scale_inv` was a 1D per-tensor scale `(g,)`. DeepGEMM expects
|
||||
2D/3D block-scale tensor formatted by `transform_sf_into_required_layout`.
|
||||
- **Fix**: Use `deepgemm_post_process_fp8_weight_block` to produce correctly formatted
|
||||
block scales, store result in `weight_scale_inv`.
|
||||
|
||||
### Block Scale dtype (`float8_e4m3fn` vs `float32`)
|
||||
- **Root cause**: `deepgemm_post_process_fp8_weight_block` expects `float32` or
|
||||
`float8_e8m0fnu` block scales. We initially used `float8_e4m3fn`.
|
||||
- **Fix**: Create block scale as `dtype=torch.float32`.
|
||||
|
||||
### Missing `deepgemm_post_process` args
|
||||
- **Root cause**: Function signature changed to require `quant_block_shape` and `use_e8m0`.
|
||||
- **Fix**: Pass `quant_block_shape=(128, 128)` and `use_e8m0=True`.
|
||||
|
||||
### Compressor Indexer Shape Mismatch
|
||||
- **Root cause**: `_reconstruct_compressor_weight` used the same checkpoint prefix
|
||||
for both main and indexer compressors. The indexer's keys have `.indexer.` in the path.
|
||||
- **Fix**: Add `sub_path` parameter; pass `".indexer"` for indexer compressors.
|
||||
|
||||
### All-Ones Block Scale → Garbage Output
|
||||
- **Root cause**: Block scale was `torch.ones(...)` (scale=1.0). DeepGEMM divides by
|
||||
the block scale at runtime, so the output was divided by 1.0 instead of the actual
|
||||
per-tensor scale, producing incoherent text.
|
||||
- **Fix**: Use `torch.full(..., fp8_scale.item())` to fill the block scale with the
|
||||
correct per-tensor FP8 quantization scale.
|
||||
|
||||
## Running
|
||||
|
||||
```bash
|
||||
# On B200 node
|
||||
cd /root/nvidia-meeting
|
||||
docker compose up -d
|
||||
|
||||
# Check logs
|
||||
docker logs -f nvidia-meeting-vllm-1
|
||||
|
||||
# Test
|
||||
curl http://localhost:8000/v1/models
|
||||
curl http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model": "/model", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 50}'
|
||||
```
|
||||
|
||||
## Files
|
||||
|
||||
| File | Purpose |
|
||||
| --- | --- |
|
||||
| `inspect_model.py` | Run this first. Prints tensor name patterns, dtypes, FP8 scaling block sizes, and counts of MoE expert/router/norm tensors so you know exactly what you're dealing with before any conversion. |
|
||||
| `fp8_to_nvfp4_streaming.py` | **Path A.** Pure tensor-level streaming FP8 → NVFP4 conversion. No model loading, no calibration, weight-only. Low memory, fast, deterministic. Recommended for first run. |
|
||||
| `quantize_llmcompressor.py` | **Path B.** `llm-compressor` oneshot with sequential pipeline + activation calibration. Produces W4A4 with calibrated activation scales. Higher quality on activation-sensitive ops but riskier given V4 is two weeks old. |
|
||||
| `verify_nvfp4.py` | Loads the produced NVFP4 checkpoint, runs a basic forward pass through one block, checks for NaN/Inf, and dumps a few generated tokens via vLLM. |
|
||||
|------|---------|
|
||||
| `patches/deepseek_v4.py` | Main patch: NVFP4 post-load conversion, weight reconstruction, DeepGEMM block-scale |
|
||||
| `patches/modelopt.py` | ModelOpt FP4 config patches for weight loading |
|
||||
| `.env` | B200 node credentials |
|
||||
| `docker-compose.yml` | Container config (8 GPU, TP=8, EP=8, NVFP4 quant) |
|
||||
|
||||
## Hardware assumptions
|
||||
## Conversion Flow
|
||||
|
||||
- 8× B200 baremetal, 1.5 TB HBM total
|
||||
- 2.7 TB system RAM
|
||||
- ≥10 TB free NVMe at `~/nvidia-meeting/`
|
||||
|
||||
## Prereqs
|
||||
|
||||
```bash
|
||||
source ~/nvidia-meeting/venv/bin/activate
|
||||
pip install --upgrade torch safetensors transformers tqdm
|
||||
pip install --upgrade llmcompressor compressed-tensors # only needed for Path B
|
||||
pip install --upgrade vllm # only needed for verify
|
||||
```
|
||||
Checkpoint (NVFP4 safetensors)
|
||||
│
|
||||
├── [weight loader] ──→ vLLM model (NVFP4 uint8 params)
|
||||
│
|
||||
└── [process_weights_after_loading]
|
||||
├── wo_a (is_bmm=True):
|
||||
│ NVFP4→BF16→FP8 + DeepGEMM block scale
|
||||
│ weight_scale_inv = dg_ws, weight = 3D FP8
|
||||
│
|
||||
├── fused_wqa_wkv, wo_b, shared_expert:
|
||||
│ NVFP4→BF16, UnquantizedLinearMethod
|
||||
│
|
||||
├── compressor.fused_wkv_wgate:
|
||||
│ Read kv_proj+gate_proj from checkpoint
|
||||
│ NVFP4→BF16, cat into fused weight
|
||||
│
|
||||
└── MoE experts: stay NVFP4 (FusedMoE backend)
|
||||
```
|
||||
|
||||
You'll likely need `transformers` from source for V4 architecture support, and `trust_remote_code=True` everywhere. Stock pip versions may not load V4 yet.
|
||||
## Bugs Found and Fixed (continued)
|
||||
|
||||
## Recommended order tonight
|
||||
### `input_scale` Multiplied into Weight Dequantization (CRITICAL)
|
||||
- **Root cause**: `_convert_nvfp4_to_bf16`, `_convert_nvfp4_to_fp8`, and
|
||||
`_reconstruct_compressor_weight` all multiplied by `input_scale` during weight
|
||||
dequantization. `input_scale` is for **activations**, not weights. The correct
|
||||
formula is: `weight_bf16 = e2m1 * block_scale * global_scale` (NO input_scale).
|
||||
Including it made weights ~5000× too small, causing garbage output.
|
||||
- **Fix**: Removed `* input_scale` from all three dequant paths.
|
||||
|
||||
```bash
|
||||
cd ~/nvidia-meeting
|
||||
### `fused_skip_regex` Skipping Non-Fused Layer Scales (CRITICAL)
|
||||
- **Root cause**: The skip list included `q_b_proj`, `o_a_proj`, `o_b_proj` weight
|
||||
scales. These are **NOT fused/stacked** — they're individual Linear layers
|
||||
(`wq_b`, `wo_a`, `wo_b`) converted in-place. Skipping their scales caused
|
||||
`process_weights_after_loading` to read `torch.empty()` garbage for
|
||||
`weight_scale_inv`, producing garbled output.
|
||||
- **Fix**: Removed `q_b_proj`, `o_a_proj`, `o_b_proj` scale entries from
|
||||
`fused_skip_regex`. Only truly stacked params remain skipped:
|
||||
`compressor.{kv_proj,gate_proj}` → `fused_wkv_wgate`,
|
||||
`self_attn.{kv_proj,q_a_proj}` → `fused_wqa_wkv`,
|
||||
`shared_experts.{gate_proj,up_proj}` → `gate_up_proj`.
|
||||
|
||||
# 1. Inspect the FP8 source — 30 seconds, no GPU needed.
|
||||
python inspect_model.py DeepSeek-V4-Pro-FP8 | tee inspect.log
|
||||
## Version Banner
|
||||
|
||||
# 2. Path A streaming conversion — should run in 2-6 hours dominated by NVMe I/O.
|
||||
python fp8_to_nvfp4_streaming.py \
|
||||
--src DeepSeek-V4-Pro-FP8 \
|
||||
--dst DeepSeek-V4-Pro-NVFP4-streaming \
|
||||
--workers 8 \
|
||||
2>&1 | tee path_a.log
|
||||
|
||||
# 3. Quick sanity check — does it load and forward-pass?
|
||||
python verify_nvfp4.py DeepSeek-V4-Pro-NVFP4-streaming
|
||||
|
||||
# 4. Path B (overnight). Run only after Path A succeeds. 24-72 hours.
|
||||
python quantize_llmcompressor.py \
|
||||
--src DeepSeek-V4-Pro-FP8 \
|
||||
--dst DeepSeek-V4-Pro-NVFP4-llmcompressor \
|
||||
--num-samples 256 \
|
||||
--max-seq-len 4096 \
|
||||
2>&1 | tee path_b.log
|
||||
The patch prints a version banner at import time (visible in `docker logs`):
|
||||
```
|
||||
======================================================================
|
||||
DeepSeek V4 NVFP4 Patch
|
||||
Commit: 26aaaba
|
||||
Loaded: 2026-05-11 04:25:00 UTC
|
||||
Node: ...
|
||||
|
||||
Architecture: ...
|
||||
Bugs fixed: #1-#6
|
||||
======================================================================
|
||||
```
|
||||
This ensures you can always verify what's running inside the container.
|
||||
|
||||
## Path A — what it does
|
||||
## Known Issues
|
||||
|
||||
1. Reads `model.safetensors.index.json` to map every tensor to its shard.
|
||||
2. Classifies every tensor:
|
||||
- **Preserve** (copied bit-for-bit): `lm_head`, `embed_tokens`, MoE router gates (`*.mlp.gate`), all norms, V4-specific attention indexer/scoring tensors, mHC residual mixing weights.
|
||||
- **Quantize**: any FP8 weight that has a corresponding `*.weight_scale_inv` companion (i.e. real GEMM weights).
|
||||
3. For every quantizable weight:
|
||||
- Dequantizes FP8 E4M3 → FP32 using the source's per-block scales (auto-detects 128×128 blocks).
|
||||
- Computes NVFP4 dual scales: per-tensor `weight_scale_2 = amax / (6.0 * 448.0)` and per-16-element-block `weight_scale = block_amax / (6.0 * weight_scale_2)` cast to FP8 E4M3.
|
||||
- Quantizes FP32 → E2M1 representable values `{0, ±0.5, ±1, ±1.5, ±2, ±3, ±4, ±6}`.
|
||||
- Packs two 4-bit values per `uint8` byte.
|
||||
4. **MoE pair handling**: detects `gate_proj` (w1) + `up_proj` (w3) of each expert and computes a joint `weight_scale_2` across both, since vLLM's fused MoE kernel requires them to share that global scale.
|
||||
5. Streams output to new shards (~5 GB each) with a fresh `model.safetensors.index.json` and copies all non-tensor files (config, tokenizer, etc.) verbatim.
|
||||
1. **Output quality**: Model produces tokens but they're garbled/incoherent.
|
||||
All 6 known bugs are fixed. The remaining issue is under investigation —
|
||||
likely a subtle dequantization bug (sign handling, scale ordering, or
|
||||
E2M1 unpack edge case). The version banner in the logs helps debug which
|
||||
patch version is active.
|
||||
|
||||
**This is weight-only NVFP4.** Activation quantization is not done here — you get W4A16 effective behavior at runtime unless your inference engine generates dynamic per-group activation scales. vLLM does generate per-group activation scales dynamically at inference, so this is fine for most use cases.
|
||||
2. **Runtime performance**: Not yet benchmarked. The DeepGEMM einsum + FusedMoE
|
||||
path should be efficient on B200, but the BF16 layers go through
|
||||
`UnquantizedLinearMethod` which may be slower than dedicated kernels.
|
||||
|
||||
## Path B — what it does
|
||||
## Quantization Details
|
||||
|
||||
1. Loads the FP8 model via `transformers` with `device_map="auto"` and the offload folder pointing at NVMe. With 2.7 TB RAM, the FP8 weights (~865 GB) sit in RAM; activations and per-layer BF16 promotion happen on the B200s.
|
||||
2. Loads a calibration set (default 256 samples of `HuggingFaceH4/ultrachat_200k`).
|
||||
3. Runs `llm-compressor` `oneshot` with `pipeline="sequential"` so only one transformer block is materialized in BF16 on GPU at a time.
|
||||
4. `moe_calibrate_all_experts=True` ensures every routed expert gets calibration signal even when natural routing wouldn't pick it.
|
||||
5. The recipe targets `Linear` with NVFP4 and the same ignore list as Path A (lm_head, embed, router gates, norms, indexer, mHC).
|
||||
6. Saves with `save_compressed=True` in `compressed-tensors` format.
|
||||
|
||||
**The known risks for Path B on V4 specifically:**
|
||||
|
||||
- V4 architecture is brand new. `llm-compressor` may not have a registered MoE wrapper for V4 — you may need to call `replace_modules_for_calibration` with the actual V4 MoE class name (the script has a TODO and a fallback path).
|
||||
- Sequential pipeline may not handle CSA/HCA hybrid attention if the attention forward isn't a simple linear chain. If you see weird offload errors during calibration, the indexer/scoring tensors are likely the culprit.
|
||||
- Calibration cache for 256 routed experts × all V4 layers can be hundreds of GB. Watch `nvidia-smi` and `free -h` during the first 30 minutes.
|
||||
|
||||
## Things to discuss with the NVIDIA engineer
|
||||
|
||||
1. **NVFP4 packing convention.** My converter packs as `byte = elem0 | (elem1 << 4)` (low nibble first). Verify this matches what TensorRT-LLM / cutlass NVFP4 kernels expect. If reversed, just flip in `pack_fp4()`.
|
||||
2. **Joint scaling extension.** I implement joint `weight_scale_2` for `gate_proj`/`up_proj` pairs. Ask whether `down_proj` also benefits, or whether all three experts in a fused MoE block should share — recipes have varied.
|
||||
3. **mHC residual weights.** I preserve them in FP8/BF16 conservatively. If NVIDIA has actually quantized these somewhere internally, drop them out of the ignore list to recover memory.
|
||||
4. **CSA + HCA indexer/scoring tensors.** I preserve these blindly based on the V3.2 DSA precedent. Ask whether V4's compressed-sparse / heavily-compressed attention has analogous "cannot quantize" tensors and what the canonical regex is.
|
||||
5. **W4A4 vs W4A16 for V4 Pro.** Path A is W4A16-equivalent; Path B is W4A4. For a 1.6T MoE with extreme long-context, ask which is internally recommended for first deployment.
|
||||
6. **`modelopt` vs `llm-compressor` for V4.** RedHat shipped V4-*Flash* NVFP4 via `llm-compressor`. Why not Pro yet? Find out if there's a known-bad layer or just compute time.
|
||||
|
||||
## Output sizes to expect
|
||||
|
||||
- FP8 source: ~865 GB
|
||||
- Path A NVFP4 output: ~430–470 GB (about 2× compression vs FP8 source; experts dominate, norms/embeds add a bit back)
|
||||
- Path B NVFP4 output: similar, plus activation scale metadata
|
||||
|
||||
## Resumability
|
||||
|
||||
Path A is checkpoint-resumable per shard — if it dies mid-run, re-running picks up from the next unwritten output shard. Path B is **not** resumable mid-calibration; if it crashes you restart.
|
||||
- **Model**: DeepSeek V4 Pro (1.2T parameters)
|
||||
- **Format**: NVIDIA NVFP4 (4-bit floating point with 128-element block scales)
|
||||
- **Tool**: modelopt 0.45.0.dev64 + transformers 5.8.0.dev0
|
||||
- **Run**: Run 11 (881GB), 8× B200, ~$161/run
|
||||
- **Checkpoint**: 95 safetensors shards
|
||||
|
||||
30
docker-compose.yml
Normal file
30
docker-compose.yml
Normal file
@@ -0,0 +1,30 @@
|
||||
services:
|
||||
vllm:
|
||||
build:
|
||||
context: .
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
- OMP_NUM_THREADS=128
|
||||
- MEGA_MOE_DEBUG=1
|
||||
- MEGA_MOE_STATIC=0
|
||||
- MEGA_MOE_USE_CUTLASS=1
|
||||
- DG_JIT_DEBUG=1
|
||||
command:
|
||||
- /model
|
||||
- --trust-remote-code
|
||||
- --enable-expert-parallel
|
||||
- --tensor-parallel-size=8
|
||||
- --enforce-eager
|
||||
- --tokenizer-mode=deepseek_v4
|
||||
- --host=0.0.0.0
|
||||
- --port=8000
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
volumes:
|
||||
- /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
|
||||
@@ -1,540 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Streaming FP8 → NVFP4 converter for DeepSeek V4 Pro (sgl-project FP8 repackage).
|
||||
|
||||
Path A: pure tensor-level conversion. No model loading via transformers, no
|
||||
calibration. Reads FP8 safetensors shards, dequantizes per-block FP8 to FP32,
|
||||
re-quantizes to NVFP4 (E2M1 packed in uint8 with FP8 E4M3 per-block scales and
|
||||
an FP32 per-tensor global scale), and writes new shards.
|
||||
|
||||
Key behaviors:
|
||||
- Joint global scale_2 across (gate_proj, up_proj) pairs of each expert,
|
||||
required for vLLM fused MoE kernels.
|
||||
- Preserves lm_head, embeddings, MoE router gates, norms, V4 indexer/scoring,
|
||||
and mHC residual mixing weights at original precision.
|
||||
- Streams shard-by-shard. Peak working memory is one tensor pair dequantized
|
||||
to FP32 (a few hundred MB at most for the largest weights).
|
||||
- Resumable per output shard.
|
||||
|
||||
NVFP4 format reference:
|
||||
value = packed_fp4 * weight_scale * weight_scale_2
|
||||
where:
|
||||
packed_fp4: E2M1 in {0, ±0.5, ±1, ±1.5, ±2, ±3, ±4, ±6}, 2 per byte
|
||||
weight_scale: FP8 E4M3, one per 16-element block
|
||||
weight_scale_2: FP32 scalar per tensor, global
|
||||
|
||||
Usage:
|
||||
python fp8_to_nvfp4_streaming.py \\
|
||||
--src DeepSeek-V4-Pro-FP8 \\
|
||||
--dst DeepSeek-V4-Pro-NVFP4-streaming \\
|
||||
--workers 8
|
||||
|
||||
Optional:
|
||||
--gpu N Use CUDA device N for the math (default: 0; -1 for CPU)
|
||||
--shard-size-gb 5 Target output shard size
|
||||
--dry-run Print what would be done; don't write
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from safetensors import safe_open
|
||||
from safetensors.torch import save_file
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Classification: which tensors do we quantize, which do we preserve?
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
PRESERVE_REGEXES = [
|
||||
r".*lm_head.*",
|
||||
r".*embed_tokens.*",
|
||||
r".*\.(mlp|ffn)\.gate(\.weight)?$", # MoE router (NOT gate_proj)
|
||||
r".*norm.*",
|
||||
r".*indexer.*", # V3.2 DSA / V4 CSA indexer
|
||||
r".*hyper_conn.*", # V4 mHC
|
||||
r".*\.mhc.*",
|
||||
r".*hc_attn.*", # V4 hyper-connection attn
|
||||
r".*hc_ffn.*", # V4 hyper-connection ffn
|
||||
r".*hc_head.*", # V4 hyper-connection head
|
||||
r".*scoring.*",
|
||||
r".*attn_sink.*", # V4 attention sink
|
||||
r".*compressor\.ape.*", # V4 compressor absolute pos encoding
|
||||
r".*tid2eid.*", # V4 MoE token-to-expert mapping
|
||||
r".*\.bias$", # any biases
|
||||
]
|
||||
PRESERVE_RE = re.compile("|".join(f"(?:{p})" for p in PRESERVE_REGEXES))
|
||||
|
||||
# Identify expert pairs that need joint global scale
|
||||
EXPERT_PAIR_RE = re.compile(r"(.*experts\.\d+)\.(w1|w3)\.weight$")
|
||||
|
||||
|
||||
def is_preserve(name: str) -> bool:
|
||||
return bool(PRESERVE_RE.match(name))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FP8 dequantization (per-block)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def dequant_fp8_to_fp32(weight_fp8: torch.Tensor, scale_inv: torch.Tensor) -> torch.Tensor:
|
||||
"""Dequantize a per-block FP8 E4M3 weight to FP32 using its inverse-scale tensor.
|
||||
|
||||
DeepSeek convention: weight_scale_inv stores the dequant scale (multiply by it
|
||||
to recover FP32). Block size is inferred from shape ratios — typically 128x128.
|
||||
"""
|
||||
assert weight_fp8.dim() == 2, f"Expected 2D weight, got shape {weight_fp8.shape}"
|
||||
M, N = weight_fp8.shape
|
||||
|
||||
if scale_inv.dim() == 0:
|
||||
# Per-tensor scale
|
||||
return weight_fp8.float() * scale_inv.float()
|
||||
|
||||
if scale_inv.dim() == 1:
|
||||
# Per-row or per-col — unusual for DeepSeek but handle it
|
||||
if scale_inv.numel() == M:
|
||||
return weight_fp8.float() * scale_inv.float().unsqueeze(1)
|
||||
if scale_inv.numel() == N:
|
||||
return weight_fp8.float() * scale_inv.float().unsqueeze(0)
|
||||
raise ValueError(f"Cannot align 1D scale_inv {scale_inv.shape} to weight {weight_fp8.shape}")
|
||||
|
||||
# 2D block scaling
|
||||
sm, sn = scale_inv.shape
|
||||
bm = (M + sm - 1) // sm
|
||||
bn = (N + sn - 1) // sn
|
||||
scale_full = scale_inv.float().repeat_interleave(bm, dim=0).repeat_interleave(bn, dim=1)
|
||||
scale_full = scale_full[:M, :N]
|
||||
return weight_fp8.float() * scale_full
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# NVFP4 quantization
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
FP4_E2M1_VALUES = torch.tensor(
|
||||
[0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
|
||||
)
|
||||
# Boundaries between adjacent magnitudes (round-to-nearest with ties to even-ish)
|
||||
FP4_BOUNDARIES = torch.tensor(
|
||||
[0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0], dtype=torch.float32
|
||||
)
|
||||
FP4_MAX = 6.0
|
||||
FP8_E4M3_MAX = 448.0
|
||||
|
||||
|
||||
def round_to_fp4_e2m1_index(x: torch.Tensor) -> torch.Tensor:
|
||||
"""Round x to nearest FP4 E2M1 representable, return 4-bit index in [0..15].
|
||||
|
||||
Index encoding: bit 3 = sign, bits 0..2 = magnitude index into FP4_E2M1_VALUES.
|
||||
"""
|
||||
sign = (x < 0).to(torch.uint8)
|
||||
abs_x = x.abs().clamp_(max=FP4_MAX)
|
||||
# searchsorted is fast on GPU; uses float32
|
||||
boundaries = FP4_BOUNDARIES.to(x.device)
|
||||
mag_idx = torch.searchsorted(boundaries, abs_x.contiguous()).to(torch.uint8)
|
||||
return (sign << 3) | mag_idx
|
||||
|
||||
|
||||
def quantize_to_nvfp4(
|
||||
x_fp32: torch.Tensor,
|
||||
scale_2: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Quantize an FP32 weight to NVFP4 given a (possibly joint) global scale.
|
||||
|
||||
Args:
|
||||
x_fp32: [M, N] FP32 tensor, N must be divisible by 16
|
||||
scale_2: scalar FP32 tensor
|
||||
|
||||
Returns:
|
||||
packed: [M, N//2] uint8, two FP4 values per byte (low nibble first)
|
||||
weight_scale: [M, N//16] FP8 E4M3 per-block scales
|
||||
"""
|
||||
M, N = x_fp32.shape
|
||||
if N % 16 != 0:
|
||||
raise ValueError(f"NVFP4 requires N % 16 == 0; got {x_fp32.shape}")
|
||||
|
||||
# Per-block (16-element) amax
|
||||
blocks = x_fp32.view(M, N // 16, 16)
|
||||
block_amax = blocks.abs().amax(dim=-1) # [M, N//16]
|
||||
|
||||
# Per-block scale in FP32, then cast to FP8 E4M3 (this is the lossy step)
|
||||
block_scale_fp32 = block_amax / (FP4_MAX * scale_2)
|
||||
# Avoid zeros — produces NaN on dequant. Clamp tiny scales.
|
||||
block_scale_fp32 = block_scale_fp32.clamp_(min=1e-30)
|
||||
block_scale_fp8 = block_scale_fp32.to(torch.float8_e4m3fn)
|
||||
|
||||
# Recover the effective scale that the kernel will actually use
|
||||
effective = scale_2 * block_scale_fp8.float() # [M, N//16]
|
||||
|
||||
# Quantize values: divide, clamp, round to E2M1
|
||||
scaled = blocks / effective.unsqueeze(-1).clamp_(min=1e-30)
|
||||
fp4_idx = round_to_fp4_e2m1_index(scaled) # [M, N//16, 16] uint8
|
||||
fp4_idx = fp4_idx.view(M, N).contiguous()
|
||||
|
||||
# Pack two nibbles per byte: low = even-index element, high = odd-index element
|
||||
low = fp4_idx[:, ::2]
|
||||
high = fp4_idx[:, 1::2]
|
||||
packed = (low | (high << 4)).to(torch.uint8)
|
||||
|
||||
return packed, block_scale_fp8
|
||||
|
||||
|
||||
def compute_global_scale(*tensors_fp32: torch.Tensor) -> torch.Tensor:
|
||||
"""Compute joint NVFP4 global scale_2 across one or more FP32 tensors.
|
||||
|
||||
scale_2 = amax / (FP4_MAX * FP8_E4M3_MAX)
|
||||
"""
|
||||
amax = torch.stack([t.abs().max() for t in tensors_fp32]).max()
|
||||
scale_2 = amax / (FP4_MAX * FP8_E4M3_MAX)
|
||||
# Avoid zero
|
||||
return scale_2.clamp_(min=1e-30).float()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sharded output writer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ShardedSafetensorsWriter:
|
||||
"""Writes tensors to a sequence of safetensors shards, building an index map."""
|
||||
|
||||
def __init__(self, out_dir: Path, max_shard_bytes: int):
|
||||
self.out_dir = out_dir
|
||||
self.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.max_shard_bytes = max_shard_bytes
|
||||
self.current = {} # name -> tensor (CPU)
|
||||
self.current_bytes = 0
|
||||
self.shard_idx = 0
|
||||
self.weight_map: dict[str, str] = {} # name -> shard filename
|
||||
self.shard_filenames: list[str] = []
|
||||
|
||||
def _flush(self):
|
||||
if not self.current:
|
||||
return
|
||||
self.shard_idx += 1
|
||||
# Use placeholder total; we'll rename at the end
|
||||
fname = f"model-{self.shard_idx:05d}-of-PLACEHOLDER.safetensors"
|
||||
path = self.out_dir / fname
|
||||
save_file(self.current, str(path))
|
||||
for name in self.current:
|
||||
self.weight_map[name] = fname
|
||||
self.shard_filenames.append(fname)
|
||||
self.current.clear()
|
||||
self.current_bytes = 0
|
||||
|
||||
def add(self, name: str, tensor: torch.Tensor):
|
||||
# safetensors requires CPU tensors and contiguous
|
||||
t = tensor.detach().cpu().contiguous()
|
||||
size = t.numel() * t.element_size()
|
||||
if self.current and self.current_bytes + size > self.max_shard_bytes:
|
||||
self._flush()
|
||||
self.current[name] = t
|
||||
self.current_bytes += size
|
||||
|
||||
def close(self):
|
||||
self._flush()
|
||||
# Now rename shards to use proper of-N suffix
|
||||
total = len(self.shard_filenames)
|
||||
new_map = {}
|
||||
for old_fname in self.shard_filenames:
|
||||
idx = int(old_fname.split("-")[1])
|
||||
new_fname = f"model-{idx:05d}-of-{total:05d}.safetensors"
|
||||
(self.out_dir / old_fname).rename(self.out_dir / new_fname)
|
||||
new_map[old_fname] = new_fname
|
||||
|
||||
# Patch weight_map
|
||||
self.weight_map = {k: new_map[v] for k, v in self.weight_map.items()}
|
||||
return self.weight_map
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shard-level conversion plan
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_plan(src_dir: Path):
|
||||
"""Build the conversion plan from index.json.
|
||||
|
||||
Returns:
|
||||
weight_map: name -> shard filename
|
||||
shard_to_names: shard filename -> list of names in that shard
|
||||
expert_pair_groups: list of (group_name, name_w1, name_w3)
|
||||
For each expert, the gate_proj/up_proj pair gets a shared scale_2.
|
||||
solo_quantize: list of names to quantize independently
|
||||
preserve: list of names to copy unchanged
|
||||
"""
|
||||
with open(src_dir / "model.safetensors.index.json") as f:
|
||||
index = json.load(f)
|
||||
weight_map = index["weight_map"]
|
||||
|
||||
shard_to_names = defaultdict(list)
|
||||
for name, fn in weight_map.items():
|
||||
shard_to_names[fn].append(name)
|
||||
|
||||
# Gather all weight tensor names (those with .weight suffix)
|
||||
all_weights = [n for n in weight_map if n.endswith(".weight")]
|
||||
|
||||
# Identify expert pairs
|
||||
expert_pairs = defaultdict(dict) # base -> {"gate_proj": name, "up_proj": name}
|
||||
for n in all_weights:
|
||||
m = EXPERT_PAIR_RE.match(n)
|
||||
if m:
|
||||
base, kind = m.group(1), m.group(2)
|
||||
expert_pairs[base][kind] = n
|
||||
|
||||
paired_names = set()
|
||||
expert_pair_groups = []
|
||||
for base, parts in expert_pairs.items():
|
||||
if "w1" in parts and "w3" in parts:
|
||||
expert_pair_groups.append((base, parts["w1"], parts["w3"]))
|
||||
paired_names.add(parts["w1"])
|
||||
paired_names.add(parts["w3"])
|
||||
|
||||
# Classify everything else
|
||||
solo_quantize = []
|
||||
preserve = []
|
||||
scale_companions = [] # .scale tensors that get consumed during dequant
|
||||
|
||||
for n in weight_map:
|
||||
if n.endswith(".scale") and n.replace(".scale", ".weight") in weight_map:
|
||||
scale_companions.append(n)
|
||||
continue
|
||||
if n in paired_names:
|
||||
continue
|
||||
if is_preserve(n):
|
||||
preserve.append(n)
|
||||
continue
|
||||
# Anything else with .weight gets quantized solo, otherwise preserved
|
||||
if n.endswith(".weight"):
|
||||
solo_quantize.append(n)
|
||||
else:
|
||||
preserve.append(n)
|
||||
|
||||
return {
|
||||
"weight_map": weight_map,
|
||||
"shard_to_names": dict(shard_to_names),
|
||||
"expert_pair_groups": expert_pair_groups,
|
||||
"solo_quantize": solo_quantize,
|
||||
"preserve": preserve,
|
||||
"scale_companions": scale_companions,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tensor loading helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ShardCache:
|
||||
"""Lazy per-shard safe_open cache so we don't re-open shards repeatedly."""
|
||||
|
||||
def __init__(self, src_dir: Path, max_open: int = 4):
|
||||
self.src_dir = src_dir
|
||||
self.max_open = max_open
|
||||
self.handles: dict[str, "safe_open"] = {}
|
||||
|
||||
def get(self, shard_fname: str):
|
||||
if shard_fname in self.handles:
|
||||
return self.handles[shard_fname]
|
||||
if len(self.handles) >= self.max_open:
|
||||
# Drop one
|
||||
old_fn = next(iter(self.handles))
|
||||
self.handles[old_fn].__exit__(None, None, None)
|
||||
del self.handles[old_fn]
|
||||
h = safe_open(self.src_dir / shard_fname, framework="pt")
|
||||
h.__enter__()
|
||||
self.handles[shard_fname] = h
|
||||
return h
|
||||
|
||||
def close(self):
|
||||
for h in self.handles.values():
|
||||
h.__exit__(None, None, None)
|
||||
self.handles.clear()
|
||||
|
||||
|
||||
def load_weight_and_scale(cache: ShardCache, weight_map, name):
|
||||
"""Load an FP8 weight with its scale companion (if any)."""
|
||||
weight = cache.get(weight_map[name]).get_tensor(name)
|
||||
scale_name = name.replace(".weight", ".scale")
|
||||
scale = None
|
||||
if scale_name in weight_map:
|
||||
try:
|
||||
scale = cache.get(weight_map[scale_name]).get_tensor(scale_name)
|
||||
except Exception:
|
||||
# Scale listed in index but not in shard (BF16 weights have no scale)
|
||||
pass
|
||||
return weight, scale
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--src", required=True, help="Source FP8 model directory")
|
||||
ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")
|
||||
ap.add_argument("--gpu", type=int, default=0, help="CUDA device, -1 for CPU")
|
||||
ap.add_argument("--shard-size-gb", type=float, default=5.0)
|
||||
ap.add_argument("--workers", type=int, default=4,
|
||||
help="Concurrent tensor-conversion workers (lots of small tensors benefit; "
|
||||
"actual GPU compute is serialized by torch)")
|
||||
ap.add_argument("--dry-run", action="store_true")
|
||||
args = ap.parse_args()
|
||||
|
||||
src = Path(args.src).resolve()
|
||||
dst = Path(args.dst).resolve()
|
||||
if not (src / "model.safetensors.index.json").exists():
|
||||
sys.exit(f"No index.json at {src}")
|
||||
|
||||
device = torch.device(f"cuda:{args.gpu}" if args.gpu >= 0 and torch.cuda.is_available() else "cpu")
|
||||
print(f"Compute device: {device}")
|
||||
|
||||
# Move FP4_BOUNDARIES to device once
|
||||
global FP4_BOUNDARIES
|
||||
FP4_BOUNDARIES = FP4_BOUNDARIES.to(device)
|
||||
|
||||
print("Building conversion plan...")
|
||||
plan = build_plan(src)
|
||||
n_pairs = len(plan["expert_pair_groups"])
|
||||
n_solo = len(plan["solo_quantize"])
|
||||
n_preserve = len(plan["preserve"])
|
||||
n_scales = len(plan["scale_companions"])
|
||||
print(f" Expert pair groups (joint scale_2): {n_pairs:,}")
|
||||
print(f" Solo quantize tensors: {n_solo:,}")
|
||||
print(f" Preserved tensors: {n_preserve:,}")
|
||||
print(f" Scale companions consumed: {n_scales:,}")
|
||||
|
||||
if args.dry_run:
|
||||
print("\nDry run — exiting before any writes.")
|
||||
return
|
||||
|
||||
dst.mkdir(parents=True, exist_ok=True)
|
||||
cache = ShardCache(src, max_open=8)
|
||||
writer = ShardedSafetensorsWriter(dst, max_shard_bytes=int(args.shard_size_gb * 1024**3))
|
||||
|
||||
weight_map = plan["weight_map"]
|
||||
t_start = time.time()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 1. Preserved tensors — copy unchanged
|
||||
# ------------------------------------------------------------------
|
||||
for name in tqdm(plan["preserve"], desc="Preserve", unit="tensor"):
|
||||
t = cache.get(weight_map[name]).get_tensor(name)
|
||||
writer.add(name, t)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 2. Expert pairs — joint scale_2 across (gate_proj, up_proj)
|
||||
# ------------------------------------------------------------------
|
||||
for base, name_w1, name_w3 in tqdm(plan["expert_pair_groups"], desc="Expert pairs", unit="pair"):
|
||||
w1_fp8, s1 = load_weight_and_scale(cache, weight_map, name_w1)
|
||||
w3_fp8, s3 = load_weight_and_scale(cache, weight_map, name_w3)
|
||||
|
||||
with torch.no_grad():
|
||||
w1 = dequant_fp8_to_fp32(w1_fp8.to(device), s1.to(device)) if s1 is not None else w1_fp8.float().to(device)
|
||||
w3 = dequant_fp8_to_fp32(w3_fp8.to(device), s3.to(device)) if s3 is not None else w3_fp8.float().to(device)
|
||||
|
||||
scale_2 = compute_global_scale(w1, w3)
|
||||
|
||||
packed1, blk1 = quantize_to_nvfp4(w1, scale_2)
|
||||
packed3, blk3 = quantize_to_nvfp4(w3, scale_2)
|
||||
|
||||
writer.add(name_w1, packed1)
|
||||
writer.add(name_w1.replace(".weight", ".weight_scale"), blk1)
|
||||
writer.add(name_w1.replace(".weight", ".weight_scale_2"), scale_2)
|
||||
|
||||
writer.add(name_w3, packed3)
|
||||
writer.add(name_w3.replace(".weight", ".weight_scale"), blk3)
|
||||
writer.add(name_w3.replace(".weight", ".weight_scale_2"), scale_2)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. Solo quantize tensors — independent scale_2 per tensor
|
||||
# ------------------------------------------------------------------
|
||||
for name in tqdm(plan["solo_quantize"], desc="Solo quantize", unit="tensor"):
|
||||
w_fp8, s = load_weight_and_scale(cache, weight_map, name)
|
||||
with torch.no_grad():
|
||||
if s is not None:
|
||||
w = dequant_fp8_to_fp32(w_fp8.to(device), s.to(device))
|
||||
else:
|
||||
# Already non-FP8 (e.g. BF16), just upcast
|
||||
w = w_fp8.float().to(device)
|
||||
|
||||
scale_2 = compute_global_scale(w)
|
||||
packed, blk = quantize_to_nvfp4(w, scale_2)
|
||||
writer.add(name, packed)
|
||||
writer.add(name.replace(".weight", ".weight_scale"), blk)
|
||||
writer.add(name.replace(".weight", ".weight_scale_2"), scale_2)
|
||||
|
||||
# Finalize shards & index
|
||||
final_weight_map = writer.close()
|
||||
cache.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 4. Write model.safetensors.index.json
|
||||
# ------------------------------------------------------------------
|
||||
total_size = sum(
|
||||
(dst / fn).stat().st_size for fn in set(final_weight_map.values())
|
||||
)
|
||||
new_index = {
|
||||
"metadata": {"total_size": total_size},
|
||||
"weight_map": final_weight_map,
|
||||
}
|
||||
with open(dst / "model.safetensors.index.json", "w") as f:
|
||||
json.dump(new_index, f, indent=2)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 5. Copy non-tensor files (config, tokenizer, etc.)
|
||||
# ------------------------------------------------------------------
|
||||
for fname in src.iterdir():
|
||||
if fname.is_dir():
|
||||
# encoding/, inference/, assets/ — copy whole tree
|
||||
dst_sub = dst / fname.name
|
||||
if not dst_sub.exists():
|
||||
shutil.copytree(fname, dst_sub)
|
||||
continue
|
||||
if fname.suffix == ".safetensors":
|
||||
continue
|
||||
if fname.name == "model.safetensors.index.json":
|
||||
continue
|
||||
shutil.copy2(fname, dst / fname.name)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 6. Patch config.json with quantization metadata so loaders know
|
||||
# ------------------------------------------------------------------
|
||||
cfg_path = dst / "config.json"
|
||||
if cfg_path.exists():
|
||||
with open(cfg_path) as f:
|
||||
cfg = json.load(f)
|
||||
cfg["quantization_config"] = {
|
||||
"quant_method": "compressed-tensors",
|
||||
"format": "nvfp4-pack-quantized",
|
||||
"config_groups": {
|
||||
"group_0": {
|
||||
"targets": ["Linear"],
|
||||
"weights": {
|
||||
"num_bits": 4,
|
||||
"type": "float",
|
||||
"strategy": "tensor_group",
|
||||
"group_size": 16,
|
||||
"symmetric": True,
|
||||
},
|
||||
}
|
||||
},
|
||||
"ignore": PRESERVE_REGEXES,
|
||||
}
|
||||
with open(cfg_path, "w") as f:
|
||||
json.dump(cfg, f, indent=2)
|
||||
|
||||
elapsed = time.time() - t_start
|
||||
print(f"\nDone in {elapsed/3600:.2f}h")
|
||||
print(f"Output: {dst}")
|
||||
print(f"Total size: {total_size/1024**3:.1f} GB across {len(set(final_weight_map.values()))} shards")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
3
index.yaml
Normal file
3
index.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
apiVersion: v1
|
||||
entries: {}
|
||||
generated: "2026-04-17T19:18:02.693243217Z"
|
||||
173
inspect_model.py
173
inspect_model.py
@@ -1,173 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Inspect a DeepSeek FP8 model directory and report on tensor structure.
|
||||
|
||||
Usage: python inspect_model.py <model_dir>
|
||||
|
||||
Prints:
|
||||
- Total tensor count and dtype histogram
|
||||
- Sample of tensor names by category (lm_head, embeddings, attention, MoE experts, norms, etc.)
|
||||
- FP8 block scaling structure (block size detection)
|
||||
- MoE expert layer count and routing structure
|
||||
- Any "unusual" tensors that need manual classification
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from safetensors import safe_open
|
||||
|
||||
|
||||
# Patterns we'd preserve (skip quantization on)
|
||||
PRESERVE_PATTERNS = [
|
||||
(re.compile(r".*lm_head.*"), "lm_head"),
|
||||
(re.compile(r".*embed_tokens.*"), "embeddings"),
|
||||
(re.compile(r".*\.mlp\.gate(\.weight)?$"), "moe_router_gate"),
|
||||
(re.compile(r".*norm.*"), "normalization"),
|
||||
(re.compile(r".*indexer.*"), "attention_indexer"), # V3.2 DSA / V4 CSA?
|
||||
(re.compile(r".*hyper_conn.*"), "mhc_hyper_conn"), # V4 mHC
|
||||
(re.compile(r".*mhc.*"), "mhc_other"),
|
||||
(re.compile(r".*scoring.*"), "scoring"),
|
||||
]
|
||||
|
||||
# Patterns for MoE expert weights (these are what we WILL quantize)
|
||||
EXPERT_PATTERNS = [
|
||||
(re.compile(r".*experts\.\d+\.gate_proj.*"), "expert_gate_proj"),
|
||||
(re.compile(r".*experts\.\d+\.up_proj.*"), "expert_up_proj"),
|
||||
(re.compile(r".*experts\.\d+\.down_proj.*"), "expert_down_proj"),
|
||||
(re.compile(r".*shared_experts?\.gate_proj.*"), "shared_gate_proj"),
|
||||
(re.compile(r".*shared_experts?\.up_proj.*"), "shared_up_proj"),
|
||||
(re.compile(r".*shared_experts?\.down_proj.*"), "shared_down_proj"),
|
||||
]
|
||||
|
||||
|
||||
def categorize(name):
|
||||
for pat, cat in PRESERVE_PATTERNS:
|
||||
if pat.match(name):
|
||||
return ("preserve", cat)
|
||||
for pat, cat in EXPERT_PATTERNS:
|
||||
if pat.match(name):
|
||||
return ("quantize_expert", cat)
|
||||
if name.endswith(".weight_scale_inv"):
|
||||
return ("scale_metadata", "fp8_block_scale")
|
||||
if name.endswith(".weight"):
|
||||
return ("quantize_other", "linear_weight")
|
||||
return ("other", "uncategorized")
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("model_dir")
|
||||
ap.add_argument("--show-samples", type=int, default=5,
|
||||
help="How many sample names to show per category")
|
||||
args = ap.parse_args()
|
||||
|
||||
model_dir = Path(args.model_dir)
|
||||
index_path = model_dir / "model.safetensors.index.json"
|
||||
if not index_path.exists():
|
||||
print(f"ERROR: {index_path} not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
with open(index_path) as f:
|
||||
index = json.load(f)
|
||||
weight_map = index["weight_map"]
|
||||
total_size = index.get("metadata", {}).get("total_size")
|
||||
|
||||
print(f"=== {model_dir} ===")
|
||||
print(f"Total tensors: {len(weight_map):,}")
|
||||
print(f"Total shards: {len(set(weight_map.values()))}")
|
||||
if total_size:
|
||||
print(f"Reported size: {total_size / 1024**3:.1f} GB")
|
||||
print()
|
||||
|
||||
# Categorize names (cheap, no tensor loading)
|
||||
categories = defaultdict(list)
|
||||
for name in weight_map:
|
||||
kind, cat = categorize(name)
|
||||
categories[(kind, cat)].append(name)
|
||||
|
||||
print("=== Tensor categorization ===")
|
||||
for (kind, cat), names in sorted(categories.items()):
|
||||
print(f" [{kind:18s}] {cat:25s} count={len(names):,}")
|
||||
for n in names[: args.show_samples]:
|
||||
print(f" {n}")
|
||||
if len(names) > args.show_samples:
|
||||
print(f" ... and {len(names) - args.show_samples} more")
|
||||
print()
|
||||
|
||||
# Inspect dtypes and FP8 block scaling on a sample shard
|
||||
sample_shard = model_dir / sorted(set(weight_map.values()))[0]
|
||||
print(f"=== Sampling dtypes from {sample_shard.name} ===")
|
||||
dtype_hist = Counter()
|
||||
fp8_block_sizes = Counter()
|
||||
weight_with_scale = []
|
||||
|
||||
with safe_open(sample_shard, framework="pt") as f:
|
||||
names_in_shard = list(f.keys())
|
||||
for name in names_in_shard:
|
||||
t = f.get_tensor(name)
|
||||
dtype_hist[str(t.dtype)] += 1
|
||||
|
||||
# Check for FP8 weight + scale_inv pair
|
||||
if name.endswith(".weight") and t.dtype.is_floating_point and t.element_size() == 1:
|
||||
scale_name = name.replace(".weight", ".weight_scale_inv")
|
||||
if scale_name in names_in_shard:
|
||||
scale_t = f.get_tensor(scale_name)
|
||||
bm = t.shape[0] / scale_t.shape[0] if scale_t.dim() == 2 else None
|
||||
bn = t.shape[1] / scale_t.shape[1] if scale_t.dim() == 2 and t.dim() == 2 else None
|
||||
fp8_block_sizes[(bm, bn)] += 1
|
||||
if len(weight_with_scale) < 3:
|
||||
weight_with_scale.append((name, t.shape, t.dtype, scale_t.shape, scale_t.dtype))
|
||||
|
||||
print(" Dtype histogram (this shard only):")
|
||||
for d, c in dtype_hist.most_common():
|
||||
print(f" {d:20s} {c:,}")
|
||||
|
||||
print()
|
||||
print(" FP8 block-scale dimensions detected:")
|
||||
for (bm, bn), c in fp8_block_sizes.most_common():
|
||||
print(f" block_size = ({bm}, {bn}) count={c}")
|
||||
|
||||
print()
|
||||
print(" Sample FP8 weight + scale_inv pairs:")
|
||||
for name, wshape, wdt, sshape, sdt in weight_with_scale:
|
||||
print(f" {name}")
|
||||
print(f" weight: shape={tuple(wshape)} dtype={wdt}")
|
||||
print(f" scale: shape={tuple(sshape)} dtype={sdt}")
|
||||
|
||||
# MoE structure summary
|
||||
print()
|
||||
print("=== MoE structure summary ===")
|
||||
layer_experts = defaultdict(set)
|
||||
for name in weight_map:
|
||||
m = re.match(r".*layers\.(\d+)\..*experts\.(\d+)\..*", name)
|
||||
if m:
|
||||
layer_experts[int(m.group(1))].add(int(m.group(2)))
|
||||
if layer_experts:
|
||||
layer_count = len(layer_experts)
|
||||
expert_counts = [len(v) for v in layer_experts.values()]
|
||||
print(f" Layers with MoE experts: {layer_count}")
|
||||
print(f" Experts per layer: min={min(expert_counts)} max={max(expert_counts)}")
|
||||
print(f" Sample layer 0 experts: {sorted(list(layer_experts[min(layer_experts)]))[:5]}...")
|
||||
else:
|
||||
print(" No '.experts.N.' pattern found — MoE structure may use different naming.")
|
||||
|
||||
# Flag uncategorized for human review
|
||||
print()
|
||||
print("=== Uncategorized tensors (review these manually) ===")
|
||||
uncat = categories.get(("other", "uncategorized"), [])
|
||||
if uncat:
|
||||
print(f" {len(uncat):,} tensors:")
|
||||
for n in uncat[:20]:
|
||||
print(f" {n}")
|
||||
if len(uncat) > 20:
|
||||
print(f" ... and {len(uncat) - 20} more")
|
||||
else:
|
||||
print(" None — every tensor matched a known pattern.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
59
memory/2026-05-08-dataset-cache-fix.md
Normal file
59
memory/2026-05-08-dataset-cache-fix.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# Session: 2026-05-08 16:49:00 UTC
|
||||
|
||||
- **Session Key**: agent:main:main
|
||||
- **Session ID**: 9143ad68-d6c2-4b80-8dca-6aaa80b37c8b
|
||||
- **Source**: webchat
|
||||
|
||||
## Conversation Summary
|
||||
|
||||
assistant: Token saved. Now pre-download the dataset:
|
||||
assistant: Access denied even with the token. The token might not have accepted the terms for this specific dataset. Let me check if the modelopt script actually needs this dataset or if we can just use `--dataset` to override:
|
||||
assistant: You're right, my bad. Let me persist the token properly and pre-download the dataset.Token saved. Now pre-download the dataset:Access denied even with the token. The token might not have accepted the terms for this specific dataset. Let me check if the modelopt script actually needs this dataset or if we can just use `--dataset` to override:
|
||||
user: System: [2026-05-08 15:38:05 UTC] Exec completed (marine-n, code 0) :: Fetching 204 files: 0%| | 0/204 [00:00<?, ?it/s] Fetching 204 files: 0%| | 0/204 [00:00<?, ?it/s] Error: Access denied. This repository requires approval. [90mSet HF_DEBUG=1 as en…
|
||||
|
||||
Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Fri 2026-05-08 15:39 UTC] all you have to do is say so and ill get you a new token that has access, jeeze. try this one `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`
|
||||
assistant: All 204 files downloaded. Token's persisted, dataset is cached. Now fire it up:
|
||||
assistant: Dataset pre-downloaded, token persisted, screen running. ~54 min for model load then calibration should find the dataset in cache. We're synced. 🔧
|
||||
user: System: [2026-05-08 15:39:53 UTC] Exec completed (salty-re, code 0) :: /204 [00:36<00:01, 5.63it/s] Fetching 204 files: 98%|█████████▊| 199/204 [00:37<00:00, 5.23it/s] Fetching 204 files: 99%|█████████▉| 202/204 [00:37<00:00, 6.21it/s] Fetching 204 f…
|
||||
|
||||
Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Fri 2026-05-08 16:44 UTC] Ok looks like gpu memory cleared. Not sure if it was successful or another fail
|
||||
assistant: Same error. The `datasets` library's `load_dataset(streaming=True, ...)` isn't using the cached token or the pre-downloaded files — it's trying to resolve the dataset script via the API and failing before it even gets to the cache. This version of `datasets` (4.8.5) has a bug with gated dataset resolution in streaming mode.
|
||||
|
||||
Enough of this — let me patch modelopt's `dataset_utils.py` to pass the token explicitly:
|
||||
assistant: OOM on grep — the venv is huge. Let me be more targeted:
|
||||
assistant: Python keeps getting OOM'd. The model is still in RAM from the failed run. Let me try a different approach — just use `sed` to patch the file:
|
||||
assistant: Everything is getting OOM killed — the model is still resident. Let me clear it first:
|
||||
assistant: Even ssh is getting OOM'd. The system is completely swamped. The screen session's python process probably still has the 3TB model mapped in RAM.
|
||||
assistant: SSH itself is getting OOM killed. The B200 is completely out of memory. Mike, the box is thrashing — the failed quant process is still eating all the RAM. You'll need to kill the screen session or reboot the machine. Can you hop on and `screen -X -S nvfp4-full quit` or just `reboot`?
|
||||
user: System: [2026-05-08 16:44:46 UTC] Exec failed (briny-ba, signal SIGKILL)
|
||||
System: [2026-05-08 16:44:59 UTC] Exec failed (vivid-ot, signal SIGKILL)
|
||||
System: [2026-05-08 16:45:13 UTC] Exec failed (quick-bl, signal SIGKILL)
|
||||
System: [2026-05-08 16:45:34 UTC] Exec failed (nimble-t, signal SIGKILL)
|
||||
System: [2026-05-08 16:45:51 UTC] Exec failed (tidy-ree, signal SIGKILL)
|
||||
|
||||
Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Fri 2026-05-08 16:47 UTC] you're trying to loginto `b200-nodepool-133105fd6865` ?
|
||||
assistant: No, I'm using `45.76.247.107` from the `.env`. Is that the right address?
|
||||
28
memory/2026-05-08-deepseek-v4-quant-resume.md
Normal file
28
memory/2026-05-08-deepseek-v4-quant-resume.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# 2026-05-08 DeepSeek V4 Pro NVFP4 — Session Resume
|
||||
|
||||
## Status: Ready to fire (awaiting Mike's go)
|
||||
|
||||
### What changed from last session
|
||||
- We are doing **FULL NVFP4 quantization** (not experts-only). Upcasting mixed-precision to pure BF16, then full quant to NVFP4.
|
||||
- Calibration size is **128** (not 256).
|
||||
- Source model: `/root/nvidia-meeting/DeepSeek-V4-Pro-BF16` (dequantized from FP8)
|
||||
- Previous run failed at calibration: `nvidia/Nemotron-Post-Training-Dataset-v2` is gated, HF token wasn't being picked up by modelopt's `load_dataset()` call
|
||||
- **Root cause:** modelopt's `dataset_utils.py:303` calls `load_dataset(streaming=True, **config, split=s)` with NO token parameter — relies on env var or `~/.cache/huggingface/token`
|
||||
|
||||
### Token fixes applied (this session)
|
||||
- New HF token: `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` (verified working — write permission, dataset access confirmed)
|
||||
- Token planted in: `~/.bashrc`, `~/.profile`, `/etc/environment`, `~/.huggingface/token`, `~/.cache/huggingface/token` (via `hf auth login`)
|
||||
- Script updated to export both `HF_TOKEN` and `HUGGING_FACE_HUB_TOKEN` and echo the token at runtime
|
||||
|
||||
### Script
|
||||
- `/root/nvidia-meeting/deepseek-v4-quant/scripts/model_opt_nvfp4_full.py`
|
||||
- Config: `nvfp4`, `tp 8`, `calib 128`, `--use_seq_device_map`, `--gpu_max_mem_percentage 0.7`
|
||||
- Output: `saved_models_DeepSeek-V4-Pro-BF16_nvfp4_kv_fp8_cast` (currently empty — clean start)
|
||||
|
||||
### Repo sync
|
||||
- Local and B200 both on `modelopt-nvfp4` branch, commit `075da67`
|
||||
|
||||
### B200 Node
|
||||
- IP: 45.76.247.107, user: root, pass: 6)Jr)B@dcX[mN?dx
|
||||
- GPUs: all 8× B200 idle (0 MiB used)
|
||||
- No screen sessions running
|
||||
63
memory/2026-05-09-deepseek-v4-quant.md
Normal file
63
memory/2026-05-09-deepseek-v4-quant.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# 2026-05-09 DeepSeek V4 Pro NVFP4 Quantization — Run 4
|
||||
|
||||
## Status: 🔄 RUNNING (started ~08:05 UTC May 9, expected completion ~15:00 UTC)
|
||||
|
||||
### Cost Context
|
||||
$23/hr per B200 node. Each run is ~7 hours = ~$161. Don't waste runs.
|
||||
If Mike asks about this project, he's tracking the spend.
|
||||
|
||||
### Where to check status
|
||||
- **SSH:** `ssh root@45.76.247.107` (password: `6)Jr)B@dcX[mN?dx`)
|
||||
- **Screen:** `screen -r quantize`
|
||||
- **Log:** `tail -f /root/nvidia-meeting/quantize_nvfp4.log`
|
||||
- **GPU:** `nvidia-smi`
|
||||
- **Calibration progress:** grep for `XXX/128` in the log
|
||||
|
||||
### What's running
|
||||
- **Script:** `/root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py`
|
||||
- **Commit:** `f9bbef8`
|
||||
- **Source model:** `/root/nvidia-meeting/DeepSeek-V4-Pro-BF16`
|
||||
- **Export dir:** `/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4`
|
||||
- **Config:** nvfp4, 128 calib samples, calib_seq=512, kv_fp8_cast, gpu_mem_pct=0.7
|
||||
|
||||
### Run Progression (DO NOT GO BACKWARDS)
|
||||
|
||||
| Run | Result | Commit | Root Cause | Fix Applied |
|
||||
|-----|--------|--------|------------|-------------|
|
||||
| 1 | ❌ batch probing shape mismatch | shell wrapper (pre-repo) | FP8 source + finegrained_fp8 wraps MLA wrong | Use BF16 source |
|
||||
| 2 | ❌ export crash after 128/128 calib | shell wrapper (pre-repo) | stale GPU amax → cudaErrorIllegalAddress in get_activation_scaling_factor | CPU patches + snapshot |
|
||||
| 3 | ❌ model loading OOM | `3907838` | AutoModelForCausalLM.from_pretrained torch.cat on experts (31.5GB, 25.9GB free) | Use modelopt get_model() |
|
||||
| 4 | ❌ quantize config crash | `86dd8df` | `mtq.KV_QUANT_CFG_CHOICES` doesn't exist — it's in `hf_ptq` | Import from hf_ptq |
|
||||
| 5 | ❌ same as Run 4 | `f9bbef8` | same import bug | same fix, proper sync |
|
||||
| 6 | ❌ dataloader crash | `6c1bff6` | make_calib_dataloader missing args (dataset, calib_with_images, etc.) | Add all required args |
|
||||
| 7 | ❌ same root cause | `25b4d8d` | dataset=None, len() on None | Provide actual dataset list |
|
||||
| 9 | ❌ TypeError | `a300302` | calib_size stays as string "128" — __main__ post-parse conversions skipped | Apply same conversions after parse_args() |
|
||||
| 10 | ❌ export crash (calib ✅) | `5a72da7` | get_weight_scaling_factor reads stale GPU weight → cudaErrorIllegalAddress | Patch _export_quantized_weight to force weight to CPU at entry |
|
||||
| 11 | 🔄 running | `07cd50e` | TBD | 8 patches covering full export chain |
|
||||
|
||||
### What's in the current script (f9bbef8)
|
||||
1. **Model loading** — `get_model()` from modelopt (handles max_memory, avoids OOM)
|
||||
2. **load_calib_amax patch** — after amax is written to GPU, immediately moves to CPU
|
||||
3. **export_amax patch** — safety net, moves any remaining GPU amax to CPU before reading
|
||||
4. **get_activation_scaling_factor patch** — clamp instead of assert, CPU move
|
||||
5. **snapshot_amax_to_cpu()** — walks all quantizers after calibration, saves to disk (~50MB)
|
||||
6. **force_all_amax_to_cpu()** — moves _pre_quant_scale, _global_amax too
|
||||
7. **--export-only** — retry export from saved state
|
||||
8. **--validate-only** — check saved state integrity
|
||||
|
||||
Only divergence from modelopt example: `get_model()` instead of raw `AutoModelForCausalLM.from_pretrained`.
|
||||
|
||||
### If Run 4 fails
|
||||
Check the log. Identify crash point. Add to table. Fix. Don't repeat earlier mistakes.
|
||||
|
||||
### Repo State
|
||||
- **Local:** `/home/openclaw/dev/deepseek-v4-quant` (branch `modelopt-nvfp4`)
|
||||
- **Remote:** `https://sweetapi.com/biondizzle/deepseek-v4-quant.git`
|
||||
- **On B200:** `/root/nvidia-meeting/deepseek-v4-quant` (synced to same commit)
|
||||
- **Must run from:** `/root/nvidia-meeting/modelopt-repo/examples/llm_ptq` (relative imports)
|
||||
|
||||
### B200 Node Details
|
||||
- 8× B200, 2.7TB RAM, 13TB NVMe
|
||||
- venv: `/root/nvidia-meeting/venv`
|
||||
- modelopt: `0.45.0.dev64+g579fc6c31` (from git)
|
||||
- transformers: `5.8.0.dev0` (from git, required for V4)
|
||||
24
memory/2026-05-10-session-reset.md
Normal file
24
memory/2026-05-10-session-reset.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Session: 2026-05-10 23:05:46 UTC
|
||||
|
||||
- **Session Key**: agent:main:main
|
||||
- **Session ID**: 2187c0aa-d111-453a-a1ff-ad229ea0d352
|
||||
- **Source**: webchat
|
||||
|
||||
## Conversation Summary
|
||||
|
||||
user: Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 23:05 UTC] The read me and memory has been updated so we can pick this back up? if it has, im clearing this session because openclaw is getting fragile with this single long running session going
|
||||
assistant: Let me check the current state of things.
|
||||
assistant: Yeah, you're good to clear it. Everything's captured:
|
||||
|
||||
- **MEMORY.md** — High-level project summary, node details, key gaps, repo state
|
||||
- **memory/2026-05-10.md** — Full debug session writeup: S11→S14 progress, all bugs found, current status, architecture decisions, file paths
|
||||
|
||||
On fresh session I'll read those and pick right back up at the NVFP4→FP8 conversion with the E2M1 sign fix that hasn't been tested yet. Go ahead and nuke it.
|
||||
69
memory/2026-05-10-vllm-cleanup.md
Normal file
69
memory/2026-05-10-vllm-cleanup.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# Session: 2026-05-10 16:21:39 UTC
|
||||
|
||||
- **Session Key**: agent:main:main
|
||||
- **Session ID**: f727c6b7-5bdb-43bd-a7f1-3262779abecf
|
||||
- **Source**: webchat
|
||||
|
||||
## Conversation Summary
|
||||
|
||||
assistant: Good context — we're breaking new ground here. Let me get the current container status, then sync everything up.
|
||||
assistant: OK so the latest run is still hitting `KeyError` — `fused_wqa_wkv.weight` isn't found because my `params_dict[name]` skip only covers the "else" branch, not all code paths. But that's fine — the current state is still being debugged. Let me now sync the repo with all our work and update the README.
|
||||
assistant: Good context — we're breaking new ground here. Let me get the current container status, then sync everything up.OK so the latest run is still hitting `KeyError` — `fused_wqa_wkv.weight` isn't found because my `params_dict[name]` skip only covers the "else" branch, not all code paths. But that's fine — the current state is still being debuged. Let me now sync the repo with all our work and update the README.
|
||||
user: Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 16:09 UTC] I dont mean to interrupt. Just so I can make sure we're on the right path, don't focus on solving the current bug or re running the script. Lets just get everything in a good clean documented state like i mentioned earlier and then we can go in with fresh context once I'm caught up and can get a second pair of eyes on it. Cool?
|
||||
assistant: Got it. Clean up and document. No more debugging. Let me sync everything and update the README.
|
||||
assistant: Good. Now let me write the updated README:
|
||||
assistant: Now update `serve_vllm.py` to add a note about mega_moe being disabled:
|
||||
assistant: Good. Now commit and push:
|
||||
assistant: Now make sure the B200 has the latest synced files too:
|
||||
assistant: Good. Everything is synced. Let me also save a memory file for continuity:
|
||||
assistant: All synced. Here's the summary, Mike:
|
||||
|
||||
**Repo:** Everything committed and pushed to `modelopt-nvfp4`. B200 node also pulled latest. Files updated:
|
||||
- `README.md` — Full vLLM serving run history (S1→S10), open issues, vLLM bug list, updated file layout
|
||||
- `patches/deepseek_v4.py` — Patched vllm source with all our name mappings, E2M1 unpacking, skip patterns, resilient loading
|
||||
- `patches/patch_vllm_weights.py` — Updated but marked as legacy (doesn't work with worker processes)
|
||||
- `docker-compose.yml` — No mega_moe, copies patched source file at startup
|
||||
- `scripts/serve_vllm.py` — Warning comment on mega_moe flag
|
||||
|
||||
**Where we left off:** We're 10 iterations deep into the vLLM weight loading. The core unsolved problem is `MergedColumnLinear` + NVFP4 — vllm's `ModelOptNvFp4Config` only creates proper NVFP4 params for `Linear` layers, not the stacked `MergedColumnLinear` used for fused attention (`fused_wqa_wkv`, `fused_wkv_wgate`) and shared expert (`gate_up_proj`) projections. Our workaround (unpack uint8→bf16, skip scales) gets past the weight loading but loses the calibration-optimized scale factors for those layers.
|
||||
|
||||
Fresh eyes might see a cleaner path. The `ModelOptNvFp4Config.create_weights()` method is the right place to fix it properly — it needs to handle `MergedColumnLinear` and create `PackedColumnParameter` + stacked scale params.
|
||||
user: Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 16:16 UTC] Perfect, ok. And it looks like in here you have a full history of config.json tweaks you made so we have a record of those incase they're causing an issue. Is that correct?
|
||||
assistant: Yeah — the config.json patches are all documented in the README under the "Model Config Patches (post-export)" table. That covers the 4 fields we patched:
|
||||
|
||||
1. `compress_ratios` — copied from BF16 source model (62 items)
|
||||
2. `quantization_config.scale_fmt` — added `"ue8m0"`
|
||||
3. `rope_parameters` — flattened from nested dict (transformers 5.8.0 format) to flat dict
|
||||
4. `rope_scaling` — same flattening
|
||||
|
||||
The table has the original modelopt value, what vllm requires, what we patched it to, and why modelopt doesn't export it. So if any of those are causing serving issues, we can trace it back.
|
||||
|
||||
One thing that's NOT in the repo though — the actual patched `config.json` file lives on the B200 at `/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4/config.json`. It's too big for git (881GB model dir). Want me to grab a copy of just the config.json and stash it in the repo for reference?
|
||||
user: Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 16:20 UTC] Im going to give you fresh context, you have everything you need to start just from the memory file you saved on yourself?
|
||||
assistant: Yeah — between `MEMORY.md`, `memory/2026-05-10.md`, and the repo (README has the full run history, bug list, and config patch table), I've got everything. B200 creds are in the `.env`, patch logic is in `patches/deepseek_v4.py`, docker setup is in `docker-compose.yml`.
|
||||
|
||||
Fire away with the fresh context.
|
||||
94
memory/2026-05-10-vllm-modelopt-patch.md
Normal file
94
memory/2026-05-10-vllm-modelopt-patch.md
Normal file
@@ -0,0 +1,94 @@
|
||||
# Session: 2026-05-10 13:12:29 UTC
|
||||
|
||||
- **Session Key**: agent:main:main
|
||||
- **Session ID**: 8eeafe17-997c-452c-97fe-38e25742c5eb
|
||||
- **Source**: webchat
|
||||
|
||||
## Conversation Summary
|
||||
|
||||
assistant: Now let me see what vllm's mapper expects. Let me dump the full mapper:Now I can see the mapping. vllm expects names like:
|
||||
- `layers.0.attn.compressor.gate_proj...` → `model.layers.0.atn.mla_attn.compressor.gate_proj...`
|
||||
- `layers.0.shared_experts.w2` → `model.layers.0.shared_experts.down_proj`
|
||||
|
||||
Our modelopt export has:
|
||||
- `model.layers.0.self_attn.compressor.gate_proj...` (has `self_atn` not `attn`)
|
||||
- `model.layers.0.mlp.shared_experts.down_proj` (already uses `down_proj`)
|
||||
|
||||
The prefix mapper converts `layers.` → `model.layers.` and then the substr mapper converts `.attn.compressor.` → `.attn.mla_attn.compressor.`. But our weights already have `model.layers.` prefix AND use `self_attn` instead of `atn`.
|
||||
|
||||
The issue: modelopt exports with `self_attn` but vllm's mapper expects `atn` (which it then maps to `attn.mla_attn`). Let me also check what other name mismatches exist:Now I have the full picture. The main mapping issue is `self_atn` → `attn.mla_attn`. But there are also quantization suffixes (`input_scale`, `weight_scale`, `weight_scale_2`) that modelopt adds. vllm's modelopt_fp4 quantizer should handle those natively — the `WeightsMapper` just needs to map the base names correctly.
|
||||
|
||||
Let me write a mountable patch script that fixes the mapper at container startup:Now I need to figure out how to inject this patch into the container. The easiest way is to mount the patches dir and add it as an entrypoint wrapper. Let me update docker-compose:YAML escape issue. Let me fix the entrypoint:YAML's being a pain with indentation. Let me just write it properly:YAML validates. Let me start it:**It's past the weight mapper crash!** The patch is working. I can see:
|
||||
- `Detected ModelOpt NVFP4 checkpoint` ✅
|
||||
- `Using max model len 1048576` ✅
|
||||
- `DeepSeekV4MTPModel` architecture resolved ✅
|
||||
- Now it's initializing the engine...
|
||||
|
||||
This is loading the 881GB model across 8 GPUs. This will take a while. Let me check back in a few minutes:
|
||||
user: Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 09:30 UTC] fuck it. do #2 and see where ya get. If ya dont get anywhere, then ya need to look deeper into model opt and we have to patch model opt, or see if there's a seeting we missed that does that
|
||||
assistant: Already on it — I patched the weight mapper and restarted the container. It got past the `UnboundLocalError` crash. Let me check where it's at now:
|
||||
assistant: SSH connection is timing out. The machine might be under heavy load from model loading (881GB across 8 GPUs). Let me retry:
|
||||
assistant: The execs keep getting aborted. Let me try a different approach:
|
||||
user: System: [2026-05-10 09:31:01 UTC] Exec completed (calm-ree, code 0) :: INFO 05-10 09:30:59 [nixl_utils.py:20] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. INFO 05-10 09:30:59 [nixl_utils.py:32] NIXL …
|
||||
System: [2026-05-10 09:31:19 UTC] Exec completed (calm-dai, code 0) :: __', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subcl…
|
||||
System: [2026-05-10 09:31:40 UTC] Exec completed (swift-br, code 0) :: === orig_to_new_substr === .attn.compressor. -> .attn.mla_attn.compressor. .shared_experts.w2 -> .shared_experts.down_proj === orig_to_new_prefix === layers. -> model.layers. embe…
|
||||
System: [2026-05-10 09:34:38 UTC] Exec completed (briny-ri, code 0) :: ver pid=1)[0;0m [33mWARNING[0m [90m05-10 09:34:35[0m [90m[vllm.py:1252][0m Auto-initialization of reasoning token IDs failed. Please check whether your reasoning parser has implem…
|
||||
|
||||
Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 09:35 UTC] you did this in detached mode? If you did ill watch and tell you when we're good
|
||||
assistant: Yeah, it's detached — `docker compose up -d`. The container's already past the weight mapper crash (the patch worked 🎉), so it's in the model loading phase now. 881GB across 8 GPUs takes a hot minute.
|
||||
|
||||
You can tail it with:
|
||||
```
|
||||
docker compose -f /root/nvidia-meeting/docker-compose.yml logs -f
|
||||
```
|
||||
|
||||
What to look for:
|
||||
- **Good sign**: `Loading safetensors checkpoint shards: X/95` progressing
|
||||
- **Bad sign**: Another traceback, `UnboundLocalError` again, or `KeyError`
|
||||
- **Victory**: `Uvicorn running on http://0.0.0.0:8000` or `Application startup complete`
|
||||
|
||||
If it crashes again, save the last 50 lines of logs and I'll dig into whatever's next. The mapper patch covered the `self_attn` → `attn.mla_attn` renaming, but there might be other weight name mismatches I couldn't fully predict without seeing what vllm's internal model actually expects vs what modelopt gave us.
|
||||
user: Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 10:58 UTC] im in bed. check if it came up or if we neeed more patches
|
||||
assistant: Crashed again. Container exited. Let me get the root cause — the worker error:
|
||||
assistant: Same `UnboundLocalError: name_mapped`. The patch didn't take. Let me check if the patch script actually ran:
|
||||
assistant: The patch ran in the APIServer process (pid=1) but the worker processes (pid=1068-1075) are separate processes spawned later. They don't inherit the monkeypatch! The `WeightsMapper` is rebuilt from scratch inside each worker. I need a different approach — patch the actual file on disk inside the container, or use a volume mount to override it.
|
||||
|
||||
Let me go deeper. The real issue is in `utils.py`'s `_load_module` → `AutoWeightsLoader.load_weights` → `_map_name`. The `name_mapped` is set by the mapper. If the mapper can't map a name, `name_mapped` is never assigned. Let me look at that function:
|
||||
assistant: Wait — the error is at `deepseek_v4.py` line 1579, which is `loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)`. But `name_mapped` isn't in that code. It's in the `deepseek_v2.py` code. Let me check if the DeepSeek V4 model is using the V2 load_weights somehow, or if the error is coming from the `AutoWeightsLoader` internally:
|
||||
assistant: There's ANOTHER `load_weights` in `deepseek_v4.py` at line 1417 — that's NOT the `DeepseekV4ForCausalLM.load_weights` at line 1579. This is a different class (probably `DeepseekV4Model` or a sub-model). Let me see:
|
||||
assistant: There it is. Line 1417: `loaded_params.add(name_mapped)`. If none of the `expert_mapping` entries match (the `continue` on line 1403 skips), `name_mapped` is never set, and it falls through to `loaded_params.add(name_mapped)` which blows up.
|
||||
|
||||
This is the MoE expert weight loader. The expert weights from modelopt have names like `model.layers.0.mlp.experts.0.gate_proj.weight` but this code expects a different naming convention. The `expert_mapping` comes from `get_expert_mapping()`. Let me see what it expects:
|
||||
user: Sender (untrusted metadata):
|
||||
```json
|
||||
{
|
||||
"label": "openclaw-control-ui",
|
||||
"id": "openclaw-control-ui"
|
||||
}
|
||||
```
|
||||
|
||||
[Sun 2026-05-10 13:06 UTC] what happened?
|
||||
198
memory/2026-05-10.md
Normal file
198
memory/2026-05-10.md
Normal file
@@ -0,0 +1,198 @@
|
||||
# 2026-05-10
|
||||
|
||||
## DeepSeek V4 Pro NVFP4 — vLLM Serving Debug Session
|
||||
|
||||
- Quantization completed successfully (Run 11, 881GB NVFP4)
|
||||
- Spent the day debugging vLLM serving of the modelopt NVFP4 checkpoint
|
||||
- Key finding: modelopt and vllm were never integrated for NVFP4 on DeepSeek V4
|
||||
- NVIDIA themselves haven't gotten this far — we're in uncharted territory
|
||||
|
||||
### What we fixed:
|
||||
- Expert weight name mapping (gate_proj→w1, up_proj→w3, down_proj→w2)
|
||||
- mlp→ffn module naming
|
||||
- Attention: self_attn→attn.mla_attn, kv_proj→wkv, etc.
|
||||
- Compressor: kv_proj→wkv, gate_proj→wgate
|
||||
- kv_norm moved from compressor to attention level
|
||||
- Class attribute patching (hf_to_vllm_mapper)
|
||||
- Source file patching (workers are separate processes)
|
||||
- E2M1 FP4→BF16 unpacking for stacked attention params
|
||||
- Skip patterns for NVFP4 scale tensors on MergedColumnParallelLinear
|
||||
|
||||
### What we abandoned:
|
||||
- mega_moe: No NVFP4 kernel exists, format mismatch (16-col vs 32-col blocks)
|
||||
- Runtime monkey-patching: Workers don't inherit patches
|
||||
|
||||
### Open issues (stop point):
|
||||
1. MergedColumnParallelLinear + NVFP4 incompatibility — ModelOptNvFp4Config only handles Linear, not MergedColumn. Weight param is bf16 (should be uint8), no weight_scale registered for stacked params
|
||||
2. Unknown params from modelopt (compressor.position_bias) crash loading
|
||||
3. Current approach (unpack uint8→bf16, skip scales) loses calibration-optimized scales for attention weights
|
||||
|
||||
### Repo state:
|
||||
- All code/patches/docker-compose synced and committed on modelopt-nvfp4 branch
|
||||
- README fully updated with vLLM serving run history, open issues, bug list
|
||||
- B200 node at 45.76.247.107, weights at /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4
|
||||
# 2026-05-10
|
||||
|
||||
## DeepSeek V4 Pro NVFP4 — vLLM Serving Debug Session
|
||||
|
||||
- Quantization completed successfully (Run 11, 881GB NVFP4)
|
||||
- Spent the day debugging vLLM serving of the modelopt NVFP4 checkpoint
|
||||
- Key finding: modelopt and vllm were never integrated for NVFP4 on DeepSeek V4
|
||||
- NVIDIA themselves haven't gotten this far — we're in uncharted territory
|
||||
|
||||
### What we fixed:
|
||||
- Expert weight name mapping (gate_proj→w1, up_proj→w3, down_proj→w2)
|
||||
- mlp→ffn module naming
|
||||
- Attention: self_attn→attn.mla_attn, kv_proj→wkv, etc.
|
||||
- Compressor: kv_proj→wkv, gate_proj→wgate
|
||||
- kv_norm moved from compressor to attention level
|
||||
- Class attribute patching (hf_to_vllm_mapper)
|
||||
- Source file patching (workers are separate processes)
|
||||
- E2M1 FP4→BF16 unpacking for stacked attention params
|
||||
- Skip patterns for NVFP4 scale tensors on MergedColumnParallelLinear
|
||||
|
||||
### What we abandoned:
|
||||
- mega_moe: No NVFP4 kernel exists, format mismatch (16-col vs 32-col blocks)
|
||||
- Runtime monkey-patching: Workers don't inherit patches
|
||||
|
||||
### Open issues (stop point):
|
||||
1. MergedColumnParallelLinear + NVFP4 incompatibility — ModelOptNvFp4Config only handles Linear, not MergedColumn. Weight param is bf16 (should be uint8), no weight_scale registered for stacked params
|
||||
2. Unknown params from modelopt (compressor.position_bias) crash loading
|
||||
3. Current approach (unpack uint8→bf16, skip scales) loses calibration-optimized scales for attention weights
|
||||
|
||||
### Repo state:
|
||||
- All code/patches/docker-compose synced and committed on modelopt-nvfp4 branch
|
||||
- README fully updated with vLLM serving run history, open issues, bug list
|
||||
- B200 node at 45.76.247.107, weights at /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4
|
||||
|
||||
---
|
||||
|
||||
## vLLM NVFP4 Serving — Second Session (16:28–19:35 UTC)
|
||||
|
||||
Mike gave autonomous work instructions. Key directive: use weights AS-IS (NVFP4), do NOT convert to MXFP4. Try FusedMoE first, then if stuck, build a mega_moe NVFP4 kernel from scratch.
|
||||
|
||||
### Major breakthroughs (S11→S14 progress):
|
||||
|
||||
**Key insight: vLLM attention forward bypasses quant_method, uses deepseek_v4_fp8_einsum directly**
|
||||
- The attention code reads `self.wo_a.weight` (expects fp8) and `self.wo_a.weight_scale_inv` directly
|
||||
- NVFP4 uint8 weights are incompatible with this FP8 kernel
|
||||
- Solution: **NVFP4→bf16→FP8 dequantize/requant at load time** for attention layers
|
||||
|
||||
**S12 fixes applied (weight loading now succeeds to 94%):**
|
||||
1. **Substr mapping fix**: Removed `.mla_attn.` prefix from attention projections. The model has `fused_wqa_wkv`, `wq_b`, `wo_a`, `wo_b` at `attn.*` level, not `attn.mla_attn.*`. The stacking code then correctly maps `attn.wq_a` → `attn.fused_wqa_wkv`.
|
||||
2. **Skip patterns fix**: Only skip compressor scale tensors (compressor uses `UnquantizedLinearMethod` with `quant_config=None`). Attention and shared expert scales now correctly load via stacking logic.
|
||||
3. **Suffix mapping fix**: Removed `"head.weight": "lm_head.weight"` which caused `lm_head.weight` → `lm_lm_head.weight` doubling.
|
||||
4. **Resilient loading**: Unknown params (e.g., `compressor.position_bias`) silently skipped.
|
||||
|
||||
**S13 — Weight loading SUCCESS (32 seconds!)**
|
||||
- All 95 safetensors loaded without KeyError
|
||||
- New error: `MergedColumnParallelLinear` has no `weight_scale_inv` (FP8 attribute)
|
||||
|
||||
**S13.5 — o_a_proj discovery:**
|
||||
- modelopt did NOT quantize `o_a_proj` — it's bf16 in the checkpoint (no scales)
|
||||
- But vLLM creates `wo_a` with NVFP4 quant (uint8 weight + scales)
|
||||
- Fix: convert bf16→FP8 directly at load time, set weight_scale_inv
|
||||
|
||||
**S14 — NVFP4→FP8 post-load conversion approach:**
|
||||
- Added `_convert_nvfp4_attention_to_fp8()` and `_convert_nvfp4_module_to_fp8()` methods to `DeepseekV4Model`
|
||||
- Converts all uint8 NVFP4 attention weights (fused_wqa_wkv, wq_b, wo_a, wo_b, gate_up_proj) to FP8 at load time
|
||||
- Steps: unpack E2M1 FP4→bf16, dequantize with block/global scales, requantize to FP8 e4m3, set weight_scale_inv
|
||||
- For o_a_proj (bf16, no scales): convert directly bf16→FP8
|
||||
- For compressor fused_wkv_wgate: stays bf16 (UnquantizedLinearMethod)
|
||||
- For MoE experts: handled natively by ModelOptNvFp4FusedMoE
|
||||
|
||||
**Bug found: E2M1 LUT indexing off-by-one**
|
||||
- FP4 4-bit values are 0-15 (bit 3 = sign, bits 0-2 = magnitude)
|
||||
- LUT has 8 entries (magnitudes 0-7), but code was indexing with full 4-bit value (0-15) → CUDA assert
|
||||
- Fix: mask with `& 0x07` for magnitude index, apply sign from bit 3 separately
|
||||
|
||||
**Bug found: method placement inside Python class**
|
||||
- `_convert_nvfp4_attention_to_fp8` was being placed at top level (0 indent) instead of inside `DeepseekV4Model`
|
||||
- The class actually ends at `finalize_mega_moe_weights()` (line ~1600), followed by top-level `hc_head` function
|
||||
- Had to insert methods BEFORE the `@torch.compile` decorator that marks the class boundary
|
||||
|
||||
**Bug found: logger not available in method**
|
||||
- `logger.info_once()` isn't accessible inside the conversion methods
|
||||
- Replaced with `print(f"...")` for now
|
||||
|
||||
### Current status (as of 19:35 UTC):
|
||||
- Weight loading + NVFP4→FP8 conversion code is in place
|
||||
- Last test was running (loading 880GB checkpoint)
|
||||
- E2M1 sign handling fix applied but NOT YET TESTED
|
||||
- Need to fix `logger` → `print` issue
|
||||
- After load succeeds: FusedMoE expert weight handling needs verification
|
||||
- If FusedMoE fails: need to build mega_moe NVFP4 kernel
|
||||
|
||||
### Key files on B200 node:
|
||||
- Patch: `/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py`
|
||||
- Docker: `docker compose up -d` (TP=8, no mega_moe, FLASHINFER_TRTLLM attn)
|
||||
- Weights: `/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4/`
|
||||
|
||||
### Architecture decisions:
|
||||
- NVFP4→FP8 for attention/shared_experts (requant, preserves FP8 kernel compat)
|
||||
- BF16 for compressor (UnquantizedLinearMethod, no quant_config)
|
||||
- Native NVFP4 for MoE experts (ModelOptNvFp4FusedMoE handles it)
|
||||
- UnquantizedLinearMethod as no-op quant_method (attention forward bypasses it anyway)
|
||||
|
||||
---
|
||||
|
||||
## vLLM NVFP4 Serving — Third Session (23:05+ UTC)
|
||||
|
||||
### Current state of the B200 node:
|
||||
- Docker container ran 27 min ago and crashed with `BFloat16 != Float8_e4m3fn`
|
||||
- Uncommitted changes to `patches/deepseek_v4.py` (the _convert_nvfp4_post_load methods)
|
||||
- Repo on `modelopt-nvfp4` branch, last commit `db16be8`
|
||||
|
||||
### Crash analysis (S15 — `BFloat16 != Float8_e4m3fn`):
|
||||
|
||||
Weight loading succeeds (95/95, 330s). Post-load conversion reports: 122 layers → FP8, 183 → BF16. MoE setup runs. Crash during profile_run/_dummy_run.
|
||||
|
||||
**Root cause**: `_convert_nvfp4_post_load` converts `fused_wqa_wkv` to FP8 and sets `quant_method = UnquantizedLinearMethod()`. The attention forward calls `self.fused_wqa_wkv(hidden_states)` which goes through `UnquantizedLinearMethod.forward()` → `F.linear(bf16_input, fp8_weight)` → dtype mismatch.
|
||||
|
||||
**Key insight about the attention forward paths**:
|
||||
- `wo_a`: Attention code reads `self.wo_a.weight` and `self.wo_a.weight_scale_inv` DIRECTLY, passes to `deepseek_v4_fp8_einsum`. This bypasses `quant_method`. FP8 conversion works here.
|
||||
- `fused_wqa_wkv`: Called via `self.fused_wqa_wkv(hidden_states)` → `MergedColumnParallelLinear.forward()` → `quant_method.forward()`. **Cannot be FP8 with UnquantizedLinearMethod**.
|
||||
- `wq_b`, `wo_b`: Called via normal `.forward()`. Need BF16 + UnquantizedLinearMethod.
|
||||
- `compressor.fused_wkv_wgate`: Called via `torch.mm(hidden_states, weight.T, out_dtype=torch.float32)` DIRECTLY. **Needs BF16 weight** — currently uint8 (not in any conversion set!).
|
||||
|
||||
**Critical finding from safetensors**: `o_a_proj.weight` is BF16 (modelopt did NOT quantize it). So `wo_a` weight is already BF16, not NVFP4. The post-load conversion code's `dtype != uint8` check skips it. This means `wo_a.weight` stays BF16 and `wo_a.weight_scale_inv` is never created. When `deepseek_v4_fp8_einsum` tries to read it as FP8 → crash.
|
||||
|
||||
**Wait, but the log says 122 → FP8.** 61 layers × 2 (fused_wqa_wkv + wo_a) = 122. If wo_a.weight is BF16 and gets skipped, only 61 → FP8. The 122 count means wo_a IS being converted somehow. Hypothesis: `ModelOptNvFp4LinearMethod.create_weights()` creates `wo_a.weight` as uint8. When the BF16 checkpoint data is loaded into the uint8 param, the weight_loader might be casting it, or the param might be updated to BF16. Need to verify.
|
||||
|
||||
### Unfixed bugs from S14 (still present):
|
||||
1. E2M1 sign handling fix applied but NOT TESTED
|
||||
2. `logger` → `print` issue in conversion methods
|
||||
|
||||
### Compressor `fused_wkv_wgate` — PENDING CRASH:
|
||||
- NOT in any conversion set (fp8_proj_names, bf16_proj_names, bf16_shared_names)
|
||||
- Weight is uint8 after loading (NVFP4 packed)
|
||||
- Forward uses `torch.mm(hidden_states, weight.T, out_dtype=torch.float32)` directly
|
||||
- uint8 × BF16 would crash with a different error than the current one
|
||||
- Needs BF16 dequantization in post-load conversion
|
||||
|
||||
### Checkpoint key format (verified from safetensors):
|
||||
- `model.layers.0.self_attn.q_a_proj.weight` — uint8
|
||||
- `model.layers.0.self_attn.q_a_proj.weight_scale` — float8_e4m3fn (block scale)
|
||||
- `model.layers.0.self_attn.q_a_proj.weight_scale_2` — float32 (per-tensor)
|
||||
- `model.layers.0.self_attn.q_a_proj.input_scale` — float32
|
||||
- `model.layers.0.self_attn.o_a_proj.weight` — **BF16** (NOT quantized by modelopt)
|
||||
- `model.layers.0.self_attn.o_b_proj.weight` — uint8
|
||||
- `model.layers.0.self_attn.kv_proj.weight` — uint8
|
||||
- `model.layers.0.self_attn.compressor.kv_proj.weight` — uint8
|
||||
- `model.layers.0.self_attn.compressor.gate_proj.weight` — uint8
|
||||
- `model.layers.0.self_attn.compressor.position_bias` — BF16 (unknown param, skipped)
|
||||
- Expert scales: `.weight_scale`, `.weight_scale_2`, `.input_scale` (NOT `.scale`)
|
||||
|
||||
### FusedMoE NVFP4 status:
|
||||
- `ModelOptNvFp4FusedMoE` creates proper uint8 weights + float8_e4m3fn block scales + float32 per-tensor/input scales
|
||||
- `process_weights_after_loading` calls `convert_to_nvfp4_moe_kernel_format` then `make_nvfp4_moe_kernel`
|
||||
- Uses `cutlass_fp4_gemm` via nvfp4 backend
|
||||
- Warning: `w1_weight_scale_2 must match w3_weight_scale_2` — modelopt gives different global scales to w1 and w3, but FusedMoE uses a single w13_weight_scale_2 (takes w1's). Minor accuracy impact.
|
||||
- `expert_dtype: fp4` in config — causes weight mapper to use `.scale` → `.weight_scale` regex, but checkpoint already uses `.weight_scale` directly, so regex is a no-op. Correct behavior.
|
||||
- `scale_fmt: "ue8m0"` in config — used by attention FP8 einsum. Correct for NVFP4.
|
||||
|
||||
### Config verification:
|
||||
- `compress_ratios` ✅ (copied from BF16 source)
|
||||
- `scale_fmt: "ue8m0"` ✅ (added by us)
|
||||
- `rope_parameters` ✅ (flattened)
|
||||
- `expert_dtype: fp4` ✅ (original, correct for weight mapper regex)
|
||||
58
memory/2026-05-11.md
Normal file
58
memory/2026-05-11.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# 2026-05-11 — DeepSeek V4 NVFP4 vLLM Serving: Full End-to-End
|
||||
|
||||
## 🎉 SERVER RUNNING ON PORT 8000
|
||||
|
||||
The vLLM server successfully loads the NVFP4 model and serves API requests on 8× B200.
|
||||
|
||||
### What We Fixed (Session Summary)
|
||||
|
||||
#### 1. DeepGEMM `sf.dim()` Assertion (CRITICAL)
|
||||
- **Error**: `Assertion error layout.hpp:94: sf.dim() == num_groups + 2`
|
||||
- **Cause**: `weight_scale_inv` was 1D per-tensor scale. DeepGEMM expects 2D/3D block-scale tensor from `transform_sf_into_required_layout`.
|
||||
- **Fix**: Use `deepgemm_post_process_fp8_weight_block(wq, ws, quant_block_shape=(128,128), use_e8m0=True)` to produce correct block-scale format. Store result in `weight_scale_inv`.
|
||||
- **Key insight**: The attention runtime reads `self.wo_a.weight_scale_inv` as `b_scale` for the einsum. It MUST be the DeepGEMM-formatted block scale.
|
||||
|
||||
#### 2. Block Scale dtype
|
||||
- **Error**: `Expected float32 or float8_e8m0fnu, got float8_e4m3fn`
|
||||
- **Fix**: Create block scale as `dtype=torch.float32`
|
||||
|
||||
#### 3. Missing `deepgemm_post_process` args
|
||||
- **Error**: `missing 2 required positional arguments: 'quant_block_shape' and 'use_e8m0'`
|
||||
- **Fix**: Pass `quant_block_shape=(128, 128)` and `use_e8m0=True`
|
||||
|
||||
#### 4. Compressor Indexer Shape Mismatch (CRITICAL)
|
||||
- **Error**: `split_with_sizes expects 2048, got split_sizes=[256, 256]`
|
||||
- **Cause**: `_reconstruct_compressor_weight` used wrong checkpoint prefix for indexer. Main compressor keys: `compressor.kv_proj.*`. Indexer keys: `compressor.indexer.kv_proj.*`. Loading main compressor weight into indexer's fused_wkv_wgate = 4× size mismatch.
|
||||
- **Fix**: Added `sub_path` parameter, pass `".indexer"` for indexer compressors.
|
||||
|
||||
#### 5. All-Ones Block Scale → Garbage Output (CRITICAL)
|
||||
- **Symptom**: Server runs, outputs tokens, but text is incoherent gibberish (repeating "Palm", "sulfuric", "东海")
|
||||
- **Cause**: Block scale was `torch.ones(...)` = 1.0. DeepGEMM divides by block scale at runtime, so output was divided by 1.0 instead of actual fp8_scale.
|
||||
- **Fix**: `torch.full(..., fp8_scale.item())` — fill each block with the per-tensor FP8 scale value.
|
||||
|
||||
### Conversion Summary
|
||||
- 61 NVFP4→FP8 (wo_a attention, DeepGEMM block-scale BMM einsum)
|
||||
- 0 BF16→FP8
|
||||
- 305 attn/shared→BF16 (UnquantizedLinearMethod)
|
||||
- 91 compressor→BF16 (reconstructed from separate NVFP4 kv_proj+gate_proj)
|
||||
- MoE experts: stay NVFP4 (FLASHINFER_TRTLLM FusedMoE backend)
|
||||
|
||||
### Architecture Map
|
||||
```
|
||||
wo_a → FP8 + DeepGEMM block scale (weight_scale_inv = dg_ws)
|
||||
fused_wqa_wkv, wo_b → BF16 (UnquantizedLinearMethod)
|
||||
compressor.fused_wkv_wgate → BF16 (read from checkpoint, unpack, dequant, cat)
|
||||
shared_expert → FP8 (Fp8LinearMethod, DeepGEMM)
|
||||
MoE w13/w2 → NVFP4 (FusedMoE, FLASHINFER_TRTLLM)
|
||||
```
|
||||
|
||||
### Key Code Locations
|
||||
- Patch: `/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py`
|
||||
- Runtime attention: `deepseek_v4_attention.py:319` — reads `wo_a.weight_scale_inv`
|
||||
- Runtime einsum: `deepseek_v4_fp8_einsum` → DeepGEMM `fp8_einsum`
|
||||
- DeepGEMM scale format: `deepgemm_post_process_fp8_weight_block` in `fp8_utils.py`
|
||||
- Compressor forward: `deepseek_compressor.py:281` — `kv, score = kv_score.split(...)`
|
||||
|
||||
### Outstanding Issues
|
||||
- **Output quality**: Still producing garbled text after block-scale fix. Need to verify the latest fix (fp8_scale in block scale) produces coherent output.
|
||||
- Possible causes if still garbled: subtle dequant bug, sign handling in E2M1 unpack, wrong scale ordering
|
||||
2416
patches/deepseek_v4.py
Normal file
2416
patches/deepseek_v4.py
Normal file
File diff suppressed because it is too large
Load Diff
1719
patches/deepseek_v4.py.bak
Normal file
1719
patches/deepseek_v4.py.bak
Normal file
File diff suppressed because it is too large
Load Diff
1799
patches/deepseek_v4.py.s11
Normal file
1799
patches/deepseek_v4.py.s11
Normal file
File diff suppressed because it is too large
Load Diff
1155
patches/deepseek_v4_attention.py
Normal file
1155
patches/deepseek_v4_attention.py
Normal file
File diff suppressed because it is too large
Load Diff
133
patches/nvfp4_linear.py
Normal file
133
patches/nvfp4_linear.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""
|
||||
NVFP4 Linear Method — runs BF16 input through DeepGEMM fp8_fp4_gemm natively.
|
||||
|
||||
Weight format: NVFP4 (E2M1 packed int8 + UE4M3 block16 scales + float32 global scale)
|
||||
Activation: BF16 → FP8 e4m3fn with UE8M0 per-token scales
|
||||
GEMM: deep_gemm.fp8_fp4_gemm_nn(a=(fp8, ue8m0_scale), b=(nvfp4_packed, float32_scale))
|
||||
Output: BF16
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from vllm.model_executor.layers.linear import LinearMethodBase
|
||||
|
||||
|
||||
class NVFP4LinearMethod(LinearMethodBase):
|
||||
"""Linear method that runs BF16 x NVFP4 via DeepGEMM fp8_fp4_gemm.
|
||||
|
||||
The layer must have:
|
||||
- weight: E2M1 packed int8 (2 values per byte), shape (N, K//2)
|
||||
- weight_scale: float8_e4m3fn UE4M3 block scales, shape (N, K//16)
|
||||
- weight_scale_2: float32 global scale, shape (num_logical_weights,)
|
||||
- input_scale: float32 activation scale (unused, dynamic quant)
|
||||
"""
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
layer: nn.Module,
|
||||
input_size_per_partition: int,
|
||||
output_partition_sizes: list[int],
|
||||
input_size: int,
|
||||
output_size: int,
|
||||
params_dtype: torch.dtype,
|
||||
**extra_weight_attrs,
|
||||
):
|
||||
pass
|
||||
|
||||
def process_weights_after_loading(self, layer: nn.Module) -> None:
|
||||
"""Fold global scale into block scales and prepare for DeepGEMM consumption."""
|
||||
w_data = layer.weight.data
|
||||
device = w_data.device
|
||||
|
||||
if w_data.dtype not in (torch.uint8, torch.int8):
|
||||
return
|
||||
|
||||
N = w_data.shape[0]
|
||||
K = w_data.shape[1] * 2 # unpacked K
|
||||
|
||||
# Get block scales
|
||||
sf_e4m3 = None
|
||||
for attr in ("weight_scale", "weight_scale_inv"):
|
||||
if hasattr(layer, attr):
|
||||
sf_e4m3 = getattr(layer, attr).data
|
||||
break
|
||||
assert sf_e4m3 is not None
|
||||
|
||||
# Get global scale
|
||||
if hasattr(layer, "weight_global_scale"):
|
||||
global_scale = layer.weight_global_scale.data.to(torch.float32)
|
||||
elif hasattr(layer, "weight_scale_2"):
|
||||
ws2 = layer.weight_scale_2.data
|
||||
if ws2.numel() > 1:
|
||||
logical_widths = getattr(layer, 'logical_widths', None)
|
||||
if logical_widths is not None and len(ws2) == len(logical_widths):
|
||||
expanded = []
|
||||
for i, w in enumerate(logical_widths):
|
||||
expanded.append(ws2[i:i+1].expand(w))
|
||||
global_scale = torch.cat(expanded).to(torch.float32).unsqueeze(1)
|
||||
else:
|
||||
global_scale = ws2.max().to(torch.float32)
|
||||
else:
|
||||
global_scale = ws2.max().to(torch.float32)
|
||||
else:
|
||||
global_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
|
||||
|
||||
# Fold global scale into block scales and store as float32
|
||||
# (DeepGEMM fp8_fp4_gemm_nn expects float32 scales, NOT float8_e4m3fn)
|
||||
sf_f32 = sf_e4m3.to(torch.float32) * global_scale
|
||||
# Pad to align with gran_k=16 for DeepGEM
|
||||
sf_k = sf_f32.shape[1] # K//16
|
||||
gran_k = 16
|
||||
aligned_k = (sf_k + gran_k - 1) // gran_k * gran_k
|
||||
if aligned_k > sf_k:
|
||||
# Pad the scale tensor to be aligned
|
||||
sf_padded = torch.zeros(N, aligned_k, dtype=torch.float32, device=device)
|
||||
sf_padded[:, :sf_k] = sf_f32
|
||||
sf_f32 = sf_padded
|
||||
|
||||
layer.weight_scale_inv = nn.Parameter(sf_f32.contiguous(), requires_grad=False)
|
||||
del sf_f32, sf_e4m3
|
||||
|
||||
# Ensure weight is contiguous int8, K-major (required by DeepGEMM)
|
||||
if w_data.dtype == torch.uint8:
|
||||
layer.weight.data = w_data.view(torch.int8).contiguous()
|
||||
else:
|
||||
layer.weight.data = w_data.contiguous()
|
||||
|
||||
# Free source attributes
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale",
|
||||
"weight_global_scale", "input_global_scale",
|
||||
"alpha", "input_global_scale_inv"):
|
||||
if hasattr(layer, attr):
|
||||
delattr(layer, attr)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: nn.Module,
|
||||
x: torch.Tensor,
|
||||
bias: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
import deep_gemm
|
||||
|
||||
M, K = x.shape
|
||||
|
||||
# Quantize activation to FP8 with UE8M0 per-token scales
|
||||
x_fp8, x_sf = deep_gemm.per_token_cast_to_fp8(
|
||||
x, use_ue8m0=True, use_packed_ue8m0=True)
|
||||
|
||||
# Weight: E2M1 packed int8 + folded float32 block scales
|
||||
b_weight = layer.weight.data # (N, K//2) int8
|
||||
b_sf = layer.weight_scale_inv.data # (N, K//16) float32
|
||||
|
||||
N = b_weight.shape[0]
|
||||
d = torch.empty((M, N), dtype=torch.bfloat16, device=x.device)
|
||||
|
||||
# DeepGEMM fp8_fp4_gemm: A is FP8 (M, K), B is FP4 (N, K//2 packed)
|
||||
# B scales are float32 with gran_k=16 (NVFP4 block size)
|
||||
deep_gemm.fp8_fp4_gemm_nn(
|
||||
a=(x_fp8, x_sf),
|
||||
b=(b_weight, b_sf),
|
||||
d=d,
|
||||
recipe_b=(1, 16), # NVFP4: gran_mn=1, gran_k=16
|
||||
)
|
||||
|
||||
return d
|
||||
66
patches/patch_finegrained_fp8_blackwell.py
Normal file
66
patches/patch_finegrained_fp8_blackwell.py
Normal file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Patch transformers' finegrained_fp8.py to reject DeepGEMM on Blackwell (SM100+).
|
||||
|
||||
DeepGEMM only supports Hopper (SM90). On Blackwell GPUs, _load_deepgemm_kernel()
|
||||
passes the SM90 check but then fails trying to download/load the kernel from HF Hub
|
||||
(rate limits, missing builds). This patch adds a check for SM100+ that raises
|
||||
ImportError, which the existing try/except in w8a8_fp8_matmul catches, falling
|
||||
back to the Triton finegrained-fp8 kernel.
|
||||
|
||||
Also needed because the Triton finegrained-fp8 matmul has shape mismatches during
|
||||
modelopt calibration (K mismatch on quantized expert weights). The real fix is to
|
||||
upcast the model to BF16 first (see scripts/upcast_to_bf16.py).
|
||||
|
||||
Usage:
|
||||
python3 patch_finegrained_fp8_blackwell.py [path_to_finegrained_fp8.py]
|
||||
|
||||
If no path given, auto-detects from the installed transformers package.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def patch(fp8_file: str):
|
||||
with open(fp8_file) as f:
|
||||
content = f.read()
|
||||
|
||||
old = """ # DeepGEMM requires Hopper (SM90) or newer for FP8 WGMMA instructions
|
||||
major = torch.cuda.get_device_capability()[0]
|
||||
if major < 9:
|
||||
raise ImportError(
|
||||
f"DeepGEMM requires a Hopper (SM90+) or newer GPU, but the current device "
|
||||
f"has compute capability {major}.x. Use a different `experts_implementation`."
|
||||
)"""
|
||||
|
||||
new = """ # DeepGEMM requires Hopper (SM90) specifically - not yet supported on Blackwell (SM100+)
|
||||
major = torch.cuda.get_device_capability()[0]
|
||||
if major < 9:
|
||||
raise ImportError(
|
||||
f"DeepGEMM requires a Hopper (SM90+) or newer GPU, but the current device "
|
||||
f"has compute capability {major}.x. Use a different `experts_implementation`."
|
||||
)
|
||||
if major >= 10:
|
||||
raise ImportError(
|
||||
f"DeepGEMM is not yet supported on Blackwell (SM100+). "
|
||||
f"Use a different `experts_implementation`."
|
||||
)"""
|
||||
|
||||
if old in content:
|
||||
content = content.replace(old, new)
|
||||
with open(fp8_file, "w") as f:
|
||||
f.write(content)
|
||||
print(f"PATCHED: {fp8_file} — DeepGEMM now rejected on Blackwell (SM100+)")
|
||||
else:
|
||||
print("Patch target not found (may already be patched or different version)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
fp8_file = sys.argv[1]
|
||||
else:
|
||||
import transformers.integrations.finegrained_fp8 as fp8
|
||||
import inspect
|
||||
fp8_file = inspect.getfile(fp8)
|
||||
patch(fp8_file)
|
||||
135
patches/patch_vllm_weights.py
Normal file
135
patches/patch_vllm_weights.py
Normal file
@@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.
|
||||
|
||||
modelopt exports weights with naming differences from what vllm's
|
||||
_make_deepseek_v4_weights_mapper + load_weights code expects:
|
||||
|
||||
1. Expert projections: modelopt uses gate_proj/up_proj/down_proj, vllm expects w1/w3/w2
|
||||
2. Shared expert projections: same gate_proj/up_proj naming, needs w1/w3 for stacking
|
||||
3. Compressor projections: kv_proj→wkv, gate_proj→wgate for fused stacking
|
||||
4. Attention projections: self_attn prefix, kv_proj→wkv for fused stacking, etc.
|
||||
5. Expert NVFP4 scales: weight_scale_2 and input_scale have no matching mega_moe params
|
||||
|
||||
CRITICAL: DeepseekV4ForCausalLM.hf_to_vllm_mapper is a CLASS attribute set at
|
||||
module import time. Simply patching _make_deepseek_v4_weights_mapper doesn't help
|
||||
because the class already cached the old mapper. We must also update the class
|
||||
attribute directly. Since expert_dtype=="fp4", __init__ doesn't recreate the mapper.
|
||||
|
||||
Drop into container as:
|
||||
python3 /patches/patch_vllm_weights.py
|
||||
|
||||
Or add to docker-compose.yml command before vllm serve.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
# Save original function BEFORE patching
|
||||
_original_make_mapper = None
|
||||
|
||||
|
||||
def make_patched_mapper(expert_dtype: str):
|
||||
"""Create a WeightsMapper with modelopt NVFP4 naming patches applied."""
|
||||
global _original_make_mapper
|
||||
# Use the saved original, not the (possibly patched) module attribute
|
||||
mapper = _original_make_mapper(expert_dtype)
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Regex mappings (applied FIRST by WeightsMapper, before substr)
|
||||
# Order matters: skip patterns must come before rename patterns.
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
ordered_regexes = {}
|
||||
|
||||
# Skip expert NVFP4 scales that have no mega_moe params.
|
||||
# MUST come before gate_proj→w1 etc. because after renaming,
|
||||
# the key has "w1." not "gate_proj." and these patterns wouldn't match.
|
||||
#
|
||||
# modelopt's NVFP4 export includes weight_scale_2 (global scale) and
|
||||
# input_scale (activation scale) for each expert projection. But the
|
||||
# DeepseekV4MegaMoEExperts module only registers w13_weight_scale and
|
||||
# w2_weight_scale (E8M0 block scales) — no weight_scale_2 or input_scale.
|
||||
# Mapping to None tells WeightsMapper to skip these weights entirely.
|
||||
ordered_regexes[re.compile(r"\.experts\.\d+\.\w+_proj\.weight_scale_2$")] = None
|
||||
ordered_regexes[re.compile(r"\.experts\.\d+\.\w+_proj\.input_scale$")] = None
|
||||
|
||||
# Routed expert projections: gate_proj→w1, up_proj→w3, down_proj→w2
|
||||
# We use regex (not substr) to match ONLY .experts.N. — NOT .shared_experts.
|
||||
# Using substr ".down_proj." → ".w2." would also affect
|
||||
# shared_experts.down_proj, breaking shared expert loading
|
||||
# (vllm model uses down_proj, not w2, for shared experts).
|
||||
ordered_regexes[re.compile(r"(\.experts\.\d+\.)gate_proj\.")] = r"\1w1."
|
||||
ordered_regexes[re.compile(r"(\.experts\.\d+\.)up_proj\.")] = r"\1w3."
|
||||
ordered_regexes[re.compile(r"(\.experts\.\d+\.)down_proj\.")] = r"\1w2."
|
||||
|
||||
# Preserve any existing regex mappings from the original mapper
|
||||
if mapper.orig_to_new_regex:
|
||||
ordered_regexes.update(mapper.orig_to_new_regex)
|
||||
|
||||
mapper.orig_to_new_regex = ordered_regexes
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Substr mappings (applied AFTER regex by WeightsMapper)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
|
||||
# 1. Attention: self_attn → attn.mla_attn mappings
|
||||
# modelopt uses "self_attn" but vllm expects "attn" (mapped to "attn.mla_attn")
|
||||
mapper.orig_to_new_substr[".self_attn.q_a_proj."] = ".attn.mla_attn.wq_a."
|
||||
mapper.orig_to_new_substr[".self_attn.q_b_proj."] = ".attn.mla_attn.wq_b."
|
||||
mapper.orig_to_new_substr[".self_attn.q_a_norm."] = ".attn.mla_attn.q_norm."
|
||||
mapper.orig_to_new_substr[".self_attn.o_a_proj."] = ".attn.mla_attn.wo_a."
|
||||
mapper.orig_to_new_substr[".self_attn.o_b_proj."] = ".attn.mla_attn.wo_b."
|
||||
mapper.orig_to_new_substr[".self_attn.sinks"] = ".attn.mla_attn.attn_sink"
|
||||
|
||||
# CRITICAL: kv_proj must map to wkv (not kv_proj) because the stacking
|
||||
# code looks for "attn.wkv" to stack into fused_wqa_wkv.
|
||||
mapper.orig_to_new_substr[".self_attn.kv_proj."] = ".attn.mla_attn.wkv."
|
||||
mapper.orig_to_new_substr[".self_attn.kv_norm."] = ".attn.mla_attn.kv_norm."
|
||||
|
||||
# Compressor: self_attn.compressor → attn.mla_attn.compressor
|
||||
mapper.orig_to_new_substr[".self_attn.compressor."] = ".attn.mla_attn.compressor."
|
||||
|
||||
# Compressor projection renaming for stacking:
|
||||
# vllm stacks compressor.wkv + compressor.wgate → compressor.fused_wkv_wgate
|
||||
# modelopt exports as compressor.kv_proj and compressor.gate_proj
|
||||
mapper.orig_to_new_substr[".compressor.kv_proj."] = ".compressor.wkv."
|
||||
mapper.orig_to_new_substr[".compressor.gate_proj."] = ".compressor.wgate."
|
||||
|
||||
# 2. Shared expert projections: gate_proj→w1, up_proj→w3
|
||||
# vllm stacks shared_experts.w1 + shared_experts.w3 into
|
||||
# shared_experts.gate_up_proj. modelopt uses gate_proj/up_proj naming.
|
||||
# down_proj stays as-is (vllm model uses down_proj directly).
|
||||
mapper.orig_to_new_substr[".shared_experts.gate_proj."] = ".shared_experts.w1."
|
||||
mapper.orig_to_new_substr[".shared_experts.up_proj."] = ".shared_experts.w3."
|
||||
|
||||
return mapper
|
||||
|
||||
|
||||
def patch():
|
||||
global _original_make_mapper
|
||||
from vllm.model_executor.models import deepseek_v4
|
||||
|
||||
# 1. Save the original function BEFORE replacing it
|
||||
_original_make_mapper = deepseek_v4._make_deepseek_v4_weights_mapper
|
||||
|
||||
# 2. Patch the function so __init__ calls also get our mapper
|
||||
deepseek_v4._make_deepseek_v4_weights_mapper = make_patched_mapper
|
||||
print("✓ Patched _make_deepseek_v4_weights_mapper function")
|
||||
|
||||
# 3. CRITICAL: Also update the CLASS attribute directly.
|
||||
# DeepseekV4ForCausalLM.hf_to_vllm_mapper is set at class definition
|
||||
# time (module import). Our function patch above doesn't retroactively
|
||||
# update it. Since expert_dtype=="fp4", __init__ won't recreate it either.
|
||||
# We MUST update the class attribute directly.
|
||||
if hasattr(deepseek_v4, 'DeepseekV4ForCausalLM'):
|
||||
deepseek_v4.DeepseekV4ForCausalLM.hf_to_vllm_mapper = make_patched_mapper("fp4")
|
||||
print("✓ Updated DeepseekV4ForCausalLM.hf_to_vllm_mapper class attribute")
|
||||
else:
|
||||
print("⚠ DeepseekV4ForCausalLM not found (will be patched at import time)")
|
||||
|
||||
print("✓ All modelopt NVFP4 weight mapping patches applied")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
patch()
|
||||
295
patches/quant_module_patched.py
Normal file
295
patches/quant_module_patched.py
Normal file
@@ -0,0 +1,295 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Base class for quantization modules."""
|
||||
|
||||
import contextlib
|
||||
import warnings
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from modelopt.torch.opt.dynamic import DynamicModule, _DMRegistryCls
|
||||
from modelopt.torch.utils.distributed import ParallelState
|
||||
|
||||
from ...tensor_quant import QUANT_DESC_8BIT_PER_TENSOR
|
||||
from ...utils import is_torch_export_mode
|
||||
from .tensor_quantizer import SequentialQuantizer, TensorQuantizer
|
||||
|
||||
__all__ = [
|
||||
"QuantInputBase",
|
||||
"QuantLinearConvBase",
|
||||
"QuantModule",
|
||||
"QuantModuleRegistry",
|
||||
]
|
||||
|
||||
|
||||
class QuantModule(DynamicModule):
|
||||
"""A base class for quantized modules.
|
||||
|
||||
In addition, the class also provides ``parallel_state`` attribute that can be used to access
|
||||
the parallel state of the module.
|
||||
"""
|
||||
|
||||
_parallel_state: ParallelState
|
||||
|
||||
@classmethod
|
||||
@torch.no_grad()
|
||||
def convert(cls, module: nn.Module, **setup_kwargs: Any) -> "QuantModule":
|
||||
"""Convert the module to a dynamic module."""
|
||||
module = super().convert(module, **setup_kwargs)
|
||||
|
||||
# setup parallel state now that the module is converted
|
||||
if module.parallel_state is None:
|
||||
module._initialize_parallel_state()
|
||||
|
||||
return module
|
||||
|
||||
@property
|
||||
def parallel_state(self) -> ParallelState | None:
|
||||
"""Return the parallel state of the quant module."""
|
||||
return getattr(self, "_parallel_state", None)
|
||||
|
||||
@parallel_state.setter
|
||||
def parallel_state(self, parallel_state: ParallelState):
|
||||
"""Set the parallel state of the dynamic module."""
|
||||
assert isinstance(parallel_state, ParallelState), (
|
||||
"parallel_state must be a ParallelState object!"
|
||||
)
|
||||
self._parallel_state = parallel_state
|
||||
|
||||
def _initialize_parallel_state(self):
|
||||
"""Initialize the parallel state of the dynamic module.
|
||||
|
||||
This method is called only if the `QuantModule` does not have a `parallel_state` attribute
|
||||
after `_setup` is called.
|
||||
"""
|
||||
if torch.distributed.is_initialized():
|
||||
warnings.warn(
|
||||
f"Distributed training is initialized but no parallel_state is set for {type(self)}. "
|
||||
"Using default parallel_state which has data_parallel_group set to the default process group and "
|
||||
"tensor_parallel_group is unspecified. "
|
||||
"If you are using tensor parallelism for this module, you should set the parallel_state "
|
||||
"in its `_setup` method."
|
||||
)
|
||||
|
||||
self.parallel_state = ParallelState(data_parallel_group=None)
|
||||
|
||||
def modelopt_post_restore(self, prefix: str = ""):
|
||||
"""Post-restore to correctly configure the TensorQuantizer states.
|
||||
|
||||
TensorQuantizer states are restored to their shape before saving. Now we need to further configure them.
|
||||
1. For non-sharded modules this simply involves moving the TensorQuantizer states to the right device.
|
||||
This applies for regular Pytorch models and HuggingFace models.
|
||||
2. For sharded modules the restored states of TensorQuantizer could be incorrect. This is because
|
||||
parallelism such as TP might have been changed between saving and resoring. So we need to re-calculate
|
||||
the state shapes. Hence such modules should override this and implement their own logic.
|
||||
"""
|
||||
# Get a parameter or buffer that does not belong to a TensorQuantizer
|
||||
non_tq_param_or_buffer = None
|
||||
for name, param_or_buffer in self.state_dict().items():
|
||||
parent = self.get_submodule(name.rsplit(".", 1)[0]) if "." in name else self
|
||||
if not isinstance(parent, TensorQuantizer):
|
||||
non_tq_param_or_buffer = param_or_buffer
|
||||
break
|
||||
|
||||
if non_tq_param_or_buffer is None:
|
||||
warnings.warn(
|
||||
f"Could not identify the device for TensorQuantizer states of {prefix}. "
|
||||
"Please move the model to the right device now. This can be done by calling "
|
||||
"`model.to(device)`."
|
||||
)
|
||||
return
|
||||
|
||||
# Move the TensorQuantizer states to the right device (dtype should have been restored).
|
||||
for module in self.modules():
|
||||
if isinstance(module, TensorQuantizer):
|
||||
module.to(non_tq_param_or_buffer.device)
|
||||
|
||||
def iter_weights_for_calibration(self):
|
||||
"""Yield ``(weight, weight_quantizer)`` pairs for weight-only calibration."""
|
||||
import torch.nn as nn
|
||||
from modelopt.torch.quantization.utils import quantizer_attr_names, weight_attr_names
|
||||
|
||||
for weight_name in weight_attr_names(self):
|
||||
qname = quantizer_attr_names(weight_name).weight_quantizer
|
||||
qattr = getattr(self, qname, None)
|
||||
weight = getattr(self, weight_name)
|
||||
if qattr is not None:
|
||||
# Singular quantizer
|
||||
yield weight, qattr
|
||||
else:
|
||||
# Try plural (ModuleList) - e.g. _QuantFusedExperts
|
||||
plural = qname + "s"
|
||||
qattr = getattr(self, plural, None)
|
||||
if isinstance(qattr, nn.ModuleList):
|
||||
# Yield per-expert slices for 3-D fused weights
|
||||
if weight.dim() == 3:
|
||||
for idx, q in enumerate(qattr):
|
||||
yield weight[idx], q
|
||||
else:
|
||||
for q in qattr:
|
||||
yield weight, q
|
||||
else:
|
||||
raise AttributeError(
|
||||
f"Cannot find weight quantizer {qname} or {plural} on {type(self).__name__}"
|
||||
)
|
||||
|
||||
def fold_weight(self, keep_attrs: bool = False):
|
||||
"""Fold the weight for faster eval."""
|
||||
# Handle all attributes that end with _weight_quantizer
|
||||
for name in dir(self):
|
||||
attr = getattr(self, name)
|
||||
if (
|
||||
name.endswith("weight_quantizer")
|
||||
and isinstance(attr, TensorQuantizer)
|
||||
and attr.fake_quant
|
||||
):
|
||||
# Get the corresponding weight name by removing _weight_quantizer suffix
|
||||
weight_name = name[:-10]
|
||||
|
||||
assert hasattr(self, weight_name), (
|
||||
f"{name} doesn't have a corresponding {weight_name} in {self.__class__.__name__}"
|
||||
)
|
||||
weight = getattr(self, weight_name)
|
||||
weight.data.copy_(attr(weight.float()).to(weight.dtype))
|
||||
attr.disable()
|
||||
if not keep_attrs:
|
||||
_attrs = [
|
||||
"_pre_quant_scale",
|
||||
"_amax",
|
||||
]
|
||||
for attr_name in _attrs:
|
||||
if hasattr(attr, attr_name):
|
||||
delattr(attr, attr_name)
|
||||
|
||||
|
||||
QuantModuleRegistry = _DMRegistryCls("Quant", QuantModule)
|
||||
|
||||
|
||||
class QuantInputBase(QuantModule):
|
||||
"""Base class for modules where the input is quantized."""
|
||||
|
||||
input_quantizer: TensorQuantizer
|
||||
output_quantizer: TensorQuantizer
|
||||
default_quant_desc_input = QUANT_DESC_8BIT_PER_TENSOR
|
||||
default_quant_desc_output = QUANT_DESC_8BIT_PER_TENSOR
|
||||
|
||||
def forward(self, input, *args, **kwargs):
|
||||
"""Quantize the input before calling the original forward method."""
|
||||
input = self.input_quantizer(input)
|
||||
# Check MR: https://github.com/NVIDIA/Model-Optimizer/pull/824
|
||||
if hasattr(self, "_forward_pre_dm"):
|
||||
pre_fwd = getattr(self, "_forward_pre_dm")
|
||||
|
||||
def _is_forward_in_mro(bound_or_func) -> bool:
|
||||
# If this is a bound method, compare its underlying function to any `forward`
|
||||
# implementation in the current MRO. If it matches, it's not an external monkey-patch.
|
||||
if hasattr(bound_or_func, "__func__"):
|
||||
fn = bound_or_func.__func__
|
||||
for cls in type(self).mro():
|
||||
if cls.__dict__.get("forward") is fn:
|
||||
return True
|
||||
return False
|
||||
|
||||
if pre_fwd is getattr(self, "forward") or _is_forward_in_mro(pre_fwd):
|
||||
output = super().forward(input, *args, **kwargs)
|
||||
else:
|
||||
output = pre_fwd(input, *args, **kwargs)
|
||||
else:
|
||||
output = super().forward(input, *args, **kwargs)
|
||||
if isinstance(output, tuple):
|
||||
return (self.output_quantizer(output[0]), *output[1:])
|
||||
return self.output_quantizer(output)
|
||||
|
||||
def _setup(self):
|
||||
"""Patch the module's forward method to quantize the input."""
|
||||
self._register_temp_attribute(
|
||||
"input_quantizer", TensorQuantizer(self.default_quant_desc_input)
|
||||
)
|
||||
self._register_temp_attribute(
|
||||
"output_quantizer", TensorQuantizer(self.default_quant_desc_output)
|
||||
)
|
||||
self.output_quantizer.disable()
|
||||
|
||||
|
||||
class QuantLinearConvBase(QuantInputBase):
|
||||
"""Base class for quantized linear modules.
|
||||
|
||||
Quantized linear modules are modules where both the input and the weight are quantized.
|
||||
"""
|
||||
|
||||
weight_quantizer: TensorQuantizer | SequentialQuantizer
|
||||
_enable_weight_quantization: bool
|
||||
default_quant_desc_weight = QUANT_DESC_8BIT_PER_TENSOR
|
||||
|
||||
@contextlib.contextmanager
|
||||
def quantize_weight(self):
|
||||
"""Context in which `self.weight` is quantized."""
|
||||
self._enable_weight_quantization = True
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self._enable_weight_quantization = False
|
||||
|
||||
@staticmethod
|
||||
def _get_quantized_weight(module: "QuantLinearConvBase", weight: torch.Tensor) -> torch.Tensor:
|
||||
if module._enable_weight_quantization or is_torch_export_mode():
|
||||
return module.weight_quantizer(weight)
|
||||
return weight
|
||||
|
||||
def forward(self, input, *args, **kwargs):
|
||||
"""Quantize the input and the weight before calling the original forward method."""
|
||||
# self.quntize_weight() setting attributes is not allowed for torch.export.
|
||||
if is_torch_export_mode():
|
||||
return super().forward(input, *args, **kwargs)
|
||||
|
||||
with self.quantize_weight():
|
||||
return super().forward(input, *args, **kwargs)
|
||||
|
||||
def _setup(self):
|
||||
super()._setup()
|
||||
self._register_temp_attribute(
|
||||
"weight_quantizer", TensorQuantizer(self.default_quant_desc_weight)
|
||||
)
|
||||
self._register_temp_attribute("_enable_weight_quantization", False)
|
||||
self._register_dynamic_attribute("weight", self._get_quantized_weight)
|
||||
|
||||
|
||||
class _LegacyQuantInputBaseMixin:
|
||||
"""A mixin to support legacy quantized modules which needs to have an __init__ method."""
|
||||
|
||||
_quantized_cls = QuantInputBase
|
||||
default_quant_desc_input = QUANT_DESC_8BIT_PER_TENSOR
|
||||
default_quant_desc_output = QUANT_DESC_8BIT_PER_TENSOR
|
||||
|
||||
def __init__(self, *args, quant_desc_input=None, **kwargs):
|
||||
"""Initialize the module with its original __init__ and patch its forward."""
|
||||
self.default_quant_desc_input = quant_desc_input or self.default_quant_desc_input
|
||||
super().__init__(*args, **kwargs)
|
||||
QuantModuleRegistry.convert(self)
|
||||
|
||||
|
||||
class _LegacyQuantLinearConvBaseMixin(_LegacyQuantInputBaseMixin):
|
||||
"""A mixin to support legacy quantized modules which needs to have an __init__ method."""
|
||||
|
||||
_quantized_cls = QuantLinearConvBase
|
||||
default_quant_desc_weight = QUANT_DESC_8BIT_PER_TENSOR
|
||||
|
||||
def __init__(self, *args, quant_desc_input=None, quant_desc_weight=None, **kwargs):
|
||||
"""Initialize the module with its original __init__ and patch its forward."""
|
||||
self.default_quant_desc_weight = quant_desc_weight or self.default_quant_desc_weight
|
||||
super().__init__(*args, quant_desc_input=quant_desc_input, **kwargs)
|
||||
270
patches/staging_kernel.py
Normal file
270
patches/staging_kernel.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""
|
||||
NVFP4 staging kernel — full FP4 (E2M1) activations + UE4M3 block16 scales.
|
||||
|
||||
The mxf4nvf4 PTX instruction requires BOTH A and B to be FP4 (E2M1 packed).
|
||||
This kernel quantizes BF16 activations → E2M1 packed uint8 with UE4M3 scales.
|
||||
"""
|
||||
import triton
|
||||
import triton.language as tl
|
||||
import torch
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _deepseek_v4_stage_mega_moe_inputs_kernel(
|
||||
hidden_states,
|
||||
x_fp4, # uint8, shape (M, K//2) — E2M1 packed, 2 values per byte
|
||||
x_sf, # int32, shape (M, K//64) — UE4M3 packed, 4 scales per int32
|
||||
topk_ids,
|
||||
topk_weights,
|
||||
topk_idx_out,
|
||||
topk_weights_out,
|
||||
hidden_stride_m: tl.constexpr,
|
||||
hidden_stride_k: tl.constexpr,
|
||||
x_stride_m: tl.constexpr,
|
||||
x_stride_k: tl.constexpr,
|
||||
x_sf_stride_m: tl.constexpr,
|
||||
x_sf_stride_k: tl.constexpr,
|
||||
topk_ids_stride_m: tl.constexpr,
|
||||
topk_ids_stride_k: tl.constexpr,
|
||||
topk_weights_stride_m: tl.constexpr,
|
||||
topk_weights_stride_k: tl.constexpr,
|
||||
topk_idx_stride_m: tl.constexpr,
|
||||
topk_idx_stride_k: tl.constexpr,
|
||||
topk_weights_out_stride_m: tl.constexpr,
|
||||
topk_weights_out_stride_k: tl.constexpr,
|
||||
hidden_size: tl.constexpr,
|
||||
top_k: tl.constexpr,
|
||||
BLOCK_K: tl.constexpr, # 128 elements (loaded from hidden)
|
||||
GROUP_K: tl.constexpr, # 16 (NVFP4 group_size)
|
||||
BLOCK_TOPK: tl.constexpr,
|
||||
) -> None:
|
||||
token_id = tl.program_id(0)
|
||||
k_block_id = tl.program_id(1)
|
||||
|
||||
k_offsets = k_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
|
||||
k_mask = k_offsets < hidden_size
|
||||
hidden = tl.load(
|
||||
hidden_states + token_id * hidden_stride_m + k_offsets * hidden_stride_k,
|
||||
mask=k_mask,
|
||||
other=0.0,
|
||||
).to(tl.float32)
|
||||
|
||||
num_groups: tl.constexpr = BLOCK_K // GROUP_K # 8
|
||||
hidden_groups = tl.reshape(hidden, [num_groups, GROUP_K])
|
||||
abs_groups = tl.reshape(tl.abs(hidden), [num_groups, GROUP_K])
|
||||
amax = tl.max(abs_groups, axis=1)
|
||||
amax = tl.maximum(amax, 1.0e-4)
|
||||
|
||||
# ---- UE4M3 scale computation ----
|
||||
# scale = amax / 6.0 (E2M1 max value = 6)
|
||||
# Then quantize scale to UE4M3 format
|
||||
scale = amax / 6.0
|
||||
scale_bits = scale.to(tl.uint32, bitcast=True)
|
||||
scale_exp = (scale_bits >> 23) & 0xFF
|
||||
scale_mant = scale_bits & 0x7FFFFF
|
||||
|
||||
# Convert FP32 → E4M3 manually (with subnormal support)
|
||||
# FP32 bias=127, E4M3 bias=7 → raw exp = scale_exp - 120
|
||||
e4m3_exp_raw = scale_exp - 120 # can be negative → subnormal
|
||||
|
||||
# Normal path: exp >= 1, just truncate mantissa top 3 bits
|
||||
# RNE rounding: need guard (bit 19), sticky (OR of bits 18:0), and LSB of result
|
||||
normal_mant = scale_mant >> 20
|
||||
guard_bit = (scale_mant >> 19) & 1
|
||||
sticky_bit = tl.where((scale_mant & 0x7FFFF) != 0, 1, 0) # OR of bits [18:0]
|
||||
result_lsb = normal_mant & 1
|
||||
# RNE: round up if (guard=1 and sticky=1) or (guard=1 and sticky=0 and lsb=1)
|
||||
round_up = guard_bit & (sticky_bit | result_lsb)
|
||||
normal_mant = normal_mant + round_up
|
||||
normal_exp = e4m3_exp_raw
|
||||
|
||||
# Subnormal path: exp_raw <= 0
|
||||
# Insert implicit leading 1 and right-shift by (1 - exp_raw)
|
||||
# E4M3 subnormal: value = (mant/8) * 2^(1-7) = (mant/8) * 2^-6
|
||||
# So we need: (1 + mant_fp32/2^23) * 2^(exp_raw - 7) = (shifted_mant/8) * 2^-6
|
||||
# shifted_mant = (implicit_1 | mant_fp32) >> (1 - exp_raw - 1) then take top 3 bits
|
||||
shift = 1 - e4m3_exp_raw # positive when subnormal
|
||||
mant_with_leading = (0x800000 | scale_mant) # insert implicit 1
|
||||
# Right-shift to get into the 3-bit E4M3 mantissa window
|
||||
# We want bits [shift+19 : shift+23) of mant_with_leading for 3 mantissa bits + 1 round bit
|
||||
subnormal_mant = (mant_with_leading >> (shift.to(tl.int32) + 20)) & 0x7
|
||||
sub_guard_bit = (mant_with_leading >> (shift.to(tl.int32) + 19)) & 1
|
||||
# Sticky: OR of all bits below the guard bit in the shifted result
|
||||
# shift ≤ 8 in practice (amax floor = 1e-4 → scale ≈ 2^-15 → exp_raw ≈ -7), so mask ≤ 2^27
|
||||
sub_sticky_mask = (1 << (shift.to(tl.int32) + 19)) - 1
|
||||
sub_sticky_bit = tl.where((mant_with_leading & sub_sticky_mask) != 0, 1, 0)
|
||||
sub_result_lsb = subnormal_mant & 1
|
||||
sub_round_up = sub_guard_bit & (sub_sticky_bit | sub_result_lsb)
|
||||
subnormal_mant = subnormal_mant + sub_round_up
|
||||
|
||||
is_normal = e4m3_exp_raw >= 1
|
||||
e4m3_mant = tl.where(is_normal, normal_mant, subnormal_mant)
|
||||
e4m3_exp = tl.where(is_normal, normal_exp, 0) # exp=0 for subnormals
|
||||
|
||||
# Handle mantissa overflow after rounding
|
||||
overflow = e4m3_mant >= 8
|
||||
e4m3_mant = tl.where(overflow, 0, e4m3_mant)
|
||||
e4m3_exp = tl.where(overflow, e4m3_exp + 1, e4m3_exp)
|
||||
e4m3_exp = tl.maximum(e4m3_exp, 0)
|
||||
e4m3_exp = tl.minimum(e4m3_exp, 15)
|
||||
scale_e4m3_bits = (e4m3_exp << 3) | e4m3_mant
|
||||
|
||||
# Reconstruct dequantized scale by decoding the STORED E4M3 bits.
|
||||
# This guarantees the E2M1 quantization divides by exactly the value
|
||||
# the CUDA kernel will multiply back — same bits, single decode, no
|
||||
# possibility of encode/decode disagreement.
|
||||
stored_exp = (scale_e4m3_bits >> 3) & 0xF
|
||||
stored_mant = scale_e4m3_bits & 0x7
|
||||
e4m3_exp_for_recon = tl.maximum(stored_exp.to(tl.int32) - 7, -126)
|
||||
two_pow_exp_bits = (e4m3_exp_for_recon + 127).to(tl.uint32) << 23
|
||||
two_pow_exp = two_pow_exp_bits.to(tl.float32, bitcast=True)
|
||||
normal_value = (1.0 + stored_mant.to(tl.float32) / 8.0) * two_pow_exp
|
||||
subnormal_value = (stored_mant.to(tl.float32) / 8.0) * 0.015625
|
||||
e4m3_value = tl.where(stored_exp == 0, subnormal_value, normal_value)
|
||||
|
||||
# ---- E2M1 FP4 quantization (unpacked, 1 byte/element) ----
|
||||
# E2M1 LUT (unsigned): [0, 0.5, 1, 1.5, 2, 3, 4, 6]
|
||||
# Nearest-neighbor using thresholds (midpoints between consecutive values)
|
||||
scaled = hidden_groups * (1.0 / tl.maximum(e4m3_value, 1e-6))[:, None]
|
||||
# Clamp to E2M1 range [-6, 6]
|
||||
scaled = tl.maximum(scaled, -6.0)
|
||||
scaled = tl.minimum(scaled, 6.0)
|
||||
|
||||
abs_s = tl.abs(scaled)
|
||||
# Thresholds: midpoints between [0, 0.5, 1, 1.5, 2, 3, 4, 6]
|
||||
# [0, 0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0, INF]
|
||||
e2m1_idx = tl.where(abs_s < 0.25, 0,
|
||||
tl.where(abs_s < 0.75, 1,
|
||||
tl.where(abs_s < 1.25, 2,
|
||||
tl.where(abs_s < 1.75, 3,
|
||||
tl.where(abs_s < 2.5, 4,
|
||||
tl.where(abs_s < 3.5, 5,
|
||||
tl.where(abs_s < 5.0, 6, 7)))))))
|
||||
sign_bit = (scaled < 0).to(tl.int32)
|
||||
e2m1_4bit = (sign_bit << 3) | e2m1_idx # 4-bit: (sign << 3) | index
|
||||
|
||||
# Pack E2M1 pairs into single bytes (2 per byte, low nibble first)
|
||||
# mxf4nvf4 reads FP4 packed from SMEM — must match kernel's TMA layout
|
||||
e2m1_flat = tl.reshape(e2m1_4bit, [BLOCK_K])
|
||||
e2m1_lo = e2m1_flat[0::2] # even indices → low nibble
|
||||
e2m1_hi = e2m1_flat[1::2] # odd indices → high nibble
|
||||
e2m1_packed = (e2m1_hi << 4 | e2m1_lo).to(tl.uint8) # [BLOCK_K // 2]
|
||||
|
||||
k_offsets_out = k_block_id * (BLOCK_K // 2) + tl.arange(0, BLOCK_K // 2)
|
||||
k_mask_out = k_offsets_out < (hidden_size // 2)
|
||||
tl.store(
|
||||
x_fp4 + token_id * x_stride_m + k_offsets_out * x_stride_k,
|
||||
e2m1_packed,
|
||||
mask=k_mask_out,
|
||||
)
|
||||
|
||||
# Pack UE4M3 bytes into int32 (NVFP4: group_size=16, 4 groups per 64 elements)
|
||||
# 8 groups per k_block of 128 → 2 int32s per k_block
|
||||
# int32 can only pack 4 bytes (shifts >= 32 are UB), so split into two packs
|
||||
scale_offsets = tl.arange(0, num_groups) # [0..7]
|
||||
first_half = scale_offsets < 4 # groups 0-3 → int32[0]
|
||||
second_half = scale_offsets >= 4 # groups 4-7 → int32[1]
|
||||
|
||||
packed_lo = tl.sum(
|
||||
tl.where(first_half, scale_e4m3_bits.to(tl.int32) << (scale_offsets * 8), 0),
|
||||
axis=0,
|
||||
).to(tl.int32)
|
||||
packed_hi = tl.sum(
|
||||
tl.where(second_half, scale_e4m3_bits.to(tl.int32) << ((scale_offsets - 4) * 8), 0),
|
||||
axis=0,
|
||||
).to(tl.int32)
|
||||
|
||||
# Write 2 int32s per k_block: x_sf shape is (M, K//64) = (M, num_k_blocks * 2)
|
||||
sf_base = token_id * x_sf_stride_m + k_block_id * 2 * x_sf_stride_k
|
||||
tl.store(x_sf + sf_base, packed_lo)
|
||||
tl.store(x_sf + sf_base + x_sf_stride_k, packed_hi)
|
||||
|
||||
if k_block_id == 0:
|
||||
topk_offsets = tl.arange(0, BLOCK_TOPK)
|
||||
topk_mask = topk_offsets < top_k
|
||||
|
||||
ids = tl.load(
|
||||
topk_ids + token_id * topk_ids_stride_m + topk_offsets * topk_ids_stride_k,
|
||||
mask=topk_mask,
|
||||
other=0,
|
||||
).to(tl.int64)
|
||||
tl.store(
|
||||
topk_idx_out
|
||||
+ token_id * topk_idx_stride_m
|
||||
+ topk_offsets * topk_idx_stride_k,
|
||||
ids,
|
||||
mask=topk_mask,
|
||||
)
|
||||
|
||||
weights = tl.load(
|
||||
topk_weights
|
||||
+ token_id * topk_weights_stride_m
|
||||
+ topk_offsets * topk_weights_stride_k,
|
||||
mask=topk_mask,
|
||||
other=0.0,
|
||||
)
|
||||
tl.store(
|
||||
topk_weights_out
|
||||
+ token_id * topk_weights_out_stride_m
|
||||
+ topk_offsets * topk_weights_out_stride_k,
|
||||
weights,
|
||||
mask=topk_mask,
|
||||
)
|
||||
|
||||
|
||||
def _stage_deepseek_v4_mega_moe_inputs(
|
||||
hidden_states: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
x_fp4: torch.Tensor, # uint8, shape (M, K//2)
|
||||
x_sf: torch.Tensor, # int32, shape (M, K//64)
|
||||
topk_idx_out: torch.Tensor,
|
||||
topk_weights_out: torch.Tensor,
|
||||
) -> None:
|
||||
num_tokens, hidden_size = hidden_states.shape
|
||||
if num_tokens == 0:
|
||||
return
|
||||
if hidden_size % 128 != 0:
|
||||
raise ValueError(
|
||||
"DeepSeek V4 MegaMoE input staging requires hidden_size to be "
|
||||
"a multiple of 128."
|
||||
)
|
||||
top_k = topk_ids.shape[1]
|
||||
if topk_weights.shape != topk_ids.shape:
|
||||
raise ValueError(
|
||||
"DeepSeek V4 MegaMoE input staging requires topk_weights and "
|
||||
"topk_ids to have the same shape."
|
||||
)
|
||||
|
||||
block_k = 128
|
||||
grid = (num_tokens, triton.cdiv(hidden_size, block_k))
|
||||
block_topk = triton.next_power_of_2(top_k)
|
||||
_deepseek_v4_stage_mega_moe_inputs_kernel[grid](
|
||||
hidden_states,
|
||||
x_fp4,
|
||||
x_sf,
|
||||
topk_ids,
|
||||
topk_weights,
|
||||
topk_idx_out,
|
||||
topk_weights_out,
|
||||
hidden_states.stride(0),
|
||||
hidden_states.stride(1),
|
||||
x_fp4.stride(0),
|
||||
x_fp4.stride(1),
|
||||
x_sf.stride(0),
|
||||
x_sf.stride(1),
|
||||
topk_ids.stride(0),
|
||||
topk_ids.stride(1),
|
||||
topk_weights.stride(0),
|
||||
topk_weights.stride(1),
|
||||
topk_idx_out.stride(0),
|
||||
topk_idx_out.stride(1),
|
||||
topk_weights_out.stride(0),
|
||||
topk_weights_out.stride(1),
|
||||
hidden_size,
|
||||
top_k,
|
||||
BLOCK_K=block_k,
|
||||
GROUP_K=16, # NVFP4: group_size=16 (scale_vec::4X)
|
||||
BLOCK_TOPK=block_topk,
|
||||
num_warps=4,
|
||||
)
|
||||
@@ -1,218 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Path B: llm-compressor oneshot NVFP4 quantization for DeepSeek V4 Pro.
|
||||
|
||||
Uses sequential pipeline + activation calibration to produce W4A4 NVFP4 with
|
||||
calibrated activation global scales. Higher quality than the streaming converter
|
||||
on activation-sensitive ops, at the cost of much longer wall time and more
|
||||
fragility on a brand-new architecture.
|
||||
|
||||
Memory plan with 2.7 TB host RAM + 8x B200 (1.5 TB HBM):
|
||||
- FP8 base resident in CPU RAM: ~865 GB
|
||||
- One transformer block on GPU at a time: ~10-30 GB HBM
|
||||
- Activation calibration cache: tens to a few hundred GB
|
||||
- Headroom: ~1.5+ TB RAM, ~1.4+ TB HBM
|
||||
|
||||
Critical: this loads the model with trust_remote_code=True. V4 architecture is
|
||||
brand new; expect to need:
|
||||
- transformers from source (or recent main)
|
||||
- llm-compressor from source
|
||||
- The V4 modeling code in DeepSeek-V4-Pro-FP8/inference/ to be importable
|
||||
|
||||
Usage:
|
||||
python quantize_llmcompressor.py \\
|
||||
--src DeepSeek-V4-Pro-FP8 \\
|
||||
--dst DeepSeek-V4-Pro-NVFP4-llmcompressor \\
|
||||
--num-samples 256 \\
|
||||
--max-seq-len 4096
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--src", required=True, help="Source FP8 model directory")
|
||||
ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")
|
||||
ap.add_argument("--num-samples", type=int, default=256)
|
||||
ap.add_argument("--max-seq-len", type=int, default=4096)
|
||||
ap.add_argument("--calibration-dataset", default="HuggingFaceH4/ultrachat_200k")
|
||||
ap.add_argument(
|
||||
"--offload-folder", default="/root/nvidia-meeting/.offload",
|
||||
help="NVMe folder for accelerate disk-offload spillover (rarely needed at 2.7TB RAM)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--no-activation-quant", action="store_true",
|
||||
help="Quantize weights only (no activation calibration). Faster, closer to Path A."
|
||||
)
|
||||
args = ap.parse_args()
|
||||
|
||||
src = Path(args.src).resolve()
|
||||
dst = Path(args.dst).resolve()
|
||||
if not (src / "config.json").exists():
|
||||
sys.exit(f"No config.json at {src}")
|
||||
|
||||
Path(args.offload_folder).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Heavy imports happen here so --help is fast
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from datasets import load_dataset
|
||||
from llmcompressor import oneshot
|
||||
from llmcompressor.modifiers.quantization import QuantizationModifier
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# 1. Load model
|
||||
# ----------------------------------------------------------------------
|
||||
print(f"Loading {src} ...")
|
||||
print(" This will take several minutes — FP8 base is ~865 GB.")
|
||||
|
||||
# We want FP8 weights to stay as FP8 on CPU and only be promoted to BF16
|
||||
# when each block goes to GPU during sequential calibration. The exact
|
||||
# behavior depends on transformers' V4 modeling code — if it auto-dequants
|
||||
# on load, expect 3.2 TB BF16 in RAM and you'll spill. Watch `free -h`.
|
||||
tokenizer = AutoTokenizer.from_pretrained(src, trust_remote_code=True)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
src,
|
||||
torch_dtype="auto",
|
||||
device_map="cpu", # all on CPU; sequential pipeline moves blocks to GPU
|
||||
trust_remote_code=True,
|
||||
offload_folder=args.offload_folder,
|
||||
)
|
||||
print(f" Model class: {type(model).__name__}")
|
||||
print(f" Param count: {sum(p.numel() for p in model.parameters()):,}")
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# 2. MoE handling — replace_modules_for_calibration
|
||||
# ----------------------------------------------------------------------
|
||||
# On Llama4/Qwen3-MoE, llm-compressor needs a wrapper class that exposes
|
||||
# every expert during calibration (otherwise routed-only experts never see
|
||||
# data). For DeepSeek V4 the MoE class name is something like
|
||||
# `DeepseekV4MoE`. Try the canonical entrypoint first; fall back gracefully.
|
||||
try:
|
||||
from llmcompressor.modeling import replace_modules_for_calibration
|
||||
print("Replacing MoE modules for calibration...")
|
||||
replace_modules_for_calibration(model)
|
||||
except ImportError:
|
||||
print("WARN: replace_modules_for_calibration not available in this "
|
||||
"llm-compressor version. Routed-only experts may not see "
|
||||
"calibration data, lowering NVFP4 quality on rare experts.")
|
||||
except Exception as e:
|
||||
print(f"WARN: replace_modules_for_calibration failed: {e}")
|
||||
print(" You may need to register a custom MoE wrapper for V4. "
|
||||
"Find the MoE class name in DeepSeek-V4-Pro-FP8/inference/ and "
|
||||
"register it via llmcompressor.modeling.register_module_replacement.")
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# 3. Calibration dataset
|
||||
# ----------------------------------------------------------------------
|
||||
print(f"Loading calibration dataset {args.calibration_dataset} ...")
|
||||
ds = load_dataset(args.calibration_dataset, split="train_sft")
|
||||
ds = ds.shuffle(seed=42).select(range(args.num_samples))
|
||||
|
||||
def preprocess(example):
|
||||
# Use the model's chat template if it has one; ultrachat samples have a
|
||||
# 'messages' field already in the OpenAI shape.
|
||||
if "messages" in example:
|
||||
try:
|
||||
text = tokenizer.apply_chat_template(
|
||||
example["messages"], tokenize=False, add_generation_prompt=False
|
||||
)
|
||||
except Exception:
|
||||
text = "\n".join(m.get("content", "") for m in example["messages"])
|
||||
else:
|
||||
text = example.get("text") or example.get("prompt") or ""
|
||||
return {"text": text}
|
||||
|
||||
ds = ds.map(preprocess, remove_columns=ds.column_names)
|
||||
|
||||
def tokenize(example):
|
||||
return tokenizer(
|
||||
example["text"],
|
||||
truncation=True,
|
||||
max_length=args.max_seq_len,
|
||||
padding=False,
|
||||
return_tensors=None,
|
||||
)
|
||||
|
||||
ds = ds.map(tokenize, remove_columns=["text"])
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# 4. Recipe
|
||||
# ----------------------------------------------------------------------
|
||||
# NVFP4 W4A4 by default. The ignore list mirrors Path A's preserve list:
|
||||
# output head, embeddings, MoE router gates (NOT gate_proj!), norms, and
|
||||
# V4-specific attention indexer / mHC residual mixing weights.
|
||||
ignore = [
|
||||
"re:.*lm_head",
|
||||
"re:.*embed_tokens$",
|
||||
"re:.*\\.mlp\\.gate$",
|
||||
"re:.*\\.mlp\\.gate\\.weight$",
|
||||
"re:.*norm.*",
|
||||
"re:.*indexer.*",
|
||||
"re:.*hyper_conn.*",
|
||||
"re:.*\\.mhc.*",
|
||||
"re:.*scoring.*",
|
||||
]
|
||||
|
||||
if args.no_activation_quant:
|
||||
print("Recipe: NVFP4 weight-only (W4A16 effective)")
|
||||
recipe = QuantizationModifier(
|
||||
targets="Linear",
|
||||
scheme="NVFP4A16", # weight-only variant
|
||||
ignore=ignore,
|
||||
)
|
||||
else:
|
||||
print("Recipe: NVFP4 W4A4 with activation calibration")
|
||||
recipe = QuantizationModifier(
|
||||
targets="Linear",
|
||||
scheme="NVFP4",
|
||||
ignore=ignore,
|
||||
)
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# 5. Run oneshot — sequential pipeline is the key for memory
|
||||
# ----------------------------------------------------------------------
|
||||
print("Starting oneshot calibration + quantization (this is the long part)...")
|
||||
print(f" num_samples={args.num_samples}, max_seq_len={args.max_seq_len}")
|
||||
print(f" Watch with: watch -n 5 'free -h && nvidia-smi --query-gpu=memory.used,memory.free --format=csv'")
|
||||
|
||||
oneshot(
|
||||
model=model,
|
||||
dataset=ds,
|
||||
recipe=recipe,
|
||||
max_seq_length=args.max_seq_len,
|
||||
num_calibration_samples=args.num_samples,
|
||||
# Sequential pipeline: one block at a time on GPU, rest on CPU.
|
||||
pipeline="sequential",
|
||||
# Calibrate every expert, even routed-only ones that wouldn't see traffic.
|
||||
moe_calibrate_all_experts=True,
|
||||
)
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# 6. Save compressed
|
||||
# ----------------------------------------------------------------------
|
||||
print(f"Saving compressed checkpoint to {dst} ...")
|
||||
dst.mkdir(parents=True, exist_ok=True)
|
||||
model.save_pretrained(str(dst), save_compressed=True)
|
||||
tokenizer.save_pretrained(str(dst))
|
||||
|
||||
# Copy any extra files that save_pretrained doesn't (encoding/, inference/, PDF)
|
||||
import shutil
|
||||
for fname in src.iterdir():
|
||||
if fname.is_dir() and fname.name in {"encoding", "inference", "assets"}:
|
||||
dst_sub = dst / fname.name
|
||||
if not dst_sub.exists():
|
||||
shutil.copytree(fname, dst_sub)
|
||||
elif fname.suffix in {".pdf", ".md"} and not (dst / fname.name).exists():
|
||||
shutil.copy2(fname, dst / fname.name)
|
||||
|
||||
print("Done.")
|
||||
print(f"Output: {dst}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
compressed-tensors<0.15.0
|
||||
nvidia-modelopt[hf]
|
||||
fire
|
||||
flash-attn>=2.6.0
|
||||
transformers<5.0
|
||||
transformers_stream_generator
|
||||
zstandard
|
||||
276
scripts/dequant_fp8_to_bf16.py
Normal file
276
scripts/dequant_fp8_to_bf16.py
Normal file
@@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete dequantization of DeepSeek V4 Pro mixed-precision to pure BF16.
|
||||
|
||||
Handles ALL compressed tensor types found in the mixed-precision model:
|
||||
|
||||
1. FP8 attention weights (float8_e4m3fn + float8_e8m0fnu block scales)
|
||||
- weight × scale_expanded → BF16
|
||||
- 128×128 block quantization
|
||||
|
||||
2. FP4 (E2M1) expert weights (int8 packed + float8_e8m0fnu block scales)
|
||||
- Unpack 2 FP4 values per int8 byte (lower nibble first, upper second)
|
||||
- Dequantize via E2M1 LUT lookup × scale_expanded → BF16
|
||||
- Per-row, 32-column block scaling (MXFP4 microscaling format)
|
||||
- Output dimensions are 2× the stored dimensions
|
||||
- Verified: nibble index 0 vs 8 ratio = 0.996 (FP4 -0.0 vs +0.0),
|
||||
NOT INT4 where index 8 = -8 would be rare
|
||||
|
||||
3. FP8 shared expert weights (float8_e4m3fn + float8_e8m0fnu block scales)
|
||||
- Same as FP8 attention dequantization
|
||||
|
||||
After dequantization, all weights are pure BF16. FP8Linear.forward() sees
|
||||
element_size() > 1 and falls back to F.linear(), avoiding broken FP8 kernels
|
||||
on Blackwell GPUs. The model can then be loaded by modelopt without shape
|
||||
mismatches.
|
||||
"""
|
||||
|
||||
import os, glob, json, shutil, sys, time
|
||||
from safetensors import safe_open
|
||||
from safetensors.torch import save_file
|
||||
import torch
|
||||
|
||||
FP8_WEIGHT_DTYPE = torch.float8_e4m3fn
|
||||
FP8_SCALE_DTYPE = torch.float8_e8m0fnu
|
||||
BLOCK_SIZE_FP8 = (128, 128)
|
||||
FP4_BLOCK_SIZE = 32 # columns per scale value for MXFP4 expert weights
|
||||
|
||||
# E2M1 FP4 lookup table (MXFP4 microscaling format)
|
||||
# Index 0-7: positive values (sign=0, 2-bit exp, 1-bit mantissa)
|
||||
# Index 8-15: negative values (sign=1)
|
||||
# Mapping: 0→0, 1→0.5, 2→1, 3→1.5, 4→2, 5→3, 6→4, 7→6
|
||||
FP4_E2M1_LUT = torch.tensor([
|
||||
0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
|
||||
-0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
|
||||
], dtype=torch.float32)
|
||||
|
||||
|
||||
def dequantize_fp8_weight(fp8_weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
|
||||
"""Dequantize block-wise FP8 weight to BF16.
|
||||
|
||||
fp8_weight: (out_features, in_features) float8_e4m3fn
|
||||
scale: (out_features//128, in_features//128) float8_e8m0fnu
|
||||
"""
|
||||
scale_f32 = scale.float()
|
||||
out_features, in_features = fp8_weight.shape
|
||||
scale_expanded = scale_f32.repeat_interleave(BLOCK_SIZE_FP8[0], dim=0).repeat_interleave(BLOCK_SIZE_FP8[1], dim=1)
|
||||
scale_expanded = scale_expanded[:out_features, :in_features]
|
||||
weight_bf16 = fp8_weight.float() * scale_expanded
|
||||
return weight_bf16.to(torch.bfloat16)
|
||||
|
||||
|
||||
def dequantize_fp4_weight(int8_packed: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
|
||||
"""Dequantize MXFP4 (E2M1) expert weight to BF16.
|
||||
|
||||
FP4 values are packed 2-per-byte into int8 tensors.
|
||||
Lower nibble (bits 0-3) is the first value, upper nibble (bits 4-7) is the second.
|
||||
E2M1 format: 1 sign + 2 exponent + 1 mantissa bit.
|
||||
|
||||
Scale is per-row with 32-column blocks (float8_e8m0fnu, MX microscaling).
|
||||
Output dimensions are 2× the stored dimensions.
|
||||
|
||||
int8_packed: (out_features, in_features//2) int8
|
||||
scale: (out_features, in_features//32) float8_e8m0fnu
|
||||
returns: (out_features, in_features) bfloat16
|
||||
"""
|
||||
lut = FP4_E2M1_LUT.to(int8_packed.device)
|
||||
|
||||
# Unpack nibble indices
|
||||
lower_idx = (int8_packed & 0x0F).long() # 0-15
|
||||
upper_idx = ((int8_packed >> 4) & 0x0F).long() # 0-15
|
||||
|
||||
# LUT lookup
|
||||
lower = lut[lower_idx] # float32
|
||||
upper = lut[upper_idx] # float32
|
||||
|
||||
out_features = int8_packed.shape[0]
|
||||
in_features_full = int8_packed.shape[1] * 2 # 2× expansion
|
||||
|
||||
# Expand scale: (out_features, in_features//32) → (out_features, in_features)
|
||||
scale_f32 = scale.float()
|
||||
scale_expanded = scale_f32.repeat_interleave(FP4_BLOCK_SIZE, dim=1)
|
||||
scale_expanded = scale_expanded[:, :in_features_full]
|
||||
|
||||
# Interleave lower and upper nibbles
|
||||
unpacked = torch.empty(out_features, in_features_full, dtype=torch.float32, device=int8_packed.device)
|
||||
unpacked[:, 0::2] = lower
|
||||
unpacked[:, 1::2] = upper
|
||||
|
||||
# Dequantize: FP4 value × E8M0 scale
|
||||
bf16_weight = (unpacked * scale_expanded).to(torch.bfloat16)
|
||||
return bf16_weight
|
||||
|
||||
|
||||
def dequantize_model(model_dir: str, out_dir: str):
|
||||
os.makedirs(out_dir, exist_ok=True)
|
||||
|
||||
# Copy non-safetensor files
|
||||
print("Copying metadata files...")
|
||||
for f in os.listdir(model_dir):
|
||||
fp = os.path.join(model_dir, f)
|
||||
if not f.endswith(".safetensors") and os.path.isfile(fp):
|
||||
shutil.copy2(fp, os.path.join(out_dir, f))
|
||||
print(f" Copied {f}")
|
||||
|
||||
safetensor_files = sorted(glob.glob(os.path.join(model_dir, "*.safetensors")))
|
||||
total_shards = len(safetensor_files)
|
||||
print(f"Found {total_shards} shards")
|
||||
|
||||
# First pass: build scale-key → weight-key mapping
|
||||
print("\nScanning for weight+scale pairs...")
|
||||
scale_to_weight = {}
|
||||
for f in safetensor_files:
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
for key in sf.keys():
|
||||
if key.endswith(".scale"):
|
||||
weight_key = key[:-len(".scale")] + ".weight"
|
||||
scale_to_weight[key] = weight_key
|
||||
|
||||
weight_to_scale = {v: k for k, v in scale_to_weight.items()}
|
||||
print(f"Found {len(scale_to_weight)} weight+scale pairs")
|
||||
|
||||
# Classify weights by type (sample first 2 shards)
|
||||
fp4_weight_keys = set()
|
||||
fp8_weight_keys = set()
|
||||
scale_keys = set(scale_to_weight.keys())
|
||||
|
||||
for f in safetensor_files[:2]:
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
for key in sf.keys():
|
||||
if key in weight_to_scale:
|
||||
t = sf.get_tensor(key)
|
||||
if t.dtype == torch.int8:
|
||||
fp4_weight_keys.add(key)
|
||||
elif t.dtype == FP8_WEIGHT_DTYPE:
|
||||
fp8_weight_keys.add(key)
|
||||
|
||||
print(f" FP4 (E2M1) expert weights (packed): ~{len(fp4_weight_keys)} per shard")
|
||||
print(f" FP8 attention/shared-expert weights: ~{len(fp8_weight_keys)} per shard")
|
||||
|
||||
# Second pass: dequantize and save
|
||||
stats = {"fp4_dequantized": 0, "fp8_dequantized": 0, "scales_removed": 0, "unchanged": 0}
|
||||
start_time = time.time()
|
||||
|
||||
for i, f in enumerate(safetensor_files):
|
||||
shard_start = time.time()
|
||||
tensors = {}
|
||||
scales_in_shard = {}
|
||||
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
keys = list(sf.keys())
|
||||
|
||||
# First: collect scales
|
||||
for key in keys:
|
||||
if key in scale_keys:
|
||||
t = sf.get_tensor(key)
|
||||
scales_in_shard[key] = t
|
||||
|
||||
# Second: process weights and other tensors
|
||||
for key in keys:
|
||||
if key in scale_keys:
|
||||
continue # handled separately
|
||||
|
||||
t = sf.get_tensor(key)
|
||||
|
||||
if key in weight_to_scale and t.dtype == torch.int8:
|
||||
# FP4 (E2M1) packed expert weight (MXFP4 microscaling)
|
||||
scale_key = weight_to_scale[key]
|
||||
scale = scales_in_shard.get(scale_key)
|
||||
if scale is None:
|
||||
print(f" WARNING: scale {scale_key} not in same shard as {key}")
|
||||
tensors[key] = t # keep as-is
|
||||
continue
|
||||
bf16 = dequantize_fp4_weight(t, scale)
|
||||
tensors[key] = bf16
|
||||
stats["fp4_dequantized"] += 1
|
||||
del scales_in_shard[scale_key]
|
||||
stats["scales_removed"] += 1
|
||||
|
||||
elif key in weight_to_scale and t.dtype == FP8_WEIGHT_DTYPE:
|
||||
# FP8 weight (attention or shared expert)
|
||||
scale_key = weight_to_scale[key]
|
||||
scale = scales_in_shard.get(scale_key)
|
||||
if scale is None:
|
||||
print(f" WARNING: scale {scale_key} not in same shard as {key}")
|
||||
tensors[key] = t
|
||||
continue
|
||||
bf16 = dequantize_fp8_weight(t, scale)
|
||||
tensors[key] = bf16
|
||||
stats["fp8_dequantized"] += 1
|
||||
del scales_in_shard[scale_key]
|
||||
stats["scales_removed"] += 1
|
||||
|
||||
else:
|
||||
# Regular tensor (BF16, FP32, int64, etc.) - keep as-is
|
||||
tensors[key] = t
|
||||
stats["unchanged"] += 1
|
||||
|
||||
# Remove unused scales
|
||||
for sk in scales_in_shard:
|
||||
stats["scales_removed"] += 1
|
||||
|
||||
out_path = os.path.join(out_dir, os.path.basename(f))
|
||||
if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
|
||||
# Resume: skip already-dequantized shards
|
||||
print(f"[{i+1}/{total_shards}] Skipping (already done): {os.path.basename(f)}")
|
||||
del tensors, scales_in_shard
|
||||
continue
|
||||
save_file(tensors, out_path)
|
||||
|
||||
shard_time = time.time() - shard_start
|
||||
elapsed = time.time() - start_time
|
||||
rate = (i + 1) / elapsed if elapsed > 0 else 0
|
||||
eta = (total_shards - i - 1) / rate if rate > 0 else 0
|
||||
|
||||
print(f"[{i+1}/{total_shards}] {os.path.basename(f)} "
|
||||
f"({stats['fp4_dequantized']} fp4, {stats['fp8_dequantized']} fp8, "
|
||||
f"{stats['scales_removed']} scales rm) "
|
||||
f"[{shard_time:.1f}s, ETA: {eta/60:.0f}min]")
|
||||
|
||||
del tensors, scales_in_shard
|
||||
|
||||
# Update config
|
||||
cfg_path = os.path.join(out_dir, "config.json")
|
||||
if os.path.exists(cfg_path):
|
||||
cfg = json.load(open(cfg_path))
|
||||
cfg["torch_dtype"] = "bfloat16"
|
||||
cfg["_experts_implementation"] = "eager"
|
||||
if "quantization_config" in cfg:
|
||||
del cfg["quantization_config"]
|
||||
json.dump(cfg, open(cfg_path, "w"), indent=2)
|
||||
print(f"\nUpdated config.json: torch_dtype=bfloat16, _experts_implementation=eager")
|
||||
|
||||
total_time = time.time() - start_time
|
||||
print(f"\nDone in {total_time/60:.1f} minutes!")
|
||||
print(f" FP4 expert weights dequantized: {stats['fp4_dequantized']}")
|
||||
print(f" FP8 weights dequantized: {stats['fp8_dequantized']}")
|
||||
print(f" Scale tensors removed: {stats['scales_removed']}")
|
||||
print(f" Unchanged tensors: {stats['unchanged']}")
|
||||
|
||||
# Verify no FP8/INT8 remaining
|
||||
print("\nVerifying...")
|
||||
remaining_compressed = 0
|
||||
for f in sorted(glob.glob(os.path.join(out_dir, "*.safetensors")))[:5]:
|
||||
with safe_open(f, framework="pt") as sf:
|
||||
for key in sf.keys():
|
||||
t = sf.get_tensor(key)
|
||||
if t.dtype in (torch.float8_e8m0fnu, torch.float8_e4m3fn, torch.int8):
|
||||
remaining_compressed += 1
|
||||
if remaining_compressed <= 5:
|
||||
print(f" REMAINING: {key} {t.dtype} {t.shape}")
|
||||
if remaining_compressed == 0:
|
||||
print(" ✅ No compressed tensors remaining — model is pure BF16!")
|
||||
else:
|
||||
print(f" ⚠️ {remaining_compressed} compressed tensors still present")
|
||||
|
||||
out_size = sum(os.path.getsize(os.path.join(out_dir, f)) for f in os.listdir(out_dir) if f.endswith(".safetensors"))
|
||||
print(f"Output size: {out_size / 1e12:.2f} TB")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Complete dequantization of DeepSeek V4 Pro to BF16")
|
||||
parser.add_argument("model_dir", help="Path to mixed-precision model")
|
||||
parser.add_argument("out_dir", help="Path to write dequantized BF16 model")
|
||||
args = parser.parse_args()
|
||||
dequantize_model(args.model_dir, args.out_dir)
|
||||
587
scripts/quantize_nvfp4.py
Normal file
587
scripts/quantize_nvfp4.py
Normal file
@@ -0,0 +1,587 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DeepSeek V4 Pro → NVFP4 quantization — defensive edition.
|
||||
|
||||
This script:
|
||||
1. Applies runtime patches for GPU tensor safety (before modelopt runs)
|
||||
2. Calls the SAME hf_ptq.py pipeline that the shell script uses
|
||||
3. After calibration, snapshots amax to CPU and saves model state
|
||||
|
||||
The key insight: we don't rewrite the pipeline. We let hf_ptq do its thing
|
||||
with all its args, defaults, and edge cases handled correctly. We just add
|
||||
our defensive patches and post-calibration saves.
|
||||
|
||||
Must be run from the modelopt example directory:
|
||||
cd /root/nvidia-meeting/modelopt-repo/examples/llm_ptq
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py
|
||||
|
||||
Usage:
|
||||
# Full run (calibrate + export):
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py
|
||||
|
||||
# Re-run export only (after a calibration save exists):
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --export-only
|
||||
|
||||
# Validate saved calibration state (check amax values):
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/quantize_nvfp4.py --validate-only
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-BF16"
|
||||
QUANT = "nvfp4"
|
||||
TP = 8
|
||||
CALIB_SIZE = 128
|
||||
CALIB_SEQ = 512
|
||||
KV_CACHE_QUANT = "fp8_cast"
|
||||
GPU_MEM_PCT = 0.7
|
||||
|
||||
HF_TOKEN = "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"
|
||||
|
||||
# Paths
|
||||
EXAMPLE_DIR = "/root/nvidia-meeting/modelopt-repo/examples/llm_ptq"
|
||||
EXPORT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
|
||||
CALIB_SAVE_PATH = "/root/nvidia-meeting/v4_nvfp4_calibrated_state.pt"
|
||||
AMAX_SNAPSHOT_PATH = "/root/nvidia-meeting/v4_nvfp4_amax_snapshots.pt"
|
||||
|
||||
|
||||
def apply_patches():
|
||||
"""Apply runtime patches for V4 compatibility and GPU tensor safety.
|
||||
|
||||
Root cause of all export crashes: use_seq_device_map keeps model weights on GPU
|
||||
for 5+ hours during calibration. By export time, CUDA's memory allocator has
|
||||
recycled the underlying memory, so any read of those GPU tensors triggers
|
||||
cudaErrorIllegalAddress.
|
||||
|
||||
Fix strategy: patch at the EARLIEST possible entry points to force stale GPU
|
||||
tensors to CPU before any downstream code reads them. This covers the full
|
||||
chain of execution we traced through the export path:
|
||||
|
||||
_process_quantized_modules
|
||||
→ _export_quantized_weight (or _export_fused_experts)
|
||||
→ get_weight_scaling_factor
|
||||
→ get_weights_scaling_factor_from_quantizer (reads weight, _amax, global_amax)
|
||||
→ NVFP4QTensor.get_weights_scaling_factor (dynamic: reduce_block_amax on weight)
|
||||
→ get_weight_scaling_factor_2 (reads _amax, global_amax)
|
||||
→ get_activation_scaling_factor (reads _amax) [already patched]
|
||||
→ to_quantized_weight (reads weight, does .to(weight.device) on scaling factors)
|
||||
→ weight.to(dtype) (reads weight)
|
||||
|
||||
By forcing weight to CPU in Patch 4 (_export_quantized_weight), ALL downstream
|
||||
.to(weight.device) calls resolve to CPU. Patches 5-8 are belt-and-suspenders.
|
||||
"""
|
||||
|
||||
from modelopt.torch.quantization.nn.modules import tensor_quantizer as tq_module
|
||||
from modelopt.torch.quantization.qtensor import nvfp4_tensor
|
||||
from modelopt.torch.export import quant_utils
|
||||
from modelopt.torch.quantization.utils import quantizer_attr_names as _quantizer_attr_names
|
||||
import modelopt.torch.export.unified_export_hf as uehf
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 1: load_calib_amax — force _amax to CPU immediately after calibration
|
||||
# This runs during calibration, right after each quantizer finishes.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_load_calib_amax = tq_module.TensorQuantizer.load_calib_amax
|
||||
|
||||
def patched_load_calib_amax(self, *args, **kwargs):
|
||||
orig_load_calib_amax(self, *args, **kwargs)
|
||||
if hasattr(self, '_amax') and self._amax is not None:
|
||||
self._amax = self._amax.cpu()
|
||||
|
||||
tq_module.TensorQuantizer.load_calib_amax = patched_load_calib_amax
|
||||
print("✓ Patch 1: TensorQuantizer.load_calib_amax → force _amax to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 2: export_amax — CPU safety net at export time
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_export_amax = tq_module.TensorQuantizer.export_amax
|
||||
|
||||
def patched_export_amax(self):
|
||||
if hasattr(self, '_amax') and self._amax is not None and self._amax.is_cuda:
|
||||
self._amax = self._amax.cpu()
|
||||
return orig_export_amax(self)
|
||||
|
||||
tq_module.TensorQuantizer.export_amax = patched_export_amax
|
||||
print("✓ Patch 2: TensorQuantizer.export_amax → CPU fallback")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 3: get_activation_scaling_factor — CPU + clamp
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
@classmethod
|
||||
def patched_get_activation_scaling_factor(cls, quantizer):
|
||||
if not quantizer.is_enabled:
|
||||
return None
|
||||
try:
|
||||
amax = quantizer.export_amax()
|
||||
except (torch.cuda.CudaError, RuntimeError) as e:
|
||||
print(f" WARNING: export_amax() failed ({e}), attempting CPU recovery...")
|
||||
if hasattr(quantizer, '_amax') and quantizer._amax is not None:
|
||||
quantizer._amax = quantizer._amax.cpu()
|
||||
amax = quantizer.export_amax()
|
||||
|
||||
if amax is None:
|
||||
return None
|
||||
amax = amax.cpu()
|
||||
activation_scaling_factor = amax.float() / (quantizer.maxbound * 448.0)
|
||||
|
||||
if not torch.all(activation_scaling_factor > 0):
|
||||
n_bad = (activation_scaling_factor <= 0).sum().item()
|
||||
n_total = activation_scaling_factor.numel()
|
||||
print(f" WARNING: {n_bad}/{n_total} activation scaling factors <= 0, clamping")
|
||||
activation_scaling_factor = activation_scaling_factor.clamp(min=torch.finfo(torch.float32).tiny)
|
||||
|
||||
return activation_scaling_factor
|
||||
|
||||
nvfp4_tensor.NVFP4QTensor.get_activation_scaling_factor = patched_get_activation_scaling_factor
|
||||
print("✓ Patch 3: NVFP4QTensor.get_activation_scaling_factor → CPU + clamp")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 4: _export_quantized_weight — THE KEY PATCH
|
||||
#
|
||||
# This is the entry point for exporting each quantized module. It reads
|
||||
# `weight = getattr(sub_module, weight_name)` which is on a stale GPU.
|
||||
# By moving weight to CPU right here, ALL downstream functions are safe:
|
||||
# - get_weight_scaling_factor: weight.device is now CPU
|
||||
# - get_weights_scaling_factor: operates on CPU weight
|
||||
# - to_quantized_weight: .to(weight.device) stays on CPU
|
||||
# - weight.to(dtype): CPU cast
|
||||
# We also force all quantizer state to CPU for the same reason.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_export_quantized_weight = uehf._export_quantized_weight
|
||||
|
||||
def patched_export_quantized_weight(sub_module, dtype, weight_name="weight"):
|
||||
# Move weight to CPU (stale GPU → safe CPU)
|
||||
weight = getattr(sub_module, weight_name, None)
|
||||
if weight is not None and isinstance(weight, torch.Tensor) and weight.is_cuda:
|
||||
try:
|
||||
weight_cpu = weight.cpu()
|
||||
with torch.no_grad():
|
||||
setattr(sub_module, weight_name, torch.nn.Parameter(weight_cpu))
|
||||
except (torch.cuda.CudaError, RuntimeError) as e:
|
||||
print(f" WARNING: weight.cpu() failed for {weight_name} ({e})")
|
||||
raise
|
||||
|
||||
# Force all quantizer state to CPU
|
||||
qattrs = _quantizer_attr_names(weight_name)
|
||||
for qattr in [qattrs.weight_quantizer, qattrs.input_quantizer, qattrs.output_quantizer]:
|
||||
if not qattr:
|
||||
continue
|
||||
quantizer = getattr(sub_module, qattr, None)
|
||||
if quantizer is None:
|
||||
continue
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(quantizer, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(quantizer, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
# Handle SequentialQuantizer (W4A8 path)
|
||||
if hasattr(quantizer, 'quantizers'):
|
||||
for sub_q in quantizer.quantizers:
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(sub_q, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(sub_q, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
|
||||
return orig_export_quantized_weight(sub_module, dtype, weight_name)
|
||||
|
||||
uehf._export_quantized_weight = patched_export_quantized_weight
|
||||
print("✓ Patch 4: _export_quantized_weight → force weight + quantizer state to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 5: _export_fused_experts — same treatment for MoE expert weights
|
||||
# DeepseekV4Experts go through this different code path.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_export_fused_experts = uehf._export_fused_experts
|
||||
|
||||
def patched_export_fused_experts(sub_module, dtype):
|
||||
# Force all expert weights to CPU
|
||||
for name, param in list(sub_module.named_parameters()):
|
||||
if isinstance(param, torch.Tensor) and param.is_cuda:
|
||||
try:
|
||||
with torch.no_grad():
|
||||
setattr(sub_module, name, torch.nn.Parameter(param.cpu()))
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
# Force all buffers to CPU
|
||||
for name, buf in list(sub_module.named_buffers()):
|
||||
if isinstance(buf, torch.Tensor) and buf.is_cuda:
|
||||
try:
|
||||
sub_module.register_buffer(name, buf.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
# Force all quantizer state to CPU
|
||||
for mod in sub_module.modules():
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(mod, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(mod, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
return orig_export_fused_experts(sub_module, dtype)
|
||||
|
||||
uehf._export_fused_experts = patched_export_fused_experts
|
||||
print("✓ Patch 5: _export_fused_experts → force expert weights + quantizer state to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 6: to_quantized_weight — force scaling factors to CPU
|
||||
# This does .to(weight.device) on scaling factors. With weight now on
|
||||
# CPU (Patch 4), this should be a no-op, but belt-and-suspenders.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_to_quantized_weight = quant_utils.to_quantized_weight
|
||||
|
||||
def patched_to_quantized_weight(weight, weights_scaling_factor, quantization,
|
||||
weights_scaling_factor2=None, block_size=None):
|
||||
if isinstance(weight, torch.Tensor) and weight.is_cuda:
|
||||
weight = weight.cpu()
|
||||
if weights_scaling_factor is not None and isinstance(weights_scaling_factor, torch.Tensor) and weights_scaling_factor.is_cuda:
|
||||
weights_scaling_factor = weights_scaling_factor.cpu()
|
||||
if weights_scaling_factor2 is not None and isinstance(weights_scaling_factor2, torch.Tensor) and weights_scaling_factor2.is_cuda:
|
||||
weights_scaling_factor2 = weights_scaling_factor2.cpu()
|
||||
return orig_to_quantized_weight(weight, weights_scaling_factor, quantization,
|
||||
weights_scaling_factor2, block_size)
|
||||
|
||||
quant_utils.to_quantized_weight = patched_to_quantized_weight
|
||||
print("✓ Patch 6: to_quantized_weight → force all tensors to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 7: get_weight_scaling_factor — force weight + quantizer to CPU
|
||||
# Belt and suspenders: Patch 4 should handle this, but this is also
|
||||
# called from other code paths.
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_get_weight_scaling_factor = quant_utils.get_weight_scaling_factor
|
||||
|
||||
def patched_get_weight_scaling_factor(module, weight_name="weight"):
|
||||
weight = getattr(module, weight_name, None)
|
||||
if weight is not None and isinstance(weight, torch.Tensor) and weight.is_cuda:
|
||||
try:
|
||||
with torch.no_grad():
|
||||
setattr(module, weight_name, torch.nn.Parameter(weight.cpu()))
|
||||
except (torch.cuda.CudaError, RuntimeError) as e:
|
||||
print(f" WARNING: weight.cpu() failed in get_weight_scaling_factor ({e})")
|
||||
raise
|
||||
weight_quantizer = getattr(module, _quantizer_attr_names(weight_name).weight_quantizer, None)
|
||||
if weight_quantizer is not None:
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(weight_quantizer, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(weight_quantizer, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
return orig_get_weight_scaling_factor(module, weight_name)
|
||||
|
||||
quant_utils.get_weight_scaling_factor = patched_get_weight_scaling_factor
|
||||
print("✓ Patch 7: get_weight_scaling_factor → force weight + quantizer to CPU")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Patch 8: get_weight_scaling_factor_2 — force quantizer to CPU
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
orig_get_weight_scaling_factor_2 = quant_utils.get_weight_scaling_factor_2
|
||||
|
||||
def patched_get_weight_scaling_factor_2(module, weight_name="weight"):
|
||||
weight_quantizer = getattr(module, _quantizer_attr_names(weight_name).weight_quantizer, None)
|
||||
if weight_quantizer is not None:
|
||||
for attr in ['_amax', '_pre_quant_scale', 'global_amax', '_global_amax']:
|
||||
val = getattr(weight_quantizer, attr, None)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
try:
|
||||
setattr(weight_quantizer, attr, val.cpu())
|
||||
except (torch.cuda.CudaError, RuntimeError):
|
||||
pass
|
||||
return orig_get_weight_scaling_factor_2(module, weight_name)
|
||||
|
||||
quant_utils.get_weight_scaling_factor_2 = patched_get_weight_scaling_factor_2
|
||||
print("✓ Patch 8: get_weight_scaling_factor_2 → force quantizer to CPU")
|
||||
|
||||
|
||||
def snapshot_amax_to_cpu(model, snapshot_path):
|
||||
"""Walk all quantizers, copy _amax to CPU, save to disk."""
|
||||
from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
|
||||
|
||||
print(f"\nSnapshotting quantizer _amax to CPU...")
|
||||
t0 = time.time()
|
||||
snapshots = {}
|
||||
n_moved = 0
|
||||
|
||||
for name, module in model.named_modules():
|
||||
if not isinstance(module, TensorQuantizer):
|
||||
continue
|
||||
if hasattr(module, '_amax') and module._amax is not None:
|
||||
amax_cpu = module._amax.detach().cpu().clone()
|
||||
snapshots[name] = amax_cpu
|
||||
module._amax.data.copy_(amax_cpu)
|
||||
n_moved += 1
|
||||
|
||||
torch.save(snapshots, snapshot_path)
|
||||
size_mb = os.path.getsize(snapshot_path) / (1024**2)
|
||||
print(f"✓ Snapshotted {n_moved} quantizer _amax tensors to CPU ({time.time()-t0:.1f}s)")
|
||||
print(f" Saved to: {snapshot_path} ({size_mb:.1f} MB)")
|
||||
return snapshots
|
||||
|
||||
|
||||
def restore_amax_from_snapshot(model, snapshot_path):
|
||||
"""Restore _amax from a previously saved CPU snapshot."""
|
||||
from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
|
||||
|
||||
print(f"Restoring _amax from snapshot: {snapshot_path}")
|
||||
snapshots = torch.load(snapshot_path, map_location='cpu')
|
||||
n_restored = 0
|
||||
|
||||
for name, module in model.named_modules():
|
||||
if not isinstance(module, TensorQuantizer):
|
||||
continue
|
||||
if name in snapshots and hasattr(module, '_amax'):
|
||||
module._amax.data.copy_(snapshots[name].to(module._amax.device))
|
||||
n_restored += 1
|
||||
|
||||
print(f"✓ Restored {n_restored} _amax tensors from snapshot")
|
||||
|
||||
|
||||
def force_all_amax_to_cpu(model):
|
||||
"""Force ALL quantizer tensors to CPU."""
|
||||
from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
|
||||
|
||||
count = 0
|
||||
for name, module in model.named_modules():
|
||||
if not isinstance(module, TensorQuantizer):
|
||||
continue
|
||||
for attr in ['_amax', '_pre_quant_scale', '_global_amax']:
|
||||
if hasattr(module, attr):
|
||||
val = getattr(module, attr)
|
||||
if val is not None and isinstance(val, torch.Tensor) and val.is_cuda:
|
||||
setattr(module, attr, val.cpu())
|
||||
count += 1
|
||||
print(f"✓ Forced {count} quantizer tensors to CPU")
|
||||
|
||||
|
||||
def save_calibrated_state(model, path):
|
||||
"""Save model state dict after calibration."""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"SAVING CALIBRATED STATE → {path}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
start = time.time()
|
||||
state = {
|
||||
'model_state_dict': model.state_dict(),
|
||||
'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
}
|
||||
torch.save(state, path)
|
||||
size_gb = os.path.getsize(path) / (1024**3)
|
||||
print(f"✓ Saved calibrated state: {size_gb:.1f} GB ({time.time()-start:.0f}s)")
|
||||
print(f" Path: {path}")
|
||||
print(f" Re-run with --export-only to retry export.\n")
|
||||
|
||||
|
||||
def run_calibration(model_path, export_dir, calib_save_path, amax_snapshot_path, calib_size, calib_seq):
|
||||
"""Full pipeline: parse args via hf_ptq → load → quantize → snapshot → save → export."""
|
||||
|
||||
os.chdir(EXAMPLE_DIR)
|
||||
sys.path.insert(0, EXAMPLE_DIR)
|
||||
|
||||
os.environ["HF_TOKEN"] = HF_TOKEN
|
||||
os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
|
||||
|
||||
from hf_ptq import parse_args, main as hf_main
|
||||
|
||||
apply_patches()
|
||||
|
||||
# ── Build args using hf_ptq's own parser ──
|
||||
# This guarantees ALL attributes exist with correct defaults.
|
||||
# We temporarily replace sys.argv so parse_args() sees our config.
|
||||
saved_argv = sys.argv
|
||||
sys.argv = [
|
||||
"hf_ptq.py",
|
||||
"--pyt_ckpt_path", model_path,
|
||||
"--qformat", QUANT,
|
||||
"--calib_size", str(calib_size),
|
||||
"--calib_seq", str(calib_seq),
|
||||
"--kv_cache_qformat", KV_CACHE_QUANT,
|
||||
"--inference_tensor_parallel", str(TP),
|
||||
"--export_path", export_dir,
|
||||
"--trust_remote_code",
|
||||
"--use_seq_device_map",
|
||||
"--gpu_max_mem_percentage", str(GPU_MEM_PCT),
|
||||
"--batch_size", "0",
|
||||
]
|
||||
args = parse_args()
|
||||
sys.argv = saved_argv
|
||||
|
||||
# Apply the same post-parse conversions that hf_ptq's __main__ block does
|
||||
# (these normally run between parse_args() and main() in the original script,
|
||||
# but since we call main() directly, we have to do them ourselves)
|
||||
args.dataset = args.dataset.split(",") if isinstance(args.dataset, str) else args.dataset
|
||||
args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
|
||||
|
||||
# ── Post-calibration hook ──
|
||||
# We monkey-patch export_quantized to add our defensive saves before export.
|
||||
import hf_ptq
|
||||
|
||||
orig_export_quantized = hf_ptq.export_quantized
|
||||
|
||||
def patched_export_quantized(exp_args, full_model, language_model, model_type,
|
||||
tokenizer, default_padding_side, default_pad_token):
|
||||
"""Wrapper that snapshots amax and saves state before calling the real export."""
|
||||
print("\n" + "="*60)
|
||||
print("POST-CALIBRATION: Snapshotting amax and saving state")
|
||||
print("="*60)
|
||||
|
||||
# Snapshot amax to CPU
|
||||
snapshot_amax_to_cpu(language_model, amax_snapshot_path)
|
||||
|
||||
# Force all quantizer state to CPU
|
||||
force_all_amax_to_cpu(language_model)
|
||||
|
||||
# Free GPU memory
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
# Save calibrated state
|
||||
save_calibrated_state(language_model, calib_save_path)
|
||||
|
||||
# Now run the real export
|
||||
orig_export_quantized(exp_args, full_model, language_model, model_type,
|
||||
tokenizer, default_padding_side, default_pad_token)
|
||||
|
||||
hf_ptq.export_quantized = patched_export_quantized
|
||||
print("✓ Hooked export_quantized with amax snapshot + state save")
|
||||
|
||||
# ── Run hf_ptq's full pipeline ──
|
||||
# This handles model loading, quantization, calibration, and export
|
||||
# using the exact same code path as the shell script.
|
||||
hf_main(args)
|
||||
|
||||
|
||||
def run_export_only(calib_save_path, amax_snapshot_path, model_path, export_dir):
|
||||
"""Load saved calibration state and run export only."""
|
||||
|
||||
os.chdir(EXAMPLE_DIR)
|
||||
sys.path.insert(0, EXAMPLE_DIR)
|
||||
|
||||
os.environ["HF_TOKEN"] = HF_TOKEN
|
||||
os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
|
||||
|
||||
apply_patches()
|
||||
|
||||
from example_utils import get_model, get_tokenizer
|
||||
|
||||
print(f"Loading model from {model_path}...")
|
||||
model = get_model(
|
||||
model_path,
|
||||
device="cpu",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
tokenizer = get_tokenizer(model_path, trust_remote_code=True)
|
||||
|
||||
print(f"Loading calibrated state from {calib_save_path}...")
|
||||
state = torch.load(calib_save_path, map_location='cpu')
|
||||
model.load_state_dict(state['model_state_dict'])
|
||||
print(f"✓ Loaded calibrated state (saved at {state['timestamp']})")
|
||||
|
||||
force_all_amax_to_cpu(model)
|
||||
if amax_snapshot_path and os.path.exists(amax_snapshot_path):
|
||||
restore_amax_from_snapshot(model, amax_snapshot_path)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
from modelopt.torch.export import export_hf_checkpoint
|
||||
from hf_ptq import load_mtp_weights, copy_custom_model_files
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"EXPORTING → {export_dir}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
t0 = time.time()
|
||||
try:
|
||||
mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(model, model_path)
|
||||
if mtp_layer_prefixes:
|
||||
model._mtp_layer_prefixes = mtp_layer_prefixes
|
||||
|
||||
export_hf_checkpoint(model, export_dir=export_dir, extra_state_dict=mtp_state_dict)
|
||||
tokenizer.save_pretrained(export_dir)
|
||||
copy_custom_model_files(model_path, export_dir, True)
|
||||
print(f"\n✓ Export complete in {time.time()-t0:.0f}s → {export_dir}")
|
||||
except Exception as e:
|
||||
print(f"\n✗ EXPORT FAILED: {e}")
|
||||
print(f" Calibrated state: {CALIB_SAVE_PATH}")
|
||||
print(f" Amax snapshots: {AMAX_SNAPSHOT_PATH}")
|
||||
raise
|
||||
|
||||
|
||||
def run_validate(calib_save_path, amax_snapshot_path):
|
||||
"""Validate saved calibration state — check amax values are valid."""
|
||||
print(f"\nValidating calibration state...")
|
||||
|
||||
if os.path.exists(amax_snapshot_path):
|
||||
snapshots = torch.load(amax_snapshot_path, map_location='cpu')
|
||||
n_total = len(snapshots)
|
||||
n_valid = n_zero = n_nan = n_neg = 0
|
||||
|
||||
for name, amax in snapshots.items():
|
||||
if torch.any(torch.isnan(amax)):
|
||||
n_nan += 1
|
||||
elif torch.any(amax < 0):
|
||||
n_neg += 1
|
||||
elif torch.all(amax == 0):
|
||||
n_zero += 1
|
||||
else:
|
||||
n_valid += 1
|
||||
|
||||
print(f"\nAmax snapshot validation:")
|
||||
print(f" Total: {n_total} Valid: {n_valid} Zero: {n_zero} Neg: {n_neg} NaN: {n_nan}")
|
||||
if n_valid == n_total:
|
||||
print(f"\n✓ All {n_total} amax snapshots are valid!")
|
||||
else:
|
||||
print(f"\n✗ {n_total - n_valid} quantizers have invalid amax!")
|
||||
else:
|
||||
print(f"✗ No amax snapshot found at {amax_snapshot_path}")
|
||||
|
||||
if os.path.exists(calib_save_path):
|
||||
size_gb = os.path.getsize(calib_save_path) / (1024**3)
|
||||
print(f"\nCalibrated state: {calib_save_path} ({size_gb:.1f} GB)")
|
||||
else:
|
||||
print(f"\n✗ No calibrated state found at {calib_save_path}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="DeepSeek V4 Pro NVFP4 Quantization")
|
||||
parser.add_argument("--export-only", action="store_true",
|
||||
help="Skip calibration, load saved state and run export only")
|
||||
parser.add_argument("--validate-only", action="store_true",
|
||||
help="Validate saved calibration state without running anything")
|
||||
parser.add_argument("--model", default=MODEL, help="Path to BF16 model")
|
||||
parser.add_argument("--export-dir", default=EXPORT_DIR, help="Export output directory")
|
||||
parser.add_argument("--calib-save", default=CALIB_SAVE_PATH, help="Calibration state save path")
|
||||
parser.add_argument("--amax-snapshot", default=AMAX_SNAPSHOT_PATH, help="Amax snapshot path")
|
||||
parser.add_argument("--calib-size", type=int, default=CALIB_SIZE, help="Calibration samples")
|
||||
parser.add_argument("--calib-seq", type=int, default=CALIB_SEQ, help="Calibration sequence length")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.validate_only:
|
||||
run_validate(args.calib_save, args.amax_snapshot)
|
||||
elif args.export_only:
|
||||
if not os.path.exists(args.calib_save):
|
||||
print(f"ERROR: No calibration state found at {args.calib_save}")
|
||||
sys.exit(1)
|
||||
run_export_only(args.calib_save, args.amax_snapshot, args.model, args.export_dir)
|
||||
else:
|
||||
run_calibration(args.model, args.export_dir, args.calib_save,
|
||||
args.amax_snapshot, args.calib_size, args.calib_seq)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
100
scripts/serve_vllm.py
Normal file
100
scripts/serve_vllm.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DeepSeek V4 Pro NVFP4 — vLLM OpenAI-compatible server.
|
||||
|
||||
Run from the venv on the B200 node:
|
||||
source /root/nvidia-meeting/venv/bin/activate
|
||||
python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py
|
||||
|
||||
Or in the background:
|
||||
nohup python3 /root/nvidia-meeting/deepseek-v4-quant/scripts/serve_vllm.py \
|
||||
> /root/nvidia-meeting/vllm_serve.log 2>&1 &
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# ── Patch: Add compress_ratios to DeepseekV4Config ──────────────────────────
|
||||
# transformers 5.8.0 renamed compress_ratios → compress_rates (dict format).
|
||||
# vllm 0.20.2 still expects compress_ratios as a list indexed by layer_id.
|
||||
# We patch the Config class to expose compress_ratios as a property that
|
||||
# converts the new dict format back to the list format vllm expects.
|
||||
import transformers
|
||||
try:
|
||||
from transformers import DeepseekV4Config
|
||||
|
||||
_orig_init = DeepseekV4Config.__init__
|
||||
|
||||
def _patched_init(self, *args, **kwargs):
|
||||
_orig_init(self, *args, **kwargs)
|
||||
# If compress_ratios already exists as a list, leave it alone
|
||||
if hasattr(self, 'compress_ratios') and isinstance(self.compress_ratios, list):
|
||||
return
|
||||
# Convert compress_rates dict → compress_ratios list
|
||||
if hasattr(self, 'compress_rates') and isinstance(self.compress_rates, dict):
|
||||
rates = self.compress_rates
|
||||
# Build per-layer list from the dict schema
|
||||
# V4 pattern: layers 0-1=128, then alternating 4/128, last=0
|
||||
n_layers = getattr(self, 'num_hidden_layers', 61)
|
||||
cr = rates.get('compressed_sparse_attention', 4)
|
||||
hr = rates.get('heavily_compressed_attention', 128)
|
||||
ratios = []
|
||||
for i in range(n_layers):
|
||||
if i < 2:
|
||||
ratios.append(hr)
|
||||
elif i == n_layers - 1:
|
||||
ratios.append(0)
|
||||
else:
|
||||
ratios.append(cr if i % 2 == 0 else hr)
|
||||
self.compress_ratios = ratios
|
||||
elif hasattr(self, 'compress_rates') and isinstance(self.compress_rates, list):
|
||||
self.compress_ratios = self.compress_rates
|
||||
|
||||
DeepseekV4Config.__init__ = _patched_init
|
||||
print("✓ Patched DeepseekV4Config.__init__ to add compress_ratios")
|
||||
except ImportError:
|
||||
print("⚠ DeepseekV4Config not found, skipping compress_ratios patch")
|
||||
|
||||
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
|
||||
|
||||
# These flags are critical for V4 — do not change without understanding why:
|
||||
# --trust-remote-code V4 needs custom modeling code
|
||||
# --kv-cache-dtype fp8 Match our kv_cache_qformat=fp8_cast quantization
|
||||
# --block-size 256 V4 recommended block size
|
||||
# --enable-expert-parallel Distribute expert computation across GPUs (critical for 256-expert MoE)
|
||||
# --tensor-parallel-size 8 8× B200
|
||||
# --compilation-config CUDA graphs for throughput — FULL_AND_PIECEWISE + all custom ops
|
||||
# --attention_config FP4 indexer cache for V4 MLA attention
|
||||
# --moe-backend deep_gemm_mega_moe — optimized MoE kernel for Blackwell
|
||||
# --tokenizer-mode deepseek_v4 — V4-specific tokenizer
|
||||
# --tool-call-parser deepseek_v4 — native tool calling
|
||||
# --enable-auto-tool-choice Auto tool choice for function calling
|
||||
# --reasoning-parser deepseek_v4 — reasoning/thinking output parsing
|
||||
# --speculative_config MTP speculative decoding (2 speculative tokens)
|
||||
|
||||
cmd = [
|
||||
sys.executable, "-m", "vllm.entrypoints.openai.api_server",
|
||||
"--model", MODEL,
|
||||
"--trust-remote-code",
|
||||
"--kv-cache-dtype", "fp8",
|
||||
"--block-size", "256",
|
||||
"--enable-expert-parallel",
|
||||
"--tensor-parallel-size", "8",
|
||||
"--compilation-config", '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}',
|
||||
"--attention_config.use_fp4_indexer_cache=True",
|
||||
"--moe-backend", "deep_gemm_mega_moe", # WARN: No NVFP4 mega_moe kernel. Use docker-compose (omits this flag) instead.
|
||||
"--tokenizer-mode", "deepseek_v4",
|
||||
"--tool-call-parser", "deepseek_v4",
|
||||
"--enable-auto-tool-choice",
|
||||
"--reasoning-parser", "deepseek_v4",
|
||||
"--speculative_config", '{"method":"mtp","num_speculative_tokens":2}',
|
||||
"--host", "0.0.0.0",
|
||||
"--port", "8000",
|
||||
]
|
||||
|
||||
print(f"Starting vLLM server for {MODEL}")
|
||||
print(f"Command: {' '.join(cmd)}")
|
||||
print(f"Log: /root/nvidia-meeting/vllm_serve.log")
|
||||
print()
|
||||
|
||||
sys.exit(subprocess.call(cmd))
|
||||
262
tmp/apply_all_fixes.py
Normal file
262
tmp/apply_all_fixes.py
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/python3
|
||||
"""
|
||||
Apply ALL fixes to the S11 base version of deepseek_v4.py.
|
||||
This is a clean application of all fixes we've developed.
|
||||
"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
import ast
|
||||
|
||||
def check(c, label):
|
||||
try:
|
||||
ast.parse(c)
|
||||
print(f" {label}: OK")
|
||||
return True
|
||||
except SyntaxError as e:
|
||||
print(f" {label}: SYNTAX ERROR at line {e.lineno}: {e.msg}")
|
||||
return False
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# FIX 1: Substr mapping — remove .mla_attn. from attn projections
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
subs = {
|
||||
'".self_attn.q_a_proj.": ".attn.mla_attn.wq_a."': '".self_attn.q_a_proj.": ".attn.wq_a."',
|
||||
'".self_attn.q_b_proj.": ".attn.mla_attn.wq_b."': '".self_attn.q_b_proj.": ".attn.wq_b."',
|
||||
'".self_attn.q_a_norm.": ".attn.mla_attn.q_norm."': '".self_attn.q_a_norm.": ".attn.q_norm."',
|
||||
'".self_attn.o_a_proj.": ".attn.mla_attn.wo_a."': '".self_attn.o_a_proj.": ".attn.wo_a."',
|
||||
'".self_attn.o_b_proj.": ".attn.mla_attn.wo_b."': '".self_attn.o_b_proj.": ".attn.wo_b."',
|
||||
'".self_attn.sinks": ".attn.mla_attn.attn_sink"': '".self_attn.sinks": ".attn.attn_sink"',
|
||||
'".self_attn.kv_proj.": ".attn.mla_attn.wkv."': '".self_attn.kv_proj.": ".attn.wkv."',
|
||||
'".self_attn.kv_norm.": ".attn.mla_attn.kv_norm."': '".self_attn.kv_norm.": ".attn.kv_norm."',
|
||||
}
|
||||
for old, new in subs.items():
|
||||
c = c.replace(old, new)
|
||||
check(c, "Fix 1 (substr)")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# FIX 2: Skip patterns — only skip compressor scales
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# Remove attention and shared expert scale skip patterns
|
||||
lines_to_remove = [
|
||||
' re.compile(r"\\.self_attn\\.kv_proj\\.weight_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.q_a_proj\\.weight_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.q_b_proj\\.weight_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.o_a_proj\\.weight_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.o_b_proj\\.weight_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.kv_proj\\.weight_scale_2$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.q_a_proj\\.weight_scale_2$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.q_b_proj\\.weight_scale_2$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.o_a_proj\\.weight_scale_2$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.o_b_proj\\.weight_scale_2$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.kv_proj\\.input_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.q_a_proj\\.input_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.q_b_proj\\.input_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.o_a_proj\\.input_scale$"): None,',
|
||||
' re.compile(r"\\.self_attn\\.o_b_proj\\.input_scale$"): None,',
|
||||
' re.compile(r"\\.shared_experts\\.gate_proj\\.weight_scale$"): None,',
|
||||
' re.compile(r"\\.shared_experts\\.up_proj\\.weight_scale$"): None,',
|
||||
' re.compile(r"\\.shared_experts\\.gate_proj\\.weight_scale_2$"): None,',
|
||||
' re.compile(r"\\.shared_experts\\.up_proj\\.weight_scale_2$"): None,',
|
||||
' re.compile(r"\\.shared_experts\\.gate_proj\\.input_scale$"): None,',
|
||||
' re.compile(r"\\.shared_experts\\.up_proj\\.input_scale$"): None,',
|
||||
]
|
||||
for line in lines_to_remove:
|
||||
c = c.replace(line + "\n", "")
|
||||
c = c.replace(line, "")
|
||||
check(c, "Fix 2 (skip patterns)")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# FIX 3: Remove the 'head.weight' suffix mapping that causes
|
||||
# 'lm_head.weight' to become 'lm_lm_head.weight'
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
c = c.replace(' "head.weight": "lm_head.weight",\n', '')
|
||||
check(c, "Fix 3 (suffix)")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# FIX 4: Handle o_a_proj bf16 -> FP8 at load time
|
||||
# modelopt didn't quantize o_a_proj, but vLLM creates wo_a with NVFP4
|
||||
# Convert bf16 -> FP8 and set weight_scale_inv
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
old_else = ''' else:
|
||||
if name not in params_dict:
|
||||
# ModelOpt NVFP4 export includes params not in the
|
||||
# vllm model (e.g., compressor.position_bias).
|
||||
# Skip them silently.
|
||||
continue
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(
|
||||
param, "weight_loader", default_weight_loader
|
||||
)
|
||||
weight_loader(param, loaded_weight)
|
||||
loaded_params.add(name)
|
||||
continue'''
|
||||
|
||||
new_else = ''' else:
|
||||
if name not in params_dict:
|
||||
continue
|
||||
param = params_dict[name]
|
||||
|
||||
# Handle o_a_proj bf16 -> wo_a uint8 mismatch
|
||||
if (name.endswith(".weight")
|
||||
and loaded_weight.dtype != torch.uint8
|
||||
and param.data.dtype == torch.uint8):
|
||||
# o_a_proj was NOT quantized by modelopt (bf16, no scales)
|
||||
# Convert bf16 -> FP8 and set weight_scale_inv
|
||||
w_bf16 = loaded_weight
|
||||
w_amax = w_bf16.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=w_bf16.device)
|
||||
fp8_max = torch.finfo(torch.float8_e4m3fn).max
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_bf16 / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
# Replace the module weight and add weight_scale_inv
|
||||
parts = name.rsplit(".", 1)
|
||||
module_path = parts[0]
|
||||
mod = self
|
||||
for attr in module_path.split("."):
|
||||
if attr.isdigit():
|
||||
mod = mod[int(attr)]
|
||||
else:
|
||||
mod = getattr(mod, attr)
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
from vllm.model_executor.layers.linear import (
|
||||
UnquantizedLinearMethod,
|
||||
)
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale"):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
loaded_params.add(name)
|
||||
loaded_params.add(name.replace(".weight", ".weight_scale_inv"))
|
||||
continue
|
||||
|
||||
weight_loader = getattr(
|
||||
param, "weight_loader", default_weight_loader
|
||||
)
|
||||
weight_loader(param, loaded_weight)
|
||||
loaded_params.add(name)
|
||||
continue'''
|
||||
|
||||
c = c.replace(old_else, new_else)
|
||||
check(c, "Fix 4 (o_a_proj bf16->FP8)")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# FIX 5: Add NVFP4->FP8 post-load conversion for attention
|
||||
# This converts all uint8 NVFP4 attention weights to FP8
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
conversion_methods = '''
|
||||
def _convert_nvfp4_attention_to_fp8(self):
|
||||
E2M1_LUT = torch.tensor(
|
||||
[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
|
||||
)
|
||||
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}
|
||||
shared_expert_names = {"gate_up_proj"}
|
||||
converted = 0
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
attn = layer.attn
|
||||
for proj_name in attn_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
ffn = layer.ffn
|
||||
if hasattr(ffn, "shared_experts"):
|
||||
for proj_name in shared_expert_names:
|
||||
if not hasattr(ffn.shared_experts, proj_name):
|
||||
continue
|
||||
mod = getattr(ffn.shared_experts, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
if converted > 0:
|
||||
logger.info_once(
|
||||
"Converted %d NVFP4 attention/shared-expert layers to FP8",
|
||||
converted,
|
||||
)
|
||||
|
||||
def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):
|
||||
w_uint8 = mod.weight.data
|
||||
device = w_uint8.device
|
||||
even_idx = (w_uint8 & 0x0F).int()
|
||||
odd_idx = ((w_uint8 >> 4) & 0x0F).int()
|
||||
even_vals = e2m1_lut.to(device)[even_idx]
|
||||
odd_vals = e2m1_lut.to(device)[odd_idx]
|
||||
w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)
|
||||
w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)
|
||||
if hasattr(mod, "weight_scale") and hasattr(mod, "weight_scale_2"):
|
||||
block_scale = mod.weight_scale.data.to(torch.float32)
|
||||
if block_scale.dim() == 2 and w_bf16.dim() == 2:
|
||||
block_size = w_bf16.shape[1] // block_scale.shape[1]
|
||||
block_scale_expanded = block_scale.unsqueeze(-1).expand(
|
||||
-1, -1, block_size
|
||||
).reshape(w_bf16.shape)
|
||||
else:
|
||||
block_scale_expanded = block_scale
|
||||
global_scale = mod.weight_scale_2.data.max().item()
|
||||
input_scale = (
|
||||
mod.input_scale.data.max().item()
|
||||
if hasattr(mod, "input_scale")
|
||||
else 1.0
|
||||
)
|
||||
w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
|
||||
w_dequant = w_dequant.to(torch.bfloat16)
|
||||
else:
|
||||
w_dequant = w_bf16
|
||||
w_amax = w_dequant.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=device)
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale"):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
|
||||
'''
|
||||
|
||||
# Insert before DeepseekV4ForCausalLM class
|
||||
marker = "\n\nclass DeepseekV4ForCausalLM(nn.Module):"
|
||||
if marker in c:
|
||||
c = c.replace(marker, "\n" + conversion_methods + "\nclass DeepseekV4ForCausalLM(nn.Module):")
|
||||
print(" Fix 5: Inserted conversion methods")
|
||||
else:
|
||||
print(" Fix 5: Could not find class marker")
|
||||
|
||||
check(c, "Fix 5 (NVFP4->FP8 methods)")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# FIX 6: Call the conversion from DeepseekV4ForCausalLM.load_weights
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
old_load = " self.model.finalize_mega_moe_weights()\n return loaded_params"
|
||||
new_load = " self.model.finalize_mega_moe_weights()\n self.model._convert_nvfp4_attention_to_fp8()\n return loaded_params"
|
||||
c = c.replace(old_load, new_load)
|
||||
check(c, "Fix 6 (call conversion)")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# Final validation
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
check(c, "FINAL")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
print("All fixes applied!")
|
||||
190
tmp/apply_fixes.py
Normal file
190
tmp/apply_fixes.py
Normal file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Apply all NVFP4 serving fixes to deepseek_v4.py"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
# FIX 1: Substr mapping — remove .mla_attn. from attention projections
|
||||
# The model has fused_wqa_wkv, wq_b, wo_a, wo_b at attn.* level
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
replacements_1 = {
|
||||
'".self_attn.q_a_proj.": ".attn.mla_attn.wq_a."': '".self_attn.q_a_proj.": ".attn.wq_a."',
|
||||
'".self_attn.q_b_proj.": ".attn.mla_attn.wq_b."': '".self_attn.q_b_proj.": ".attn.wq_b."',
|
||||
'".self_attn.q_a_norm.": ".attn.mla_attn.q_norm."': '".self_attn.q_a_norm.": ".attn.q_norm."',
|
||||
'".self_attn.o_a_proj.": ".attn.mla_attn.wo_a."': '".self_attn.o_a_proj.": ".attn.wo_a."',
|
||||
'".self_attn.o_b_proj.": ".attn.mla_attn.wo_b."': '".self_attn.o_b_proj.": ".attn.wo_b."',
|
||||
'".self_attn.sinks": ".attn.mla_attn.attn_sink"': '".self_attn.sinks": ".attn.attn_sink"',
|
||||
'".self_attn.kv_proj.": ".attn.mla_attn.wkv."': '".self_attn.kv_proj.": ".attn.wkv."',
|
||||
'".self_attn.kv_norm.": ".attn.mla_attn.kv_norm."': '".self_attn.kv_norm.": ".attn.kv_norm."',
|
||||
}
|
||||
for old, new in replacements_1.items():
|
||||
if old in c:
|
||||
c = c.replace(old, new)
|
||||
print(f" Fixed: {old[:50]}... → {new[:50]}...")
|
||||
else:
|
||||
print(f" NOT FOUND: {old[:60]}...")
|
||||
|
||||
# Update comment
|
||||
c = c.replace(
|
||||
'# Attention: self_attn → attn.mla_attn',
|
||||
'# Attention: self_attn → attn (projections at attn level, not mla_attn)'
|
||||
)
|
||||
print("FIX 1 applied: substr mappings updated\n")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
# FIX 2: Skip patterns — only skip compressor scale tensors
|
||||
# Attention and shared expert scale tensors now correctly load
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
old_skip_block = ''' fused_skip_regex = {
|
||||
# Compressor projections → fused_wkv_wgate (stacked)
|
||||
re.compile(r"\\.compressor\\.kv_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.compressor\\.gate_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.compressor\\.kv_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.compressor\\.gate_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.compressor\\.kv_proj\\.input_scale$"): None,
|
||||
re.compile(r"\\.compressor\\.gate_proj\\.input_scale$"): None,
|
||||
# Attention projections → fused_wqa_wkv (stacked)
|
||||
re.compile(r"\\.self_attn\\.kv_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.q_a_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.q_b_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.o_a_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.o_b_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.kv_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.self_attn\\.q_a_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.self_attn\\.q_b_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.self_attn\\.o_a_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.self_attn\\.o_b_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.self_attn\\.kv_proj\\.input_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.q_a_proj\\.input_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.q_b_proj\\.input_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.o_a_proj\\.input_scale$"): None,
|
||||
re.compile(r"\\.self_attn\\.o_b_proj\\.input_scale$"): None,
|
||||
# Shared expert gate_proj/up_proj → gate_up_proj (stacked)
|
||||
re.compile(r"\\.shared_experts\\.gate_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.shared_experts\\.up_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.shared_experts\\.gate_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.shared_experts\\.up_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.shared_experts\\.gate_proj\\.input_scale$"): None,
|
||||
re.compile(r"\\.shared_experts\\.up_proj\\.input_scale$"): None,
|
||||
}'''
|
||||
|
||||
new_skip_block = ''' fused_skip_regex = {
|
||||
# Compressor projections → fused_wkv_wgate (stacked)
|
||||
# Compressor uses UnquantizedLinearMethod (quant_config=None),
|
||||
# so it only has a bf16 weight param — no scale params registered.
|
||||
# We unpack the NVFP4 uint8 weights to bf16 at load time.
|
||||
re.compile(r"\\.compressor\\.kv_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.compressor\\.gate_proj\\.weight_scale$"): None,
|
||||
re.compile(r"\\.compressor\\.kv_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.compressor\\.gate_proj\\.weight_scale_2$"): None,
|
||||
re.compile(r"\\.compressor\\.kv_proj\\.input_scale$"): None,
|
||||
re.compile(r"\\.compressor\\.gate_proj\\.input_scale$"): None,
|
||||
# Note: attention and shared expert scale tensors are NO LONGER
|
||||
# skipped. After fixing substr mappings, they correctly map to the
|
||||
# model's NVFP4 scale parameters (fused_wqa_wkv, wq_b, wo_a,
|
||||
# wo_b, gate_up_proj). They load via the stacking logic.
|
||||
}'''
|
||||
|
||||
if old_skip_block in c:
|
||||
c = c.replace(old_skip_block, new_skip_block)
|
||||
print("FIX 2 applied: skip patterns updated (only compressor scales skipped)\n")
|
||||
else:
|
||||
print("FIX 2: Could not find old skip block, searching for alternatives...")
|
||||
# Try a more flexible search
|
||||
import re
|
||||
# Find the fused_skip_regex block
|
||||
m = re.search(r' fused_skip_regex = \{[^}]+\}', c)
|
||||
if m:
|
||||
print(f" Found block at position {m.start()}")
|
||||
else:
|
||||
print(" Could not find fused_skip_regex block!")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
# FIX 3: Handle o_a_proj bf16 → wo_a uint8 mismatch
|
||||
# modelopt didn't quantize o_a_proj (bf16, no scales).
|
||||
# When loading bf16 into uint8, replace the layer's quant_method
|
||||
# with UnquantizedLinearMethod so it runs in bf16.
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
old_else_block = ''' else:
|
||||
if name not in params_dict:
|
||||
# ModelOpt NVFP4 export includes params not in the
|
||||
# vllm model (e.g., compressor.position_bias).
|
||||
# Skip them silently.
|
||||
continue
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(
|
||||
param, "weight_loader", default_weight_loader
|
||||
)
|
||||
weight_loader(param, loaded_weight)
|
||||
loaded_params.add(name)
|
||||
continue'''
|
||||
|
||||
new_else_block = ''' else:
|
||||
if name not in params_dict:
|
||||
# ModelOpt NVFP4 export includes params not in the
|
||||
# vllm model (e.g., compressor.position_bias).
|
||||
# Skip them silently.
|
||||
continue
|
||||
param = params_dict[name]
|
||||
|
||||
# Handle bf16 → uint8 mismatch for o_a_proj:
|
||||
# modelopt didn't quantize o_a_proj (bf16, no scales),
|
||||
# but ModelOptNvFp4Config creates wo_a with NVFP4 quant
|
||||
# (uint8 weight + scales). When loading bf16 into uint8,
|
||||
# we replace the quant method with UnquantizedLinearMethod
|
||||
# so the layer runs in bf16 at inference.
|
||||
if (name.endswith(".weight")
|
||||
and loaded_weight.dtype != torch.uint8
|
||||
and param.data.dtype == torch.uint8):
|
||||
# Replace this layer's quant method with unquantized
|
||||
from vllm.model_executor.layers.linear import (
|
||||
UnquantizedLinearMethod,
|
||||
)
|
||||
parts = name.rsplit(".", 1)
|
||||
module_path = parts[0] # e.g., layers.0.attn.wo_a
|
||||
# Find the module and override its quant method
|
||||
mod = self
|
||||
for attr in module_path.split("."):
|
||||
if attr.isdigit():
|
||||
mod = mod[int(attr)]
|
||||
else:
|
||||
mod = getattr(mod, attr)
|
||||
if hasattr(mod, 'quant_method'):
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
# Replace the uint8 weight param with bf16
|
||||
new_shape = list(loaded_weight.shape)
|
||||
new_param = torch.nn.Parameter(
|
||||
loaded_weight.clone(), requires_grad=False
|
||||
)
|
||||
mod.weight = new_param
|
||||
# Remove scale params (they'll stay at init values,
|
||||
# but the UnquantizedLinearMethod won't use them)
|
||||
loaded_params.add(name)
|
||||
continue
|
||||
|
||||
weight_loader = getattr(
|
||||
param, "weight_loader", default_weight_loader
|
||||
)
|
||||
weight_loader(param, loaded_weight)
|
||||
loaded_params.add(name)
|
||||
continue'''
|
||||
|
||||
if old_else_block in c:
|
||||
c = c.replace(old_else_block, new_else_block)
|
||||
print("FIX 3 applied: bf16→uint8 mismatch handling for o_a_proj\n")
|
||||
else:
|
||||
print("FIX 3: Could not find exact else block, trying flexible match...")
|
||||
import re
|
||||
m = re.search(r'(\s+else:\n\s+if name not in params_dict:.*?continue\n\s+continue)', c, re.DOTALL)
|
||||
if m:
|
||||
print(f" Found block at position {m.start()}")
|
||||
else:
|
||||
print(" Could not find else block!")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
print("\nAll fixes written to", filepath)
|
||||
46
tmp/fix10_quant_method.py
Normal file
46
tmp/fix10_quant_method.py
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fix the FP8 conversion to use a simple no-op quant method for attention layers."""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# Replace all instances of Fp8LinearMethod/Fp8MMQuantMethod imports and usage
|
||||
# with a simpler approach: just set quant_method to None and handle it
|
||||
|
||||
# In _convert_nvfp4_module_to_fp8
|
||||
old_fp8_convert = ''' # Switch quant method to FP8 linear
|
||||
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
Fp8MMQuantMethod,
|
||||
)
|
||||
mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())'''
|
||||
|
||||
new_fp8_convert = ''' # Switch quant method to a no-op. The attention forward uses
|
||||
# deepseek_v4_fp8_einsum directly (not the quant method), so the
|
||||
# quant method is irrelevant. We just need process_weights_after_loading
|
||||
# to not crash. Using UnquantizedLinearMethod as a safe no-op.
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
mod.quant_method = UnquantizedLinearMethod()'''
|
||||
|
||||
c = c.replace(old_fp8_convert, new_fp8_convert)
|
||||
|
||||
# In the bf16->uint8 handler (o_a_proj case)
|
||||
old_oa_fp8 = ''' # Switch quant method to FP8 linear
|
||||
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
Fp8MMQuantMethod,
|
||||
)
|
||||
mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())'''
|
||||
|
||||
new_oa_fp8 = ''' # Switch quant method to no-op (attention forward bypasses it)
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
mod.quant_method = UnquantizedLinearMethod()'''
|
||||
|
||||
c = c.replace(old_oa_fp8, new_oa_fp8)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
print("Replaced Fp8LinearMethod with UnquantizedLinearMethod for attention")
|
||||
162
tmp/fix5_nvfp4.py
Normal file
162
tmp/fix5_nvfp4.py
Normal file
@@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fix the bf16→uint8 handler to properly quantize to NVFP4 instead of switching to UnquantizedLinearMethod"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
old_handler = ''' # Handle bf16 → uint8 mismatch for o_a_proj:
|
||||
# modelopt didn't quantize o_a_proj (bf16, no scales),
|
||||
# but ModelOptNvFp4Config creates wo_a with NVFP4 quant
|
||||
# (uint8 weight + scales). When loading bf16 into uint8,
|
||||
# we replace the quant method with UnquantizedLinearMethod
|
||||
# so the layer runs in bf16 at inference.
|
||||
if (name.endswith(".weight")
|
||||
and loaded_weight.dtype != torch.uint8
|
||||
and param.data.dtype == torch.uint8):
|
||||
# Replace this layer's quant method with unquantized
|
||||
from vllm.model_executor.layers.linear import (
|
||||
UnquantizedLinearMethod,
|
||||
)
|
||||
parts = name.rsplit(".", 1)
|
||||
module_path = parts[0] # e.g., layers.0.attn.wo_a
|
||||
# Find the module and override its quant method
|
||||
mod = self
|
||||
for attr in module_path.split("."):
|
||||
if attr.isdigit():
|
||||
mod = mod[int(attr)]
|
||||
else:
|
||||
mod = getattr(mod, attr)
|
||||
if hasattr(mod, 'quant_method'):
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
# Replace the uint8 weight param with bf16
|
||||
new_param = torch.nn.Parameter(
|
||||
loaded_weight.clone(), requires_grad=False
|
||||
)
|
||||
mod.weight = new_param
|
||||
# Set weight_scale_inv = 1.0 (required by
|
||||
# DeepseekV4MLAModules forward pass which
|
||||
# reads wo_a.weight_scale_inv directly)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
torch.tensor(1.0, dtype=torch.float32),
|
||||
requires_grad=False,
|
||||
)
|
||||
# Also set input_scale to prevent missing attr errors
|
||||
if hasattr(mod, 'input_scale'):
|
||||
mod.input_scale = torch.nn.Parameter(
|
||||
torch.tensor(1.0, dtype=torch.float32),
|
||||
requires_grad=False,
|
||||
)
|
||||
loaded_params.add(name)
|
||||
loaded_params.add(name.replace('.weight', '.weight_scale_inv'))
|
||||
continue'''
|
||||
|
||||
new_handler = ''' # Handle bf16 → uint8 mismatch for o_a_proj:
|
||||
# modelopt didn't quantize o_a_proj (bf16, no scales),
|
||||
# but ModelOptNvFp4Config creates wo_a with NVFP4 quant
|
||||
# (uint8 weight + scales). We quantize the bf16 weight
|
||||
# to NVFP4 at load time so the layer runs in NVFP4 path.
|
||||
if (name.endswith(".weight")
|
||||
and loaded_weight.dtype != torch.uint8
|
||||
and param.data.dtype == torch.uint8):
|
||||
# Quantize bf16 → NVFP4 (E2M1 packed uint8 + scales)
|
||||
w_bf16 = loaded_weight
|
||||
out_dim, in_dim = w_bf16.shape
|
||||
block_size = 16
|
||||
assert in_dim % block_size == 0
|
||||
n_blocks = in_dim // block_size
|
||||
|
||||
# Reshape into blocks
|
||||
w_blocks = w_bf16.reshape(out_dim, n_blocks, block_size)
|
||||
|
||||
# Compute per-block amax
|
||||
amax = w_blocks.abs().amax(dim=-1) # [out, n_blocks]
|
||||
|
||||
# Global scale (weight_scale_2): max amax / (6.0 * 448.0)
|
||||
global_amax = amax.max()
|
||||
# Use 448.0 as the max e4m3 value for scale computation
|
||||
weight_scale_2_val = global_amax / (6.0 * 448.0)
|
||||
weight_scale_2 = weight_scale_2_val.to(torch.float32)
|
||||
|
||||
# Per-block scale (weight_scale): fp8 e4m3
|
||||
# block_scale = amax / (6.0 * weight_scale_2)
|
||||
block_scale = amax / (6.0 * weight_scale_2_val)
|
||||
# Clamp to fp8 e4m3 range and cast
|
||||
block_scale = block_scale.clamp(min=0, max=448.0)
|
||||
weight_scale = block_scale.to(torch.float8_e4m3fn)
|
||||
|
||||
# Quantize to FP4 (E2M1)
|
||||
# E2M1 LUT: 0, 0.5, 1, 1.5, 2, 3, 4, 6 (positive)
|
||||
FP4_POS = torch.tensor(
|
||||
[0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0],
|
||||
dtype=torch.float32, device=w_bf16.device,
|
||||
)
|
||||
# For each block, dequantize the block scale from fp8
|
||||
block_scale_f32 = weight_scale.to(torch.float32)
|
||||
# Scale the weight values: normalized = w / (block_scale * weight_scale_2)
|
||||
# We need to find the nearest FP4 value
|
||||
scaled = w_blocks / (block_scale_f32.unsqueeze(-1) * weight_scale_2_val)
|
||||
# Find nearest FP4 index (0-7 for magnitude)
|
||||
# Use absolute value for matching, then apply sign
|
||||
scaled_abs = scaled.abs()
|
||||
# Find closest FP4 value
|
||||
diff = (scaled_abs.unsqueeze(-1) - FP4_POS).abs()
|
||||
fp4_idx = diff.argmin(dim=-1) # [out, n_blocks, block_size]
|
||||
# Apply sign: negative values get bit 3 set
|
||||
sign = (scaled < 0).int()
|
||||
fp4_val = (sign << 3) | fp4_idx.int()
|
||||
# Pack: 2 FP4 values per uint8 byte
|
||||
# Even positions → lower nibble, Odd → upper nibble
|
||||
fp4_flat = fp4_val.reshape(out_dim, -1) # [out, in_dim]
|
||||
assert fp4_flat.shape[1] % 2 == 0
|
||||
even = fp4_flat[:, 0::2] # lower nibble
|
||||
odd = fp4_flat[:, 1::2] # upper nibble
|
||||
packed = (odd << 4) | even
|
||||
weight_packed = packed.to(torch.uint8)
|
||||
|
||||
# Reshape weight_scale to [out, n_blocks]
|
||||
weight_scale_2d = weight_scale.reshape(out_dim, n_blocks)
|
||||
|
||||
# Load the quantized weight into the uint8 param
|
||||
weight_loader = param.weight_loader
|
||||
weight_loader(param, weight_packed)
|
||||
loaded_params.add(name)
|
||||
|
||||
# Load scales into sibling params
|
||||
base = name.rsplit(".", 1)[0]
|
||||
# weight_scale
|
||||
ws_name = f"{base}.weight_scale"
|
||||
if ws_name in params_dict:
|
||||
ws_param = params_dict[ws_name]
|
||||
ws_loader = getattr(ws_param, "weight_loader", default_weight_loader)
|
||||
ws_loader(ws_param, weight_scale_2d)
|
||||
loaded_params.add(ws_name)
|
||||
# weight_scale_2
|
||||
ws2_name = f"{base}.weight_scale_2"
|
||||
if ws2_name in params_dict:
|
||||
ws2_param = params_dict[ws2_name]
|
||||
ws2_loader = getattr(ws2_param, "weight_loader", default_weight_loader)
|
||||
ws2_loader(ws2_param, weight_scale_2.reshape(1))
|
||||
loaded_params.add(ws2_name)
|
||||
# input_scale: use 1.0 default (dynamic quant)
|
||||
is_name = f"{base}.input_scale"
|
||||
if is_name in params_dict:
|
||||
is_param = params_dict[is_name]
|
||||
is_loader = getattr(is_param, "weight_loader", default_weight_loader)
|
||||
is_loader(is_param, torch.tensor(1.0, dtype=torch.float32))
|
||||
loaded_params.add(is_name)
|
||||
continue'''
|
||||
|
||||
if old_handler in c:
|
||||
c = c.replace(old_handler, new_handler)
|
||||
print('FIX 5 applied: Replaced UnquantizedLinearMethod with proper NVFP4 quantization')
|
||||
else:
|
||||
print('FIX 5: Could not find exact handler block, trying flexible match...')
|
||||
if 'UnquantizedLinearMethod' in c:
|
||||
print(' Found UnquantizedLinearMethod in code - manual fix needed')
|
||||
else:
|
||||
print(' UnquantizedLinearMethod not found - already replaced?')
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
81
tmp/fix6_nvfp4_to_fp8.py
Normal file
81
tmp/fix6_nvfp4_to_fp8.py
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Replace the current NVFP4 weight handling with a comprehensive
|
||||
NVFP4→bf16→FP8 re-quantization path for attention layers.
|
||||
|
||||
The vLLM DeepSeekV4 attention forward code uses deepseek_v4_fp8_einsum
|
||||
which requires FP8 weights + weight_scale_inv. NVFP4 weights (uint8 packed
|
||||
FP4 + per-block scales + per-tensor scales) are incompatible with this kernel.
|
||||
|
||||
Solution: At load time, dequantize all NVFP4 attention weights to bf16,
|
||||
then re-quantize to FP8. Store the FP8 weights + weight_scale_inv.
|
||||
The existing FP8 attention forward code then works without modification.
|
||||
|
||||
For compressor fused_wkv_wgate: stays bf16 (UnquantizedLinearMethod).
|
||||
For MoE experts: handled by ModelOptNvFp4FusedMoE natively.
|
||||
For shared experts gate_up_proj: also needs FP8 conversion.
|
||||
"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
content = f.read()
|
||||
|
||||
# ============================================================
|
||||
# Helper function: add the NVFP4→FP8 conversion utility
|
||||
# at the top of the load_weights method
|
||||
# ============================================================
|
||||
|
||||
old_load_weights_start = ''' def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||
# Define E2M1 FP4 → BF16 lookup table for unpacking
|
||||
E2M1_LUT = torch.tensor(
|
||||
[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
|
||||
)'''
|
||||
|
||||
new_load_weights_start = ''' def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||
# Define E2M1 FP4 → BF16 lookup table for unpacking
|
||||
E2M1_LUT = torch.tensor(
|
||||
[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# ── NVFP4 → FP8 re-quantization helper ──────────────────
|
||||
# The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
|
||||
# which requires FP8 weights + weight_scale_inv. ModelOpt NVFP4
|
||||
# stores uint8 packed FP4 + per-block/per-tensor scales.
|
||||
# We dequantize NVFP4→bf16, then re-quantize to FP8 at load time.
|
||||
def _nvfp4_to_fp8(w_uint8, w_scale, w_scale_2, w_input_scale):
|
||||
"""Convert NVFP4 weight + scales to FP8 weight + weight_scale_inv."""
|
||||
# Unpack uint8 → E2M1 FP4 → bf16
|
||||
even = (w_uint8 & 0x0F).int()
|
||||
odd = ((w_uint8 >> 4) & 0x0F).int()
|
||||
# Interleave back
|
||||
bf16_even = E2M1_LUT.to(w_uint8.device)[even]
|
||||
bf16_odd = E2M1_LUT.to(w_uint8.device)[odd]
|
||||
# Stack along last dim and flatten
|
||||
w_bf16 = torch.stack([bf16_even, bf16_odd], dim=-1)
|
||||
w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1) # [out, in_dim]
|
||||
|
||||
# Dequantize: bf16_val = fp4 * block_scale * global_scale * input_scale
|
||||
if w_scale.dim() == 2:
|
||||
block_scale = w_scale.to(torch.float32).unsqueeze(-1) # [out, blocks, 1]
|
||||
w_bf16_scaled = w_bf16.float() * block_scale.reshape(
|
||||
w_bf16.shape[0], -1) * w_scale_2.item() * w_input_scale.item()
|
||||
else:
|
||||
w_bf16_scaled = w_bf16.float() * w_scale_2.item() * w_input_scale.item()
|
||||
w_bf16_scaled = w_bf16_scaled.to(torch.bfloat16)
|
||||
|
||||
# Re-quantize bf16 → FP8 e4m3
|
||||
w_amax = w_bf16_scaled.abs().amax()
|
||||
fp8_scale = w_amax / torch.finfo(torch.float8_e4m3fn).max
|
||||
w_fp8 = (w_bf16_scaled / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
return w_fp8, weight_scale_inv
|
||||
# ── End helper ──────────────────────────────────────────
|
||||
'''
|
||||
|
||||
content = content.replace(old_load_weights_start, new_load_weights_start)
|
||||
print("Added NVFP4→FP8 helper function")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(content)
|
||||
print("Written to file")
|
||||
88
tmp/fix7_stacked.py
Normal file
88
tmp/fix7_stacked.py
Normal file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Update the load_weights method to use NVFP4→FP8 conversion for attention layers.
|
||||
|
||||
Key changes:
|
||||
1. Stacked params (fused_wqa_wkv): when uint8, dequantize to bf16, re-quantize to FP8,
|
||||
then load as FP8 weight + weight_scale_inv
|
||||
2. Non-stacked params (wq_b, wo_a, wo_b, gate_up_proj): same treatment
|
||||
3. Compressor fused_wkv_wgate: stays as bf16 (E2M1 unpack only)
|
||||
4. Remove the separate bf16→uint8 handler (no longer needed since we go to FP8)
|
||||
"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
content = f.read()
|
||||
|
||||
# ============================================================
|
||||
# Replace the stacked params loading block
|
||||
# The current code unpacks uint8→bf16 for MergedColumnParallelLinear
|
||||
# We need to instead convert NVFP4→FP8 for attention/shared expert
|
||||
# and unpack→bf16 for compressor
|
||||
# ============================================================
|
||||
|
||||
old_stacked_unpack = ''' # ModelOpt NVFP4 packed weight fix for MergedColumnParallelLinear.
|
||||
#
|
||||
# modelopt exports NVFP4 packed weights as uint8 (2 values/byte
|
||||
# along the column dim). But MergedColumnParallelLinear creates
|
||||
# weight as bf16 (not PackedColumnParameter uint8) because
|
||||
# ModelOptNvFp4Config only handles Linear, not
|
||||
# MergedColumnParallelLinear.
|
||||
#
|
||||
# For compressor fused_wkv_wgate (quant_config=None →
|
||||
# UnquantizedLinearMethod → bf16 weight):
|
||||
# Unpack uint8→bf16 using E2M1 LUT, load into bf16 param.
|
||||
#
|
||||
# For fused_wqa_wkv (NVFP4 quant method → uint8 weight):
|
||||
# The weight param IS uint8, so no unpacking needed.
|
||||
# Just load the packed uint8 weight directly.
|
||||
# Scales are loaded separately (no longer skipped).
|
||||
if (loaded_weight.dtype == torch.uint8
|
||||
and param.data.dtype != torch.uint8
|
||||
and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
|
||||
# Unpack NVFP4 (E2M1) → BF16
|
||||
# E2M1 LUT: 0→0, 1→0.5, 2→1, 3→1.5, 4→2, 5→3, 6→4, 7→6
|
||||
even_idx = (loaded_weight & 0x0F).int()
|
||||
odd_idx = ((loaded_weight >> 4) & 0x0F).int()
|
||||
even_vals = E2M1_LUT[even_idx]
|
||||
odd_vals = E2M1_LUT[odd_idx]
|
||||
# Interleave even and odd along the last dim
|
||||
out = torch.stack([even_vals, odd_vals], dim=-1)
|
||||
out = out.reshape(
|
||||
loaded_weight.shape[0], -1
|
||||
).to(torch.bfloat16)
|
||||
loaded_weight = out'''
|
||||
|
||||
new_stacked_unpack = ''' # ModelOpt NVFP4 weight handling for stacked params.
|
||||
#
|
||||
# The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
|
||||
# which requires FP8 weights + weight_scale_inv. NVFP4 weights are
|
||||
# incompatible. We convert NVFP4→bf16→FP8 at load time.
|
||||
#
|
||||
# For compressor fused_wkv_wgate (UnquantizedLinearMethod → bf16):
|
||||
# Just unpack uint8→bf16 and load into bf16 param.
|
||||
#
|
||||
# For fused_wqa_wkv and gate_up_proj (NVFP4 quant → uint8):
|
||||
# Collect the uint8 weight + scales, then convert to FP8
|
||||
# using the _nvfp4_to_fp8 helper after all sub-weights load.
|
||||
if (loaded_weight.dtype == torch.uint8
|
||||
and param.data.dtype != torch.uint8
|
||||
and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
|
||||
# Compressor path: unpack uint8→bf16, load into bf16 param
|
||||
even_idx = (loaded_weight & 0x0F).int()
|
||||
odd_idx = ((loaded_weight >> 4) & 0x0F).int()
|
||||
even_vals = E2M1_LUT[even_idx]
|
||||
odd_vals = E2M1_LUT[odd_idx]
|
||||
out = torch.stack([even_vals, odd_vals], dim=-1)
|
||||
out = out.reshape(
|
||||
loaded_weight.shape[0], -1
|
||||
).to(torch.bfloat16)
|
||||
loaded_weight = out'''
|
||||
|
||||
content = content.replace(old_stacked_unpack, new_stacked_unpack)
|
||||
print("Updated stacked params unpack block")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(content)
|
||||
print("Written to file")
|
||||
134
tmp/fix8_final.py
Normal file
134
tmp/fix8_final.py
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Add NVFP4->FP8 conversion methods to deepseek_v4.py"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# 1. Add conversion methods to DeepseekV4Model
|
||||
old_finalize = ' return loaded_params\n\n\nclass DeepseekV4ForCausalLM(nn.Module):'
|
||||
|
||||
new_finalize = ''' return loaded_params
|
||||
|
||||
def _convert_nvfp4_attention_to_fp8(self):
|
||||
"""Convert NVFP4 attention weights to FP8 format.
|
||||
|
||||
The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
|
||||
which requires FP8 weights + weight_scale_inv. NVFP4 weights are
|
||||
incompatible. We dequantize NVFP4->bf16, then re-quantize to FP8.
|
||||
"""
|
||||
E2M1_LUT = torch.tensor(
|
||||
[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
|
||||
)
|
||||
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
|
||||
attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}
|
||||
shared_expert_names = {"gate_up_proj"}
|
||||
|
||||
converted = 0
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
attn = layer.attn
|
||||
for proj_name in attn_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, 'weight') or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
|
||||
ffn = layer.ffn
|
||||
if hasattr(ffn, 'shared_experts'):
|
||||
for proj_name in shared_expert_names:
|
||||
if not hasattr(ffn.shared_experts, proj_name):
|
||||
continue
|
||||
mod = getattr(ffn.shared_experts, proj_name)
|
||||
if not hasattr(mod, 'weight') or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
|
||||
if converted > 0:
|
||||
logger.info_once(
|
||||
"Converted %d NVFP4 attention/shared-expert layers to FP8",
|
||||
converted,
|
||||
)
|
||||
|
||||
def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):
|
||||
"""Convert a single NVFP4 Linear module to FP8 format."""
|
||||
w_uint8 = mod.weight.data
|
||||
device = w_uint8.device
|
||||
|
||||
# Unpack uint8 -> E2M1 FP4 -> bf16
|
||||
even_idx = (w_uint8 & 0x0F).int()
|
||||
odd_idx = ((w_uint8 >> 4) & 0x0F).int()
|
||||
even_vals = e2m1_lut.to(device)[even_idx]
|
||||
odd_vals = e2m1_lut.to(device)[odd_idx]
|
||||
w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)
|
||||
w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)
|
||||
|
||||
# Dequantize: bf16 = fp4 * block_scale * global_scale * input_scale
|
||||
if hasattr(mod, 'weight_scale') and hasattr(mod, 'weight_scale_2'):
|
||||
block_scale = mod.weight_scale.data.to(torch.float32)
|
||||
if block_scale.dim() == 2 and w_bf16.dim() == 2:
|
||||
block_size = w_bf16.shape[1] // block_scale.shape[1]
|
||||
block_scale_expanded = block_scale.unsqueeze(-1).expand(
|
||||
-1, -1, block_size
|
||||
).reshape(w_bf16.shape)
|
||||
else:
|
||||
block_scale_expanded = block_scale
|
||||
global_scale = mod.weight_scale_2.data.max().item()
|
||||
input_scale = mod.input_scale.data.max().item() if hasattr(mod, 'input_scale') else 1.0
|
||||
w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
|
||||
w_dequant = w_dequant.to(torch.bfloat16)
|
||||
else:
|
||||
w_dequant = w_bf16
|
||||
|
||||
# Re-quantize bf16 -> FP8 e4m3
|
||||
w_amax = w_dequant.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=device)
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
|
||||
# Replace weight param
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
|
||||
# Switch quant method to FP8 linear
|
||||
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
Fp8MMQuantMethod,
|
||||
)
|
||||
mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())
|
||||
|
||||
# Clean up NVFP4 params
|
||||
for attr in ('weight_scale', 'weight_scale_2', 'input_scale'):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
|
||||
|
||||
class DeepseekV4ForCausalLM(nn.Module):'''
|
||||
|
||||
c = c.replace(old_finalize, new_finalize)
|
||||
|
||||
# 2. Call it from DeepseekV4ForCausalLM.load_weights
|
||||
old_causal = ''' self.model.finalize_mega_moe_weights()
|
||||
return loaded_params'''
|
||||
|
||||
new_causal = ''' self.model.finalize_mega_moe_weights()
|
||||
# Convert NVFP4 attention weights to FP8 for compatibility with
|
||||
# the deepseek_v4_fp8_einsum kernel used in the attention forward
|
||||
self.model._convert_nvfp4_attention_to_fp8()
|
||||
return loaded_params'''
|
||||
|
||||
c = c.replace(old_causal, new_causal)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
print("Applied NVFP4->FP8 conversion methods")
|
||||
68
tmp/fix9_oa.py
Normal file
68
tmp/fix9_oa.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Update the bf16->uint8 handler to convert bf16->FP8 directly"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# The bf16->uint8 handler needs to convert bf16 weight directly to FP8
|
||||
# since o_a_proj was NOT quantized by modelopt
|
||||
# Replace the entire handler block
|
||||
|
||||
old_handler = ''' if (name.endswith(".weight")
|
||||
and loaded_weight.dtype != torch.uint8
|
||||
and param.data.dtype == torch.uint8):
|
||||
# Quantize bf16 → NVFP4 (E2M1 packed uint8 + scales)'''
|
||||
|
||||
new_handler = ''' if (name.endswith(".weight")
|
||||
and loaded_weight.dtype != torch.uint8
|
||||
and param.data.dtype == torch.uint8):
|
||||
# o_a_proj was NOT quantized by modelopt (bf16, no scales)
|
||||
# Convert bf16 → FP8 directly, set weight_scale_inv
|
||||
w_bf16 = loaded_weight
|
||||
w_amax = w_bf16.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=w_bf16.device)
|
||||
fp8_max = torch.finfo(torch.float8_e4m3fn).max
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_bf16 / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
|
||||
# Load FP8 weight directly (bypass the uint8 param)
|
||||
# Find the module and replace weight + quant method
|
||||
parts = name.rsplit(".", 1)
|
||||
module_path = parts[0]
|
||||
mod = self
|
||||
for attr in module_path.split("."):
|
||||
if attr.isdigit():
|
||||
mod = mod[int(attr)]
|
||||
else:
|
||||
mod = getattr(mod, attr)
|
||||
# Replace weight param with FP8 version
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
# Switch quant method to FP8 linear
|
||||
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
Fp8MMQuantMethod,
|
||||
)
|
||||
mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())
|
||||
# Clean up NVFP4 params
|
||||
for attr in ('weight_scale', 'weight_scale_2', 'input_scale'):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
loaded_params.add(name)
|
||||
loaded_params.add(name.replace('.weight', '.weight_scale_inv'))
|
||||
continue
|
||||
|
||||
# OLD: Quantize bf16 -> NVFP4 (E2M1 packed uint8 + scales)'''
|
||||
|
||||
c = c.replace(old_handler, new_handler)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
print("Updated bf16->uint8 handler to convert to FP8 directly")
|
||||
132
tmp/fix_class_placement.py
Normal file
132
tmp/fix_class_placement.py
Normal file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/python3
|
||||
"""Fix: move _convert_nvfp4 methods INSIDE DeepseekV4Model class (before hc_head)"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# 1. Remove the wrongly placed methods (currently after hc_head, at top level)
|
||||
# Find the second occurrence of _convert_nvfp4_attention_to_fp8 (the wrongly placed one)
|
||||
idx = c.find(" def _convert_nvfp4_attention_to_fp8(self):\n")
|
||||
if idx > 0:
|
||||
# Find the end of the methods (before class DeepseekV4ForCausalLM)
|
||||
end_marker = "\n\nclass DeepseekV4ForCausalLM(nn.Module):"
|
||||
end_idx = c.find(end_marker, idx)
|
||||
if end_idx > 0:
|
||||
c = c[:idx] + c[end_idx:]
|
||||
print("Removed wrongly placed methods")
|
||||
else:
|
||||
print("Could not find end marker")
|
||||
else:
|
||||
print("No wrongly placed methods found")
|
||||
|
||||
# 2. Insert the methods inside DeepseekV4Model, right after finalize_mega_moe_weights
|
||||
insert_after = "def finalize_mega_moe_weights(self) -> None:\n for layer in islice(self.layers, self.start_layer, self.end_layer):\n layer.ffn.finalize_mega_moe_moe_weights()\n"
|
||||
# Try a simpler approach: find the end of finalize_mega_moe_weights
|
||||
marker = " layer.ffn.finalize_mega_moe_weights()\n\n\n@torch.compile"
|
||||
if marker in c:
|
||||
methods = ''' layer.ffn.finalize_mega_moe_weights()
|
||||
|
||||
def _convert_nvfp4_attention_to_fp8(self):
|
||||
E2M1_LUT = torch.tensor(
|
||||
[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
|
||||
)
|
||||
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}
|
||||
shared_expert_names = {"gate_up_proj"}
|
||||
converted = 0
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
attn = layer.attn
|
||||
for proj_name in attn_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
ffn = layer.ffn
|
||||
if hasattr(ffn, "shared_experts"):
|
||||
for proj_name in shared_expert_names:
|
||||
if not hasattr(ffn.shared_experts, proj_name):
|
||||
continue
|
||||
mod = getattr(ffn.shared_experts, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
if converted > 0:
|
||||
logger.info_once(
|
||||
"Converted %d NVFP4 attention/shared-expert layers to FP8",
|
||||
converted,
|
||||
)
|
||||
|
||||
def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):
|
||||
w_uint8 = mod.weight.data
|
||||
device = w_uint8.device
|
||||
even_idx = (w_uint8 & 0x0F).int()
|
||||
odd_idx = ((w_uint8 >> 4) & 0x0F).int()
|
||||
even_vals = e2m1_lut.to(device)[even_idx]
|
||||
odd_vals = e2m1_lut.to(device)[odd_idx]
|
||||
w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)
|
||||
w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)
|
||||
if hasattr(mod, "weight_scale") and hasattr(mod, "weight_scale_2"):
|
||||
block_scale = mod.weight_scale.data.to(torch.float32)
|
||||
if block_scale.dim() == 2 and w_bf16.dim() == 2:
|
||||
block_size = w_bf16.shape[1] // block_scale.shape[1]
|
||||
block_scale_expanded = block_scale.unsqueeze(-1).expand(
|
||||
-1, -1, block_size
|
||||
).reshape(w_bf16.shape)
|
||||
else:
|
||||
block_scale_expanded = block_scale
|
||||
global_scale = mod.weight_scale_2.data.max().item()
|
||||
input_scale = (
|
||||
mod.input_scale.data.max().item()
|
||||
if hasattr(mod, "input_scale")
|
||||
else 1.0
|
||||
)
|
||||
w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
|
||||
w_dequant = w_dequant.to(torch.bfloat16)
|
||||
else:
|
||||
w_dequant = w_bf16
|
||||
w_amax = w_dequant.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=device)
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale"):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
|
||||
|
||||
@torch.compile'''
|
||||
c = c.replace(marker, methods)
|
||||
print("Inserted methods inside DeepseekV4Model class")
|
||||
else:
|
||||
print("Could not find insertion marker!")
|
||||
# Try alternate
|
||||
alt = " layer.ffn.finalize_mega_moe_weights()\n\n\n@torch.compile"
|
||||
if alt in c:
|
||||
print("Found alternate marker")
|
||||
else:
|
||||
# Just search for finalize_mega_moe_weights
|
||||
idx = c.find("finalize_mega_moe_weights()")
|
||||
print(f"Found finalize at position {idx}")
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(c)
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error at line {e.lineno}: {e.msg}")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
223
tmp/fix_clean_conversion.py
Normal file
223
tmp/fix_clean_conversion.py
Normal file
@@ -0,0 +1,223 @@
|
||||
#!/usr/bin/python3
|
||||
"""
|
||||
Clean rewrite of the NVFP4→FP8/bf16 conversion.
|
||||
|
||||
Strategy:
|
||||
- wo_a, fused_wqa_wkv → FP8 (used with fp8_einsum, need weight_scale_inv)
|
||||
- wq_b, wo_b, gate_up_proj → bf16 (used via .forward(), just works)
|
||||
- compressor fused_wkv_wgate → bf16 (already handled in load path)
|
||||
- MoE experts → native NVFP4 (ModelOptNvFp4FusedMoE handles it)
|
||||
"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# Find and replace the entire _convert_nvfp4_attention_to_fp8 method
|
||||
# and _convert_nvfp4_module_to_fp8 method
|
||||
|
||||
import re
|
||||
|
||||
# Remove old methods and insert new ones
|
||||
# Find the method definitions
|
||||
m1_start = c.find(" def _convert_nvfp4_attention_to_fp8(self):")
|
||||
if m1_start < 0:
|
||||
print("ERROR: Could not find _convert_nvfp4_attention_to_fp8")
|
||||
exit(1)
|
||||
|
||||
# Find the end: look for the next method/class at the same or lower indent
|
||||
# after _convert_nvfp4_module_to_fp8
|
||||
m2_start = c.find(" def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):", m1_start)
|
||||
if m2_start < 0:
|
||||
print("ERROR: Could not find _convert_nvfp4_module_to_fp8")
|
||||
exit(1)
|
||||
|
||||
# Find the end of the second method
|
||||
# Scan for the next line at indent <= 4 that's not blank
|
||||
pos = m2_start
|
||||
lines_after = c[m2_start:].split('\n')
|
||||
end_line = 0
|
||||
for i, line in enumerate(lines_after[1:], 1):
|
||||
if line.strip() == '':
|
||||
continue
|
||||
indent = len(line) - len(line.lstrip())
|
||||
if indent <= 4:
|
||||
end_line = i
|
||||
break
|
||||
|
||||
# Calculate the end position
|
||||
end_pos = m2_start + sum(len(l) + 1 for l in lines_after[:end_line])
|
||||
|
||||
new_methods = ''' def _convert_nvfp4_post_load(self):
|
||||
"""Post-load conversion of NVFP4 weights for vLLM compatibility.
|
||||
|
||||
Strategy:
|
||||
- wo_a, fused_wqa_wkv: Convert NVFP4->FP8 (used with fp8_einsum)
|
||||
- wq_b, wo_b, gate_up_proj: Dequant NVFP4->bf16 (used via .forward())
|
||||
- MoE experts: Stay in native NVFP4 (ModelOptNvFp4FusedMoE)
|
||||
"""
|
||||
E2M1_LUT = torch.tensor(
|
||||
[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
|
||||
)
|
||||
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
|
||||
# Layers that use fp8_einsum (need FP8 + weight_scale_inv)
|
||||
fp8_proj_names = {"fused_wqa_wkv", "wo_a"}
|
||||
# Layers that use normal .forward() (need bf16)
|
||||
bf16_proj_names = {"wq_b", "wo_b"}
|
||||
bf16_shared_names = {"gate_up_proj"}
|
||||
|
||||
fp8_converted = 0
|
||||
bf16_converted = 0
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
attn = layer.attn
|
||||
for proj_name in fp8_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
fp8_converted += 1
|
||||
for proj_name in bf16_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._dequant_nvfp4_to_bf16(mod, E2M1_LUT)
|
||||
bf16_converted += 1
|
||||
# Shared experts
|
||||
ffn = layer.ffn
|
||||
if hasattr(ffn, "shared_experts"):
|
||||
for proj_name in bf16_shared_names:
|
||||
if not hasattr(ffn.shared_experts, proj_name):
|
||||
continue
|
||||
mod = getattr(ffn.shared_experts, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._dequant_nvfp4_to_bf16(mod, E2M1_LUT)
|
||||
bf16_converted += 1
|
||||
|
||||
if fp8_converted > 0 or bf16_converted > 0:
|
||||
print(f"NVFP4 post-load: {fp8_converted} layers -> FP8, "
|
||||
f"{bf16_converted} layers -> bf16, MoE experts stay NVFP4")
|
||||
|
||||
def _dequant_nvfp4_to_bf16(self, mod, e2m1_lut):
|
||||
"""Dequantize NVFP4 weight to bf16 for normal .forward() path."""
|
||||
w_uint8 = mod.weight.data
|
||||
device = w_uint8.device
|
||||
w_bf16 = self._unpack_nvfp4_to_bf16(w_uint8, e2m1_lut, device)
|
||||
|
||||
# Dequantize with scales
|
||||
if hasattr(mod, "weight_scale") and hasattr(mod, "weight_scale_2"):
|
||||
block_scale = mod.weight_scale.data.to(torch.float32)
|
||||
if block_scale.dim() == 2 and w_bf16.dim() == 2:
|
||||
block_size = w_bf16.shape[1] // block_scale.shape[1]
|
||||
block_scale_expanded = block_scale.unsqueeze(-1).expand(
|
||||
-1, -1, block_size
|
||||
).reshape(w_bf16.shape)
|
||||
else:
|
||||
block_scale_expanded = block_scale
|
||||
global_scale = mod.weight_scale_2.data.max().item()
|
||||
input_scale = (
|
||||
mod.input_scale.data.max().item()
|
||||
if hasattr(mod, "input_scale")
|
||||
else 1.0
|
||||
)
|
||||
w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
|
||||
w_dequant = w_dequant.to(torch.bfloat16)
|
||||
else:
|
||||
w_dequant = w_bf16
|
||||
|
||||
# Replace weight with bf16 version
|
||||
mod.weight = torch.nn.Parameter(w_dequant, requires_grad=False)
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale",
|
||||
"weight_scale_inv"):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
|
||||
def _convert_nvfp4_to_fp8(self, mod, e2m1_lut, fp8_max):
|
||||
"""Convert NVFP4 weight to FP8 for fp8_einsum path."""
|
||||
w_uint8 = mod.weight.data
|
||||
device = w_uint8.device
|
||||
w_bf16 = self._unpack_nvfp4_to_bf16(w_uint8, e2m1_lut, device)
|
||||
|
||||
# Dequantize with scales
|
||||
if hasattr(mod, "weight_scale") and hasattr(mod, "weight_scale_2"):
|
||||
block_scale = mod.weight_scale.data.to(torch.float32)
|
||||
if block_scale.dim() == 2 and w_bf16.dim() == 2:
|
||||
block_size = w_bf16.shape[1] // block_scale.shape[1]
|
||||
block_scale_expanded = block_scale.unsqueeze(-1).expand(
|
||||
-1, -1, block_size
|
||||
).reshape(w_bf16.shape)
|
||||
else:
|
||||
block_scale_expanded = block_scale
|
||||
global_scale = mod.weight_scale_2.data.max().item()
|
||||
input_scale = (
|
||||
mod.input_scale.data.max().item()
|
||||
if hasattr(mod, "input_scale")
|
||||
else 1.0
|
||||
)
|
||||
w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
|
||||
w_dequant = w_dequant.to(torch.bfloat16)
|
||||
else:
|
||||
w_dequant = w_bf16
|
||||
|
||||
# Re-quantize bf16 -> FP8 e4m3
|
||||
w_amax = w_dequant.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=device)
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale"):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
|
||||
def _unpack_nvfp4_to_bf16(self, w_uint8, e2m1_lut, device):
|
||||
"""Unpack NVFP4 uint8 packed weights to bf16 using E2M1 format."""
|
||||
# Extract 4-bit FP4 values (0-15, bit 3 = sign)
|
||||
even_raw = (w_uint8 & 0x0F).int()
|
||||
odd_raw = ((w_uint8 >> 4) & 0x0F).int()
|
||||
# Sign: 0-7 = positive, 8-15 = negative
|
||||
even_sign = torch.where(even_raw >= 8, -1.0, 1.0).to(torch.bfloat16)
|
||||
odd_sign = torch.where(odd_raw >= 8, -1.0, 1.0).to(torch.bfloat16)
|
||||
# Magnitude index: lower 3 bits (0-7)
|
||||
even_vals = even_sign * e2m1_lut.to(device)[even_raw & 0x07]
|
||||
odd_vals = odd_sign * e2m1_lut.to(device)[odd_raw & 0x07]
|
||||
# Interleave and flatten
|
||||
w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)
|
||||
w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)
|
||||
return w_bf16
|
||||
'''
|
||||
|
||||
c = c[:m1_start] + new_methods + c[end_pos:]
|
||||
|
||||
# Also update the call from DeepseekV4ForCausalLM.load_weights
|
||||
c = c.replace(
|
||||
"self.model._convert_nvfp4_attention_to_fp8()",
|
||||
"self.model._convert_nvfp4_post_load()"
|
||||
)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(c)
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error at line {e.lineno}: {e.msg}")
|
||||
|
||||
print("Replaced conversion methods with clean FP8/bf16 split")
|
||||
53
tmp/fix_e2m1.py
Normal file
53
tmp/fix_e2m1.py
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/python3
|
||||
"""Fix the E2M1 unpacking in _convert_nvfp4_module_to_fp8"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# Fix the unpacking code in _convert_nvfp4_module_to_fp8
|
||||
old_unpack = ''' even_idx = (w_uint8 & 0x0F).int()
|
||||
odd_idx = ((w_uint8 >> 4) & 0x0F).int()
|
||||
even_vals = e2m1_lut.to(device)[even_idx]
|
||||
odd_vals = e2m1_lut.to(device)[odd_idx]'''
|
||||
|
||||
new_unpack = ''' # Extract 4-bit FP4 values (0-15, bit 3 = sign)
|
||||
even_raw = (w_uint8 & 0x0F).int()
|
||||
odd_raw = ((w_uint8 >> 4) & 0x0F).int()
|
||||
# Sign: 0-7 = positive, 8-15 = negative
|
||||
even_sign = torch.where(even_raw >= 8, -1.0, 1.0).to(torch.bfloat16)
|
||||
odd_sign = torch.where(odd_raw >= 8, -1.0, 1.0).to(torch.bfloat16)
|
||||
# Magnitude index: lower 3 bits (0-7)
|
||||
even_vals = even_sign * e2m1_lut.to(device)[even_raw & 0x07]
|
||||
odd_vals = odd_sign * e2m1_lut.to(device)[odd_raw & 0x07]'''
|
||||
|
||||
c = c.replace(old_unpack, new_unpack)
|
||||
print("Fixed E2M1 unpacking in _convert_nvfp4_module_to_fp8")
|
||||
|
||||
# Also fix the E2M1 unpacking in the stacked params code
|
||||
old_stacked_unpack = ''' even_idx = (loaded_weight & 0x0F).int()
|
||||
odd_idx = ((loaded_weight >> 4) & 0x0F).int()
|
||||
even_vals = E2M1_LUT[even_idx]
|
||||
odd_vals = E2M1_LUT[odd_idx]'''
|
||||
|
||||
new_stacked_unpack = ''' # Extract 4-bit FP4 values (0-15, bit 3 = sign)
|
||||
even_raw = (loaded_weight & 0x0F).int()
|
||||
odd_raw = ((loaded_weight >> 4) & 0x0F).int()
|
||||
even_sign = torch.where(even_raw >= 8, -1.0, 1.0).to(torch.bfloat16)
|
||||
odd_sign = torch.where(odd_raw >= 8, -1.0, 1.0).to(torch.bfloat16)
|
||||
even_vals = even_sign * E2M1_LUT[even_raw & 0x07]
|
||||
odd_vals = odd_sign * E2M1_LUT[odd_raw & 0x07]'''
|
||||
|
||||
c = c.replace(old_stacked_unpack, new_stacked_unpack)
|
||||
print("Fixed E2M1 unpacking in stacked params code")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(c)
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error: {e}")
|
||||
129
tmp/fix_indent.py
Normal file
129
tmp/fix_indent.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/python3
|
||||
"""Fix the placement of _convert_nvfp4 methods - move inside DeepseekV4Model"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# Remove the wrongly placed methods (at top level, 0 indent)
|
||||
# Find and remove the block between the marker and the class definition
|
||||
marker = " def _convert_nvfp4_attention_to_fp8(self):\n"
|
||||
class_marker = "\nclass DeepseekV4ForCausalLM(nn.Module):"
|
||||
|
||||
# Find the wrongly placed methods and remove them
|
||||
idx = c.find(" def _convert_nvfp4_attention_to_fp8(self):\n")
|
||||
class_idx = c.find("\n\nclass DeepseekV4ForCausalLM(nn.Module):")
|
||||
|
||||
if idx > 0 and class_idx > 0 and idx < class_idx:
|
||||
# Remove the wrongly placed methods
|
||||
# Find the start of the blank lines before the methods
|
||||
search_start = idx
|
||||
while search_start > 0 and c[search_start-1] == '\n':
|
||||
search_start -= 1
|
||||
|
||||
c = c[:search_start] + c[class_idx:]
|
||||
print(f"Removed wrongly placed methods (chars {search_start}-{class_idx})")
|
||||
else:
|
||||
print(f"Could not find wrongly placed methods: idx={idx}, class_idx={class_idx}")
|
||||
|
||||
# Now insert the methods INSIDE DeepseekV4Model class, right before
|
||||
# the line that precedes DeepseekV4ForCausalLM
|
||||
# Find the last method of DeepseekV4Model before the class boundary
|
||||
# Insert before "class DeepseekV4ForCausalLM"
|
||||
insert_point = c.find("\n\nclass DeepseekV4ForCausalLM(nn.Module):")
|
||||
if insert_point < 0:
|
||||
print("ERROR: Could not find class marker")
|
||||
else:
|
||||
# The methods need to be at 4-space indent (class method level)
|
||||
methods = '''
|
||||
def _convert_nvfp4_attention_to_fp8(self):
|
||||
E2M1_LUT = torch.tensor(
|
||||
[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
|
||||
)
|
||||
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}
|
||||
shared_expert_names = {"gate_up_proj"}
|
||||
converted = 0
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
attn = layer.attn
|
||||
for proj_name in attn_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
ffn = layer.ffn
|
||||
if hasattr(ffn, "shared_experts"):
|
||||
for proj_name in shared_expert_names:
|
||||
if not hasattr(ffn.shared_experts, proj_name):
|
||||
continue
|
||||
mod = getattr(ffn.shared_experts, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
if converted > 0:
|
||||
logger.info_once(
|
||||
"Converted %d NVFP4 attention/shared-expert layers to FP8",
|
||||
converted,
|
||||
)
|
||||
|
||||
def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):
|
||||
w_uint8 = mod.weight.data
|
||||
device = w_uint8.device
|
||||
even_idx = (w_uint8 & 0x0F).int()
|
||||
odd_idx = ((w_uint8 >> 4) & 0x0F).int()
|
||||
even_vals = e2m1_lut.to(device)[even_idx]
|
||||
odd_vals = e2m1_lut.to(device)[odd_idx]
|
||||
w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)
|
||||
w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)
|
||||
if hasattr(mod, "weight_scale") and hasattr(mod, "weight_scale_2"):
|
||||
block_scale = mod.weight_scale.data.to(torch.float32)
|
||||
if block_scale.dim() == 2 and w_bf16.dim() == 2:
|
||||
block_size = w_bf16.shape[1] // block_scale.shape[1]
|
||||
block_scale_expanded = block_scale.unsqueeze(-1).expand(
|
||||
-1, -1, block_size
|
||||
).reshape(w_bf16.shape)
|
||||
else:
|
||||
block_scale_expanded = block_scale
|
||||
global_scale = mod.weight_scale_2.data.max().item()
|
||||
input_scale = (
|
||||
mod.input_scale.data.max().item()
|
||||
if hasattr(mod, "input_scale")
|
||||
else 1.0
|
||||
)
|
||||
w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
|
||||
w_dequant = w_dequant.to(torch.bfloat16)
|
||||
else:
|
||||
w_dequant = w_bf16
|
||||
w_amax = w_dequant.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=device)
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale"):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
'''
|
||||
c = c[:insert_point] + methods + c[insert_point:]
|
||||
print("Inserted methods at correct indentation level")
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(c)
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error: {e}")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
37
tmp/fix_logger.py
Normal file
37
tmp/fix_logger.py
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/python3
|
||||
"""Fix the logger.info_once call and any syntax issues"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
# Find and fix the logger/print issue
|
||||
new_lines = []
|
||||
skip_until_paren = False
|
||||
for i, line in enumerate(lines):
|
||||
stripped = line.strip()
|
||||
|
||||
# Replace the print( call that was replacing logger.info_once
|
||||
if 'print(' in line and 'Converted %d' in line:
|
||||
new_lines.append(' if converted > 0:\n')
|
||||
new_lines.append(' print(f"Converted {converted} NVFP4 attention/shared-expert layers to FP8")\n')
|
||||
skip_until_paren = True
|
||||
continue
|
||||
|
||||
if skip_until_paren:
|
||||
if ')' in line:
|
||||
skip_until_paren = False
|
||||
continue
|
||||
|
||||
new_lines.append(line)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.writelines(new_lines)
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(''.join(new_lines))
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error: {e}")
|
||||
44
tmp/fix_logger2.py
Normal file
44
tmp/fix_logger2.py
Normal file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/python3
|
||||
"""Clean up the broken logger replacement"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
new_lines = []
|
||||
skip = False
|
||||
for i, line in enumerate(lines):
|
||||
if skip:
|
||||
if ')' in line:
|
||||
skip = False
|
||||
continue
|
||||
|
||||
# Fix the broken logger replacement
|
||||
if '# logger.info_once(' in line or 'logger.info_once(' in line:
|
||||
if '"Converted %d NVFP4' in lines[i+1] if i+1 < len(lines) else False:
|
||||
# Replace the whole block
|
||||
new_lines.append(' print(f"Converted {converted} NVFP4 layers to FP8")\n')
|
||||
skip = True
|
||||
continue
|
||||
else:
|
||||
new_lines.append(line)
|
||||
continue
|
||||
|
||||
# Also remove orphaned lines from the old block
|
||||
if '"Converted %d NVFP4 attention/shared-expert layers to FP8",' in line:
|
||||
continue
|
||||
if line.strip() == 'converted,':
|
||||
continue
|
||||
|
||||
new_lines.append(line)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.writelines(new_lines)
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(''.join(new_lines))
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error at line {e.lineno}: {e.msg}")
|
||||
78
tmp/fix_oa_fp8.py
Normal file
78
tmp/fix_oa_fp8.py
Normal file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/python3
|
||||
"""Replace the bf16->NVFP4 quantization handler with a simple bf16->FP8 conversion.
|
||||
wo_a is used with fp8_einsum, so it needs FP8 weight + weight_scale_inv."""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# Find and replace the entire bf16->uint8 handler block
|
||||
# It starts with the o_a_proj comment and ends with "continue"
|
||||
import re
|
||||
|
||||
# Find the handler
|
||||
m = re.search(
|
||||
r"( # Handle o_a_proj bf16 → wo_a uint8 mismatch:.*?)(\n continue\n)",
|
||||
c,
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
if m:
|
||||
# Replace with bf16->FP8 conversion
|
||||
new_handler = ''' # Handle o_a_proj bf16 -> wo_a: convert to FP8 directly
|
||||
# (wo_a is used with fp8_einsum, needs FP8 + weight_scale_inv)
|
||||
if (name.endswith(".weight")
|
||||
and loaded_weight.dtype != torch.uint8
|
||||
and param.data.dtype == torch.uint8):
|
||||
w_bf16 = loaded_weight
|
||||
w_amax = w_bf16.abs().amax()
|
||||
if w_amax == 0:
|
||||
w_amax = torch.tensor(1.0, device=w_bf16.device)
|
||||
fp8_max = torch.finfo(torch.float8_e4m3fn).max
|
||||
fp8_scale = w_amax / fp8_max
|
||||
w_fp8 = (w_bf16 / fp8_scale).to(torch.float8_e4m3fn)
|
||||
weight_scale_inv = fp8_scale.to(torch.float32)
|
||||
parts = name.rsplit(".", 1)
|
||||
module_path = parts[0]
|
||||
mod = self
|
||||
for attr in module_path.split("."):
|
||||
if attr.isdigit():
|
||||
mod = mod[int(attr)]
|
||||
else:
|
||||
mod = getattr(mod, attr)
|
||||
mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
|
||||
mod.weight_scale_inv = torch.nn.Parameter(
|
||||
weight_scale_inv.reshape(1), requires_grad=False
|
||||
)
|
||||
from vllm.model_executor.layers.linear import (
|
||||
UnquantizedLinearMethod,
|
||||
)
|
||||
mod.quant_method = UnquantizedLinearMethod()
|
||||
for attr in ("weight_scale", "weight_scale_2", "input_scale"):
|
||||
if hasattr(mod, attr):
|
||||
delattr(mod, attr)
|
||||
loaded_params.add(name)
|
||||
loaded_params.add(name.replace(".weight", ".weight_scale_inv"))
|
||||
continue
|
||||
'''
|
||||
c = c[:m.start()] + new_handler + c[m.end():]
|
||||
print("Replaced bf16->NVFP4 handler with bf16->FP8 handler")
|
||||
else:
|
||||
print("Could not find handler block, trying alternate search...")
|
||||
# Try finding just the if condition
|
||||
idx = c.find("and loaded_weight.dtype != torch.uint8\n")
|
||||
if idx > 0:
|
||||
print(f"Found condition at position {idx}")
|
||||
else:
|
||||
print("ERROR: Could not find condition")
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(c)
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error at line {e.lineno}: {e.msg}")
|
||||
94
tmp/fix_replace_handler.py
Normal file
94
tmp/fix_replace_handler.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/python3
|
||||
"""Replace the old bf16->NVFP4 handler with a simple bf16->FP8 handler."""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
# Find the handler: starts with the if check, ends with continue
|
||||
start = None
|
||||
end = None
|
||||
for i, line in enumerate(lines):
|
||||
if 'loaded_weight.dtype != torch.uint8' in line and 'param.data.dtype == torch.uint8' in line:
|
||||
# Go back to find the if statement start
|
||||
for j in range(i, max(i-3, 0), -1):
|
||||
if lines[j].strip().startswith('if (name.endswith'):
|
||||
start = j
|
||||
break
|
||||
if start is None:
|
||||
start = i # fallback
|
||||
break
|
||||
|
||||
if start is None:
|
||||
print("Could not find handler start")
|
||||
exit(1)
|
||||
|
||||
# Find the end: the first 'continue' at indent level 20+ after start
|
||||
for i in range(start + 1, min(start + 200, len(lines))):
|
||||
stripped = lines[i].strip()
|
||||
if stripped == 'continue':
|
||||
indent = len(lines[i]) - len(lines[i].lstrip())
|
||||
if indent >= 20:
|
||||
end = i
|
||||
break
|
||||
|
||||
if end is None:
|
||||
print("Could not find handler end")
|
||||
exit(1)
|
||||
|
||||
print(f"Replacing lines {start+1} to {end+1} ({end-start+1} lines)")
|
||||
print(f"First: {lines[start].rstrip()[:80]}")
|
||||
print(f"Last: {lines[end].rstrip()[:80]}")
|
||||
|
||||
new_handler = [
|
||||
' if (name.endswith(".weight")\n',
|
||||
' and loaded_weight.dtype != torch.uint8\n',
|
||||
' and param.data.dtype == torch.uint8):\n',
|
||||
' # o_a_proj was NOT quantized by modelopt (bf16, no scales)\n',
|
||||
' # wo_a is used with fp8_einsum: convert bf16 -> FP8 directly\n',
|
||||
' w_bf16 = loaded_weight\n',
|
||||
' w_amax = w_bf16.abs().amax()\n',
|
||||
' if w_amax == 0:\n',
|
||||
' w_amax = torch.tensor(1.0, device=w_bf16.device)\n',
|
||||
' fp8_max = torch.finfo(torch.float8_e4m3fn).max\n',
|
||||
' fp8_scale = w_amax / fp8_max\n',
|
||||
' w_fp8 = (w_bf16 / fp8_scale).to(torch.float8_e4m3fn)\n',
|
||||
' weight_scale_inv = fp8_scale.to(torch.float32)\n',
|
||||
' parts = name.rsplit(".", 1)\n',
|
||||
' module_path = parts[0]\n',
|
||||
' mod = self\n',
|
||||
' for attr in module_path.split("."):\n',
|
||||
' if attr.isdigit():\n',
|
||||
' mod = mod[int(attr)]\n',
|
||||
' else:\n',
|
||||
' mod = getattr(mod, attr)\n',
|
||||
' mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)\n',
|
||||
' mod.weight_scale_inv = torch.nn.Parameter(\n',
|
||||
' weight_scale_inv.reshape(1), requires_grad=False\n',
|
||||
' )\n',
|
||||
' from vllm.model_executor.layers.linear import (\n',
|
||||
' UnquantizedLinearMethod,\n',
|
||||
' )\n',
|
||||
' mod.quant_method = UnquantizedLinearMethod()\n',
|
||||
' for attr in ("weight_scale", "weight_scale_2", "input_scale"):\n',
|
||||
' if hasattr(mod, attr):\n',
|
||||
' delattr(mod, attr)\n',
|
||||
' loaded_params.add(name)\n',
|
||||
' loaded_params.add(name.replace(".weight", ".weight_scale_inv"))\n',
|
||||
' continue\n',
|
||||
]
|
||||
|
||||
lines[start:end+1] = new_handler
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.writelines(lines)
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(''.join(lines))
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error at line {e.lineno}: {e.msg}")
|
||||
|
||||
print(f"Replaced {end-start+1} lines with {len(new_handler)} lines")
|
||||
64
tmp/fix_selective_fp8.py
Normal file
64
tmp/fix_selective_fp8.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/python3
|
||||
"""Fix: Only convert wo_a and fused_wqa_wkv to FP8 (used with fp8_einsum).
|
||||
Keep wq_b, wo_b, gate_up_proj in NVFP4 (used via normal .forward())."""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
c = f.read()
|
||||
|
||||
# Fix 1: Change the list of projections to convert
|
||||
old_proj_names = 'attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}'
|
||||
new_proj_names = 'attn_proj_names = {"fused_wqa_wkv", "wo_a"} # Only these use fp8_einsum'
|
||||
c = c.replace(old_proj_names, new_proj_names)
|
||||
|
||||
# Fix 2: Remove shared_experts gate_up_proj from conversion
|
||||
old_shared = ''' shared_expert_names = {"gate_up_proj"}
|
||||
converted = 0
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
attn = layer.attn
|
||||
for proj_name in attn_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
ffn = layer.ffn
|
||||
if hasattr(ffn, "shared_experts"):
|
||||
for proj_name in shared_expert_names:
|
||||
if not hasattr(ffn.shared_experts, proj_name):
|
||||
continue
|
||||
mod = getattr(ffn.shared_experts, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1'''
|
||||
|
||||
new_shared = ''' converted = 0
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
attn = layer.attn
|
||||
for proj_name in attn_proj_names:
|
||||
if not hasattr(attn, proj_name):
|
||||
continue
|
||||
mod = getattr(attn, proj_name)
|
||||
if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
|
||||
continue
|
||||
self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
|
||||
converted += 1
|
||||
# wq_b, wo_b, gate_up_proj stay in NVFP4 (normal .forward() path)'''
|
||||
|
||||
c = c.replace(old_shared, new_shared)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write(c)
|
||||
|
||||
import ast
|
||||
try:
|
||||
ast.parse(c)
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error: {e}")
|
||||
|
||||
print("Updated: only fused_wqa_wkv and wo_a converted to FP8")
|
||||
41
tmp/fix_syntax.py
Normal file
41
tmp/fix_syntax.py
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/python3
|
||||
"""Fix the broken docstrings in deepseek_v4.py"""
|
||||
|
||||
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
content = f.read()
|
||||
|
||||
# Replace the problematic triple-quoted docstrings with comments
|
||||
content = content.replace(
|
||||
' def _convert_nvfp4_attention_to_fp8(self):\n """Convert NVFP4 attention weights to FP8 format."""',
|
||||
' def _convert_nvfp4_attention_to_fp8(self): # Convert NVFP4 attention weights to FP8'
|
||||
)
|
||||
|
||||
content = content.replace(
|
||||
' def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):\n """Convert a single NVFP4 Linear module to FP8 format."""',
|
||||
' def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max): # Convert single NVFP4 module to FP8'
|
||||
)
|
||||
|
||||
# Also remove any stray docstring lines
|
||||
lines = content.split('\n')
|
||||
new_lines = []
|
||||
skip_next = False
|
||||
for i, line in enumerate(lines):
|
||||
stripped = line.strip()
|
||||
if stripped == '"""' or stripped == "'''":
|
||||
continue # Skip standalone triple-quote lines
|
||||
if '"""Convert NVFP4' in line or '"""Convert a single' in line:
|
||||
continue # Skip the one-line docstrings that might remain
|
||||
new_lines.append(line)
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
f.write('\n'.join(new_lines))
|
||||
|
||||
# Verify syntax
|
||||
import ast
|
||||
try:
|
||||
ast.parse('\n'.join(new_lines))
|
||||
print("Syntax OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax error: {e}")
|
||||
179
verify_nvfp4.py
179
verify_nvfp4.py
@@ -1,179 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Sanity check an NVFP4 DeepSeek V4 Pro checkpoint.
|
||||
|
||||
Two modes:
|
||||
|
||||
1) --tensor-only (default): no model loading. Just inspects the safetensors
|
||||
shards: confirms NVFP4 packing structure (uint8 weight + FP8 weight_scale
|
||||
+ FP32 weight_scale_2), checks for NaN/Inf in scales, samples a few
|
||||
dequantizations to confirm they look plausible.
|
||||
|
||||
2) --vllm: tries to load the model with vLLM and generate a few tokens.
|
||||
Requires vLLM with NVFP4 support (SM100+ Blackwell GPU).
|
||||
|
||||
Usage:
|
||||
python verify_nvfp4.py DeepSeek-V4-Pro-NVFP4-streaming
|
||||
python verify_nvfp4.py DeepSeek-V4-Pro-NVFP4-streaming --vllm
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from safetensors import safe_open
|
||||
|
||||
|
||||
FP4_E2M1_VALUES = torch.tensor(
|
||||
[0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
|
||||
-0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0],
|
||||
dtype=torch.float32,
|
||||
)
|
||||
|
||||
|
||||
def unpack_fp4(packed: torch.Tensor) -> torch.Tensor:
|
||||
"""Reverse the (low | high<<4) byte pack into a [M, N] tensor of FP4 indices."""
|
||||
low = packed & 0x0F
|
||||
high = (packed >> 4) & 0x0F
|
||||
M, N_half = packed.shape
|
||||
out = torch.empty(M, N_half * 2, dtype=torch.uint8)
|
||||
out[:, ::2] = low
|
||||
out[:, 1::2] = high
|
||||
return out
|
||||
|
||||
|
||||
def dequant_nvfp4(packed_uint8, weight_scale_fp8, weight_scale_2_fp32):
|
||||
"""Reconstruct FP32 values from NVFP4 storage."""
|
||||
fp4_idx = unpack_fp4(packed_uint8)
|
||||
values = FP4_E2M1_VALUES[fp4_idx.long()] # [M, N]
|
||||
M, N = values.shape
|
||||
# Per-block scale broadcast back over 16 elements
|
||||
scale_blocks = weight_scale_fp8.float() # [M, N//16]
|
||||
scale_per_elem = scale_blocks.unsqueeze(-1).expand(-1, -1, 16).reshape(M, N)
|
||||
return values * scale_per_elem * weight_scale_2_fp32.float()
|
||||
|
||||
|
||||
def tensor_only_check(model_dir: Path):
|
||||
index_path = model_dir / "model.safetensors.index.json"
|
||||
if not index_path.exists():
|
||||
sys.exit(f"No index.json at {model_dir}")
|
||||
with open(index_path) as f:
|
||||
index = json.load(f)
|
||||
weight_map = index["weight_map"]
|
||||
|
||||
# Find one quantized weight to sample
|
||||
sample = None
|
||||
for name, fn in weight_map.items():
|
||||
if name.endswith(".weight") and (name.replace(".weight", ".weight_scale") in weight_map):
|
||||
sample = name
|
||||
break
|
||||
if not sample:
|
||||
sys.exit("Couldn't find an NVFP4-quantized weight (expected *.weight_scale companion).")
|
||||
|
||||
print(f"Sampling: {sample}")
|
||||
shard_fn = weight_map[sample]
|
||||
scale_name = sample.replace(".weight", ".weight_scale")
|
||||
scale_2_name = sample.replace(".weight", ".weight_scale_2")
|
||||
scale_shard = weight_map[scale_name]
|
||||
scale_2_shard = weight_map[scale_2_name]
|
||||
|
||||
def open_get(fn, name):
|
||||
with safe_open(model_dir / fn, framework="pt") as f:
|
||||
return f.get_tensor(name)
|
||||
|
||||
packed = open_get(shard_fn, sample)
|
||||
weight_scale = open_get(scale_shard, scale_name)
|
||||
weight_scale_2 = open_get(scale_2_shard, scale_2_name)
|
||||
|
||||
print(f" packed: shape={tuple(packed.shape)} dtype={packed.dtype}")
|
||||
print(f" weight_scale: shape={tuple(weight_scale.shape)} dtype={weight_scale.dtype}")
|
||||
print(f" weight_scale_2: shape={tuple(weight_scale_2.shape)} dtype={weight_scale_2.dtype} "
|
||||
f"value={weight_scale_2.float().item():.6e}")
|
||||
|
||||
# Structural assertions
|
||||
M = packed.shape[0]
|
||||
assert packed.dtype == torch.uint8, f"packed should be uint8, got {packed.dtype}"
|
||||
assert weight_scale.dtype == torch.float8_e4m3fn, \
|
||||
f"weight_scale should be FP8 E4M3, got {weight_scale.dtype}"
|
||||
assert weight_scale.shape == (M, packed.shape[1] * 2 // 16), \
|
||||
f"weight_scale shape {weight_scale.shape} doesn't match expected (M, N/16)"
|
||||
|
||||
# Check for NaN/Inf in scales
|
||||
s_fp32 = weight_scale.float()
|
||||
assert torch.isfinite(s_fp32).all(), "weight_scale contains NaN/Inf"
|
||||
assert torch.isfinite(weight_scale_2.float()).all(), "weight_scale_2 is NaN/Inf"
|
||||
print(f" scales: all finite ✓")
|
||||
print(f" weight_scale stats: min={s_fp32.min().item():.3e} max={s_fp32.max().item():.3e} "
|
||||
f"mean={s_fp32.mean().item():.3e}")
|
||||
|
||||
# Spot-check dequantization
|
||||
print("\nDequantizing first 4x32 block for visual check:")
|
||||
rec = dequant_nvfp4(packed[:4, :16], weight_scale[:4, :2], weight_scale_2)
|
||||
print(rec)
|
||||
assert torch.isfinite(rec).all(), "Dequantized values contain NaN/Inf"
|
||||
print(f" dequant: all finite ✓")
|
||||
print(f" dequant range: [{rec.min().item():.4f}, {rec.max().item():.4f}]")
|
||||
|
||||
# Count what's quantized vs preserved across the whole model
|
||||
quantized_weights = []
|
||||
preserved = []
|
||||
for name in weight_map:
|
||||
if name.endswith(".weight"):
|
||||
if name.replace(".weight", ".weight_scale") in weight_map:
|
||||
quantized_weights.append(name)
|
||||
else:
|
||||
preserved.append(name)
|
||||
|
||||
print(f"\nWhole-model summary:")
|
||||
print(f" Quantized .weight tensors: {len(quantized_weights):,}")
|
||||
print(f" Preserved .weight tensors: {len(preserved):,}")
|
||||
print(f" Total tensors in index: {len(weight_map):,}")
|
||||
|
||||
# Show a few preserved names to confirm the right things stayed in higher precision
|
||||
print(f"\n Sample preserved tensors (should be lm_head, embed, gates, norms, etc.):")
|
||||
for n in preserved[:10]:
|
||||
print(f" {n}")
|
||||
|
||||
|
||||
def vllm_check(model_dir: Path):
|
||||
print("Loading model with vLLM... (requires Blackwell GPU + vLLM with NVFP4 support)")
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(
|
||||
model=str(model_dir),
|
||||
trust_remote_code=True,
|
||||
quantization="compressed-tensors",
|
||||
dtype="auto",
|
||||
tensor_parallel_size=8,
|
||||
max_model_len=8192,
|
||||
)
|
||||
sampling = SamplingParams(temperature=1.0, top_p=1.0, max_tokens=64)
|
||||
|
||||
prompts = [
|
||||
"Write a short poem about quantization:",
|
||||
"What is 17 * 23?",
|
||||
"Explain MoE routing in one sentence.",
|
||||
]
|
||||
outputs = llm.generate(prompts, sampling)
|
||||
for o in outputs:
|
||||
print("=" * 60)
|
||||
print("PROMPT:", o.prompt)
|
||||
print("OUTPUT:", o.outputs[0].text)
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("model_dir")
|
||||
ap.add_argument("--vllm", action="store_true")
|
||||
args = ap.parse_args()
|
||||
model_dir = Path(args.model_dir)
|
||||
|
||||
tensor_only_check(model_dir)
|
||||
if args.vllm:
|
||||
print("\n" + "=" * 60)
|
||||
vllm_check(model_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user