Files
deepseek-v4-quant/tmp/fix6_nvfp4_to_fp8.py
biondizzle 02b8ea536f Update MEMORY.md and memory files with vLLM NVFP4 serving progress
Server running on B200 port 8000 with full NVFP4→vLLM bridge.
All critical bugs fixed: DeepGEMM scale format, compressor shapes, block scale values.
2026-05-11 02:02:49 +00:00

82 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""
Replace the current NVFP4 weight handling with a comprehensive
NVFP4→bf16→FP8 re-quantization path for attention layers.
The vLLM DeepSeekV4 attention forward code uses deepseek_v4_fp8_einsum
which requires FP8 weights + weight_scale_inv. NVFP4 weights (uint8 packed
FP4 + per-block scales + per-tensor scales) are incompatible with this kernel.
Solution: At load time, dequantize all NVFP4 attention weights to bf16,
then re-quantize to FP8. Store the FP8 weights + weight_scale_inv.
The existing FP8 attention forward code then works without modification.
For compressor fused_wkv_wgate: stays bf16 (UnquantizedLinearMethod).
For MoE experts: handled by ModelOptNvFp4FusedMoE natively.
For shared experts gate_up_proj: also needs FP8 conversion.
"""
import re
import sys
import textwrap
from typing import Optional, Sequence

# Default file to patch; a different path may be given as the first CLI argument.
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# ============================================================
# Anchor: the start of the load_weights method, up to and including
# the E2M1_LUT definition. The helper is inserted right after it.
# Lines are matched with any leading/trailing whitespace so the patch
# does not depend on the exact indentation of the target file.
# ============================================================
ANCHOR_LINES = (
    "def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:",
    "# Define E2M1 FP4 → BF16 lookup table for unpacking",
    "E2M1_LUT = torch.tensor(",
    "[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16",
    ")",
)
_LINE_BREAK = r"[ \t]*\n[ \t]*"
ANCHOR_RE = re.compile(
    r"^[ \t]*"
    + re.escape(ANCHOR_LINES[0])
    # The indentation of the first body line is captured so the helper can
    # be inserted at the same level as the E2M1_LUT definition.
    + r"[ \t]*\n(?P<body_indent>[ \t]*)"
    + _LINE_BREAK.join(re.escape(line) for line in ANCHOR_LINES[1:]),
    re.MULTILINE,
)

# Presence of this text means the helper was already inserted.
HELPER_MARKER = "def _nvfp4_to_fp8("

# ============================================================
# Helper function: the NVFP4→FP8 conversion utility, written at
# column 0 and re-indented to the load_weights body level on insert.
#
# Fixes relative to the first version of this helper:
#   * the FP4 sign bit (bit 3) is decoded separately, because E2M1_LUT
#     only holds the 8 magnitudes and a raw nibble can be 0..15;
#   * each block scale is broadcast over its own block of the input
#     dimension instead of being multiplied as an [out, blocks] matrix;
#   * the FP8 scale is computed in float32 and guarded against an
#     all-zero tensor (0/0 would give a NaN scale).
#
# NOTE(review): the dequantization still multiplies by w_input_scale,
# as the first version did. In ModelOpt NVFP4 checkpoints input_scale
# looks like the activation scale — confirm that folding it into the
# weights is intended.
# ============================================================
HELPER_SOURCE = '''\
# ── NVFP4 → FP8 re-quantization helper ──────────────────
# The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
# which requires FP8 weights + weight_scale_inv. ModelOpt NVFP4
# stores uint8 packed FP4 + per-block/per-tensor scales.
# We dequantize NVFP4→bf16, then re-quantize to FP8 at load time.
def _nvfp4_to_fp8(w_uint8, w_scale, w_scale_2, w_input_scale):
    """Convert NVFP4 weight + scales to FP8 weight + weight_scale_inv."""
    lut = E2M1_LUT.to(w_uint8.device)
    def _decode(nibble):
        # Bits 0-2 index the magnitude table; bit 3 is the sign.
        magnitude = lut[(nibble & 0x07).long()]
        return torch.where((nibble & 0x08) != 0, -magnitude, magnitude)
    # Unpack uint8 → E2M1 FP4 → bf16 (low nibble first)
    bf16_even = _decode(w_uint8 & 0x0F)
    bf16_odd = _decode((w_uint8 >> 4) & 0x0F)
    # Interleave back: stack along last dim and flatten
    w_bf16 = torch.stack([bf16_even, bf16_odd], dim=-1)
    w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1)  # [out, in_dim]
    # Dequantize: bf16_val = fp4 * block_scale * global_scale * input_scale
    w_f32 = w_bf16.float()
    if w_scale.dim() == 2:
        out_dim = w_f32.shape[0]
        block_scale = w_scale.to(torch.float32).unsqueeze(-1)  # [out, blocks, 1]
        # View the weights as [out, blocks, block_size] so every scale
        # broadcasts over exactly the elements of its own block.
        w_f32 = w_f32.reshape(out_dim, w_scale.shape[1], -1) * block_scale
        w_f32 = w_f32.reshape(out_dim, -1)
    w_f32 = w_f32 * w_scale_2.item() * w_input_scale.item()
    w_bf16_scaled = w_f32.to(torch.bfloat16)
    # Re-quantize bf16 → FP8 e4m3 (single per-tensor scale)
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    w_amax = w_bf16_scaled.abs().amax().float()
    # Guard against an all-zero tensor (0/0 would give a NaN scale).
    fp8_scale = (w_amax / fp8_max).clamp(min=1e-12)
    w_scaled = (w_bf16_scaled.float() / fp8_scale).clamp(-fp8_max, fp8_max)
    w_fp8 = w_scaled.to(torch.float8_e4m3fn)
    weight_scale_inv = fp8_scale.to(torch.float32)
    return w_fp8, weight_scale_inv
# ── End helper ──────────────────────────────────────────
'''


def apply_patch(content: str) -> str:
    """Return *content* with the NVFP4→FP8 helper inserted after the anchor.

    The helper is inserted after every occurrence of the anchor, indented
    to the level of the anchor's first body line.

    Args:
        content: Full source text of the file to patch.

    Returns:
        The patched source. If the helper is already present, *content* is
        returned unchanged, so applying the patch twice is harmless.

    Raises:
        ValueError: If the anchor (start of ``load_weights`` with the
            ``E2M1_LUT`` definition) is not found in *content*.
    """
    if HELPER_MARKER in content:
        return content

    def _with_helper(match: "re.Match[str]") -> str:
        helper = textwrap.indent(HELPER_SOURCE, match.group("body_indent"))
        return match.group(0) + "\n" + helper

    patched, hits = ANCHOR_RE.subn(_with_helper, content)
    if not hits:
        raise ValueError(
            "anchor not found: load_weights() with the E2M1_LUT definition"
        )
    return patched


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Patch the target file in place and return a process exit code.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The optional first argument overrides the
            default ``filepath``.

    Returns:
        0 on success (including "already patched"), 1 if the anchor is
        missing, in which case the file is left untouched.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    target = args[0] if args else filepath

    # Explicit UTF-8: the file and the inserted helper contain non-ASCII text.
    with open(target, "r", encoding="utf-8") as f:
        content = f.read()

    try:
        patched = apply_patch(content)
    except ValueError as err:
        print(f"ERROR: {err} in {target}; file left unchanged", file=sys.stderr)
        return 1

    if patched == content:
        print("NVFP4→FP8 helper already present; nothing to do")
        return 0

    print("Added NVFP4→FP8 helper function")
    with open(target, "w", encoding="utf-8") as f:
        f.write(patched)
    print("Written to file")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())