Files
deepseek-v4-quant/tmp/fix7_stacked.py
biondizzle 02b8ea536f Update MEMORY.md and memory files with vLLM NVFP4 serving progress
Server running on B200 port 8000 with full NVFP4→vLLM bridge.
All critical bugs fixed: DeepGEMM scale format, compressor shapes, block scale values.
2026-05-11 02:02:49 +00:00

89 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
Update the load_weights method to use NVFP4→FP8 conversion for attention layers.

Key changes this patch series is working towards:
1. Stacked params (fused_wqa_wkv): when uint8, dequantize to bf16, re-quantize to FP8,
   then load as FP8 weight + weight_scale_inv
2. Non-stacked params (wq_b, wo_a, wo_b, gate_up_proj): same treatment
3. Compressor fused_wkv_wgate: stays as bf16 (E2M1 unpack only)
4. Remove the separate bf16→uint8 handler (no longer needed since we go to FP8)

What this script itself does: one exact-text substitution of the stacked-params
unpack block inside ``filepath``. As written here, the executable lines of the
replacement text are the same as those of the text it replaces; only the
explanatory comments in front of (and inside) the block change.

Outcomes:
- old block found          -> replaced everywhere it occurs, file rewritten
- replacement already there -> nothing written (safe to re-run)
- neither text found       -> exit with an error, file left untouched
"""

filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# The patterns below contain non-ASCII characters ("→"), so the encoding is
# pinned instead of depending on the locale of whoever runs the script.
with open(filepath, 'r', encoding='utf-8') as f:
    content = f.read()

# ============================================================
# Replace the stacked params loading block
# The current code unpacks uint8→bf16 for MergedColumnParallelLinear
# We need to instead convert NVFP4→FP8 for attention/shared expert
# and unpack→bf16 for compressor
# ============================================================
# NOTE: str.replace matches exactly, including every space and newline inside
# the literal, so both literals must mirror the target file's text verbatim.
old_stacked_unpack = ''' # ModelOpt NVFP4 packed weight fix for MergedColumnParallelLinear.
#
# modelopt exports NVFP4 packed weights as uint8 (2 values/byte
# along the column dim). But MergedColumnParallelLinear creates
# weight as bf16 (not PackedColumnParameter uint8) because
# ModelOptNvFp4Config only handles Linear, not
# MergedColumnParallelLinear.
#
# For compressor fused_wkv_wgate (quant_config=None →
# UnquantizedLinearMethod → bf16 weight):
# Unpack uint8→bf16 using E2M1 LUT, load into bf16 param.
#
# For fused_wqa_wkv (NVFP4 quant method → uint8 weight):
# The weight param IS uint8, so no unpacking needed.
# Just load the packed uint8 weight directly.
# Scales are loaded separately (no longer skipped).
if (loaded_weight.dtype == torch.uint8
and param.data.dtype != torch.uint8
and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
# Unpack NVFP4 (E2M1) → BF16
# E2M1 LUT: 0→0, 1→0.5, 2→1, 3→1.5, 4→2, 5→3, 6→4, 7→6
even_idx = (loaded_weight & 0x0F).int()
odd_idx = ((loaded_weight >> 4) & 0x0F).int()
even_vals = E2M1_LUT[even_idx]
odd_vals = E2M1_LUT[odd_idx]
# Interleave even and odd along the last dim
out = torch.stack([even_vals, odd_vals], dim=-1)
out = out.reshape(
loaded_weight.shape[0], -1
).to(torch.bfloat16)
loaded_weight = out'''

new_stacked_unpack = ''' # ModelOpt NVFP4 weight handling for stacked params.
#
# The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
# which requires FP8 weights + weight_scale_inv. NVFP4 weights are
# incompatible. We convert NVFP4→bf16→FP8 at load time.
#
# For compressor fused_wkv_wgate (UnquantizedLinearMethod → bf16):
# Just unpack uint8→bf16 and load into bf16 param.
#
# For fused_wqa_wkv and gate_up_proj (NVFP4 quant → uint8):
# Collect the uint8 weight + scales, then convert to FP8
# using the _nvfp4_to_fp8 helper after all sub-weights load.
if (loaded_weight.dtype == torch.uint8
and param.data.dtype != torch.uint8
and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
# Compressor path: unpack uint8→bf16, load into bf16 param
even_idx = (loaded_weight & 0x0F).int()
odd_idx = ((loaded_weight >> 4) & 0x0F).int()
even_vals = E2M1_LUT[even_idx]
odd_vals = E2M1_LUT[odd_idx]
out = torch.stack([even_vals, odd_vals], dim=-1)
out = out.reshape(
loaded_weight.shape[0], -1
).to(torch.bfloat16)
loaded_weight = out'''

if old_stacked_unpack in content:
    # Same substitution as before (every occurrence), but only reported and
    # written once we know the old text is really present.
    content = content.replace(old_stacked_unpack, new_stacked_unpack)
    print("Updated stacked params unpack block")
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)
    print("Written to file")
elif new_stacked_unpack in content:
    # Re-running after a successful patch: leave the file alone.
    print("Stacked params unpack block already updated; nothing to do")
else:
    # Neither text is present, so the target no longer looks like what this
    # patch expects. Fail loudly rather than print a false success message.
    raise SystemExit(
        "ERROR: stacked params unpack block not found in "
        + filepath
        + "; file left unchanged"
    )