Server running on B200 port 8000 with full NVFP4→vLLM bridge. All critical bugs fixed: DeepGEMM scale format, compressor shapes, block scale values.
89 lines
4.4 KiB
Python
89 lines
4.4 KiB
Python
#!/usr/bin/env python3
"""
Patch the stacked-params NVFP4 unpack block of ``load_weights`` in the
DeepSeek-V4 vLLM model file.

What this script does: it looks for one exact block of text in ``filepath``
(``old_stacked_unpack``) and swaps it for ``new_stacked_unpack``.  The
executable statements of the two blocks are the same (uint8 -> bf16 unpack
through ``E2M1_LUT``); only the explanatory comments differ.  No FP8
conversion code is inserted by this script.

Overall plan these comments belong to (from the original notes; the other
steps are NOT implemented here):
1. Stacked params (fused_wqa_wkv): when uint8, dequantize to bf16, re-quantize
   to FP8, then load as FP8 weight + weight_scale_inv
2. Non-stacked params (wq_b, wo_a, wo_b, gate_up_proj): same treatment
3. Compressor fused_wkv_wgate: stays as bf16 (E2M1 unpack only)
4. Remove the separate bf16→uint8 handler (no longer needed since we go to FP8)
"""

filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# ============================================================
# Replace the stacked params loading block
# The current code unpacks uint8→bf16 for MergedColumnParallelLinear
# We need to instead convert NVFP4→FP8 for attention/shared expert
# and unpack→bf16 for compressor
# ============================================================
#
# NOTE(review): both literals below must match the target file character for
# character, including leading whitespace.  The copy of this script that was
# reviewed had its indentation stripped, so the indentation here (16-space
# base, 8-space continuation) is a reconstruction -- confirm it against
# deepseek_v4.py.  A mismatch is reported as an error instead of being
# silently ignored.

old_stacked_unpack = '''                # ModelOpt NVFP4 packed weight fix for MergedColumnParallelLinear.
                #
                # modelopt exports NVFP4 packed weights as uint8 (2 values/byte
                # along the column dim). But MergedColumnParallelLinear creates
                # weight as bf16 (not PackedColumnParameter uint8) because
                # ModelOptNvFp4Config only handles Linear, not
                # MergedColumnParallelLinear.
                #
                # For compressor fused_wkv_wgate (quant_config=None →
                # UnquantizedLinearMethod → bf16 weight):
                # Unpack uint8→bf16 using E2M1 LUT, load into bf16 param.
                #
                # For fused_wqa_wkv (NVFP4 quant method → uint8 weight):
                # The weight param IS uint8, so no unpacking needed.
                # Just load the packed uint8 weight directly.
                # Scales are loaded separately (no longer skipped).
                if (loaded_weight.dtype == torch.uint8
                        and param.data.dtype != torch.uint8
                        and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
                    # Unpack NVFP4 (E2M1) → BF16
                    # E2M1 LUT: 0→0, 1→0.5, 2→1, 3→1.5, 4→2, 5→3, 6→4, 7→6
                    even_idx = (loaded_weight & 0x0F).int()
                    odd_idx = ((loaded_weight >> 4) & 0x0F).int()
                    even_vals = E2M1_LUT[even_idx]
                    odd_vals = E2M1_LUT[odd_idx]
                    # Interleave even and odd along the last dim
                    out = torch.stack([even_vals, odd_vals], dim=-1)
                    out = out.reshape(
                        loaded_weight.shape[0], -1
                    ).to(torch.bfloat16)
                    loaded_weight = out'''

new_stacked_unpack = '''                # ModelOpt NVFP4 weight handling for stacked params.
                #
                # The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
                # which requires FP8 weights + weight_scale_inv. NVFP4 weights are
                # incompatible. We convert NVFP4→bf16→FP8 at load time.
                #
                # For compressor fused_wkv_wgate (UnquantizedLinearMethod → bf16):
                # Just unpack uint8→bf16 and load into bf16 param.
                #
                # For fused_wqa_wkv and gate_up_proj (NVFP4 quant → uint8):
                # Collect the uint8 weight + scales, then convert to FP8
                # using the _nvfp4_to_fp8 helper after all sub-weights load.
                if (loaded_weight.dtype == torch.uint8
                        and param.data.dtype != torch.uint8
                        and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
                    # Compressor path: unpack uint8→bf16, load into bf16 param
                    even_idx = (loaded_weight & 0x0F).int()
                    odd_idx = ((loaded_weight >> 4) & 0x0F).int()
                    even_vals = E2M1_LUT[even_idx]
                    odd_vals = E2M1_LUT[odd_idx]
                    out = torch.stack([even_vals, odd_vals], dim=-1)
                    out = out.reshape(
                        loaded_weight.shape[0], -1
                    ).to(torch.bfloat16)
                    loaded_weight = out'''


def patch_source(content: str) -> str:
    """Return *content* with the old stacked-unpack block replaced by the new one.

    Args:
        content: Full text of the file to patch.

    Returns:
        The patched text.  If the new block is already present (and the old
        one is not), *content* is returned unchanged so the script can be
        re-run safely.

    Raises:
        ValueError: Neither the old nor the new block occurs in *content*.
            ``str.replace`` would silently do nothing in that case, so the
            mismatch is surfaced instead.
    """
    if old_stacked_unpack in content:
        return content.replace(old_stacked_unpack, new_stacked_unpack)
    if new_stacked_unpack in content:
        return content
    raise ValueError(
        "stacked params unpack block not found; the target file differs from "
        "the expected text (check indentation and comment wording)"
    )


def main() -> None:
    """Read ``filepath``, apply the replacement, and write the result back.

    The file is rewritten only when the text actually changed.  Exits with a
    non-zero status (via ``SystemExit``) when the expected block is missing.
    """
    # Both literals contain non-ASCII arrows, so pin the encoding instead of
    # relying on the locale default.
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    try:
        patched = patch_source(content)
    except ValueError as err:
        raise SystemExit(f"ERROR: {filepath}: {err}") from None

    if patched == content:
        print("Stacked params unpack block already updated; nothing to write")
        return

    print("Updated stacked params unpack block")

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(patched)

    print("Written to file")


if __name__ == "__main__":
    main()