deepseek-v4-quant/tmp/fix7_stacked.py

#!/usr/bin/env python3
"""
Update the load_weights method to use NVFP4→FP8 conversion for attention layers.

Key changes:
1. Stacked params (fused_wqa_wkv): when uint8, dequantize to bf16, re-quantize to FP8,
   then load as FP8 weight + weight_scale_inv
2. Non-stacked params (wq_b, wo_a, wo_b, gate_up_proj): same treatment
3. Compressor fused_wkv_wgate: stays as bf16 (E2M1 unpack only)
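4. Remove the separate bf16→uint8 handler (no longer needed since we go to FP8)

Note: this script applies only one step of the above. It rewrites the comments
in and around the uint8→bf16 unpack branch of the stacked-params loading code;
the executable statements in that branch are left unchanged.
"""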

# Path of the file this script patches in place.
filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# Load the whole file as text so the replacement can be done with a single
# exact-match string substitution.
with open(filepath) as source_file:
    content = source_file.read()

# ============================================================
# Replace the stacked params loading block
# The current code unpacks uint8→bf16 for MergedColumnParallelLinear
# We need to instead convert NVFP4→FP8 for attention/shared expert
# and unpack→bf16 for compressor
# ============================================================

# Search text: the comment-plus-code block expected in the target file.
# It is passed to content.replace() as the text to find, so every character
# of the literal (including the 16-space indent at the start of each line and
# the "→" characters) has to match the target file exactly.
old_stacked_unpack = '''                # ModelOpt NVFP4 packed weight fix for MergedColumnParallelLinear.
                #
                # modelopt exports NVFP4 packed weights as uint8 (2 values/byte
                # along the column dim). But MergedColumnParallelLinear creates
                # weight as bf16 (not PackedColumnParameter uint8) because
                # ModelOptNvFp4Config only handles Linear, not
                # MergedColumnParallelLinear.
                #
                # For compressor fused_wkv_wgate (quant_config=None →
                # UnquantizedLinearMethod → bf16 weight):
                #   Unpack uint8→bf16 using E2M1 LUT, load into bf16 param.
                #
                # For fused_wqa_wkv (NVFP4 quant method → uint8 weight):
                #   The weight param IS uint8, so no unpacking needed.
                #   Just load the packed uint8 weight directly.
                #   Scales are loaded separately (no longer skipped).
                if (loaded_weight.dtype == torch.uint8
                        and param.data.dtype != torch.uint8
                        and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
                    # Unpack NVFP4 (E2M1) → BF16
                    # E2M1 LUT: 0→0, 1→0.5, 2→1, 3→1.5, 4→2, 5→3, 6→4, 7→6
                    even_idx = (loaded_weight & 0x0F).int()
                    odd_idx = ((loaded_weight >> 4) & 0x0F).int()
                    even_vals = E2M1_LUT[even_idx]
                    odd_vals = E2M1_LUT[odd_idx]
                    # Interleave even and odd along the last dim
                    out = torch.stack([even_vals, odd_vals], dim=-1)
                    out = out.reshape(
                        loaded_weight.shape[0], -1
                    ).to(torch.bfloat16)
                    loaded_weight = out'''

# Replacement text substituted for old_stacked_unpack by content.replace().
# Compared with the old block only the comment lines differ: the statements
# from the `if` condition through `loaded_weight = out` are the same, with the
# inline LUT/interleave comments replaced by a single "Compressor path" note.
new_stacked_unpack = '''                # ModelOpt NVFP4 weight handling for stacked params.
                #
                # The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
                # which requires FP8 weights + weight_scale_inv. NVFP4 weights are
                # incompatible. We convert NVFP4→bf16→FP8 at load time.
                #
                # For compressor fused_wkv_wgate (UnquantizedLinearMethod → bf16):
                #   Just unpack uint8→bf16 and load into bf16 param.
                #
                # For fused_wqa_wkv and gate_up_proj (NVFP4 quant → uint8):
                #   Collect the uint8 weight + scales, then convert to FP8
                #   using the _nvfp4_to_fp8 helper after all sub-weights load.
                if (loaded_weight.dtype == torch.uint8
                        and param.data.dtype != torch.uint8
                        and loaded_weight.shape[-1] * 2 == param.data.shape[-1]):
                    # Compressor path: unpack uint8→bf16, load into bf16 param
                    even_idx = (loaded_weight & 0x0F).int()
                    odd_idx = ((loaded_weight >> 4) & 0x0F).int()
                    even_vals = E2M1_LUT[even_idx]
                    odd_vals = E2M1_LUT[odd_idx]
                    out = torch.stack([even_vals, odd_vals], dim=-1)
                    out = out.reshape(
                        loaded_weight.shape[0], -1
                    ).to(torch.bfloat16)
                    loaded_weight = out'''

# Apply the replacement only when the expected block is actually present.
# str.replace() returns its input unchanged when the search text is missing,
# so without this check the script would report success (and rewrite the
# file) without having patched anything.
if old_stacked_unpack in content:
    content = content.replace(old_stacked_unpack, new_stacked_unpack)
    print("Updated stacked params unpack block")

    with open(filepath, 'w') as f:
        f.write(content)
    print("Written to file")
elif new_stacked_unpack in content:
    # A previous run already applied this patch; re-running is a no-op
    # rather than an error, and the file is not rewritten.
    print("Stacked params unpack block already updated; nothing to do")
else:
    # Neither the old nor the new text is present: the target file has
    # drifted from what this script expects. Leave it untouched and exit
    # with a non-zero status (SystemExit with a string prints it to stderr).
    raise SystemExit(
        f"ERROR: stacked params unpack block not found in {filepath}; "
        "file left unchanged"
    )