# deepseek-v4-quant/tmp/fix6_nvfp4_to_fp8.py

#!/usr/bin/env python3
"""
Insert an NVFP4→bf16→FP8 re-quantization helper (`_nvfp4_to_fp8`) at the
top of `load_weights` in the patched deepseek_v4.py. This script only adds
the helper definition; it does not add any call sites.

The vLLM DeepSeekV4 attention forward code uses deepseek_v4_fp8_einsum
which requires FP8 weights + weight_scale_inv. NVFP4 weights (uint8 packed
FP4 + per-block scales + per-tensor scales) are incompatible with this kernel.

Solution: At load time, dequantize all NVFP4 attention weights to bf16,
then re-quantize to FP8. Store the FP8 weights + weight_scale_inv.
The existing FP8 attention forward code then works without modification.

For compressor fused_wkv_wgate: stays bf16 (UnquantizedLinearMethod).
For MoE experts: handled by ModelOptNvFp4FusedMoE natively.
For shared experts gate_up_proj: also needs FP8 conversion.
"""

filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# Text that only exists once the helper has been inserted. Used to make the
# script idempotent: the replacement text starts with the anchor text, so a
# blind second run would otherwise insert the helper a second time.
_HELPER_MARKER = "def _nvfp4_to_fp8("

# ============================================================
# Helper function: add the NVFP4→FP8 conversion utility
# at the top of the load_weights method
# ============================================================

# Anchor: the exact opening of load_weights in the target file.
old_load_weights_start = '''    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        # Define E2M1 FP4 → BF16 lookup table for unpacking
        E2M1_LUT = torch.tensor(
            [0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
        )'''

# Replacement: the anchor, unchanged, followed by the helper definition.
new_load_weights_start = old_load_weights_start + '''

        # ── NVFP4 → FP8 re-quantization helper ──────────────────
        # The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
        # which requires FP8 weights + weight_scale_inv. ModelOpt NVFP4
        # stores uint8 packed FP4 + per-block/per-tensor scales.
        # We dequantize NVFP4→bf16, then re-quantize to FP8 at load time.
        def _nvfp4_to_fp8(w_uint8, w_scale, w_scale_2, w_input_scale):
            """Convert NVFP4 weight + scales to FP8 weight + weight_scale_inv.

            Returns (w_fp8, weight_scale_inv) where weight_scale_inv is a
            float32 per-tensor scale such that w ~= w_fp8 * weight_scale_inv.

            w_input_scale is accepted so call sites can pass every checkpoint
            tensor, but it is not applied: in the ModelOpt NVFP4 layout it is
            the activation quantization scale, not part of the weight value.
            """
            fp8_max = torch.finfo(torch.float8_e4m3fn).max
            lut = E2M1_LUT.to(device=w_uint8.device, dtype=torch.float32)

            def _decode(nibble):
                # Bits 0-2 select the magnitude from the 8-entry table and
                # bit 3 is the sign. Indexing the table with the raw 4-bit
                # code would go out of range for every negative value.
                magnitude = lut[(nibble & 0x07).long()]
                sign = 1.0 - 2.0 * ((nibble >> 3) & 0x01).to(torch.float32)
                return magnitude * sign

            # Each byte packs two FP4 codes: low nibble first, high second.
            low = _decode(w_uint8 & 0x0F)
            high = _decode(w_uint8 >> 4)
            w = torch.stack([low, high], dim=-1)
            w = w.reshape(w_uint8.shape[0], -1)  # [out, in_dim]

            # Dequantize: value = fp4 * block_scale * global_scale
            if w_scale.dim() == 2:
                out_dim, in_dim = w.shape
                n_blocks = w_scale.shape[-1]
                if n_blocks == 0 or in_dim % n_blocks != 0:
                    raise ValueError(
                        f"weight_scale with {n_blocks} blocks per row does "
                        f"not tile in_dim={in_dim}"
                    )
                # View the row as [blocks, block_size] so that each block
                # scale broadcasts over the elements of its own block.
                block_scale = w_scale.to(torch.float32).unsqueeze(-1)
                w = w.reshape(out_dim, n_blocks, -1) * block_scale
                w = w.reshape(out_dim, in_dim)
            w = w * float(w_scale_2)
            w_bf16 = w.to(torch.bfloat16)

            # Re-quantize bf16 → FP8 e4m3 with a single per-tensor scale.
            # The scale math runs in float32; amax is floored so an all-zero
            # tensor does not divide by zero, and values are saturated to
            # the FP8 range before the cast.
            w_f32 = w_bf16.to(torch.float32)
            w_amax = w_f32.abs().amax().clamp(min=1e-12)
            fp8_scale = w_amax / fp8_max
            w_fp8 = (w_f32 / fp8_scale).clamp(-fp8_max, fp8_max)
            w_fp8 = w_fp8.to(torch.float8_e4m3fn)
            weight_scale_inv = fp8_scale.to(torch.float32)
            return w_fp8, weight_scale_inv
        # ── End helper ──────────────────────────────────────────
        '''


def apply_patch(source: str) -> str:
    """Return *source* with the NVFP4→FP8 helper inserted into load_weights.

    The text is returned unchanged when the helper is already present, so
    running the script twice is harmless.

    Raises:
        ValueError: if the expected opening of ``load_weights`` is not found,
            which means the target file has drifted from what this patch
            was written against.
    """
    if _HELPER_MARKER in source:
        return source
    if old_load_weights_start not in source:
        raise ValueError(
            "load_weights anchor not found; target file does not match "
            "the text this patch expects"
        )
    return source.replace(old_load_weights_start, new_load_weights_start)


def main() -> None:
    """Patch the file at ``filepath`` in place and report what happened."""
    # The patch text contains non-ASCII characters, so the encoding is
    # pinned instead of relying on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    patched = apply_patch(content)
    if patched == content:
        print("NVFP4→FP8 helper already present; nothing to do")
        return
    print("Added NVFP4→FP8 helper function")

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(patched)
    print("Written to file")


if __name__ == "__main__":
    main()