#!/usr/bin/env python3
"""Rewrite the stacked-params NVFP4 unpack block in ``deepseek_v4.py``.

This script locates the "ModelOpt NVFP4 packed weight fix" block inside the
patched ``load_weights`` code and swaps it for a block whose comment header
describes the NVFP4 -> FP8 loading plan.

What this script actually changes: only the explanatory comments.  The
executable statements of the old and new blocks (the ``uint8`` -> ``bf16``
E2M1 unpack guarded by the dtype/shape check) are the same.

Broader plan this step belongs to (NOT implemented by this script; the
``_nvfp4_to_fp8`` helper named in the new comments must be added elsewhere):

1. Stacked params (fused_wqa_wkv): when uint8, dequantize to bf16,
   re-quantize to FP8, then load as FP8 weight + weight_scale_inv.
2. Non-stacked params (wq_b, wo_a, wo_b, gate_up_proj): same treatment.
3. Compressor fused_wkv_wgate: stays as bf16 (E2M1 unpack only).
4. Remove the separate bf16 -> uint8 handler.

Usage::

    python3 this_script.py [path/to/deepseek_v4.py]

Exit status is 0 when the block was rewritten or was already rewritten, and
1 when the expected block could not be found (the file is left untouched).
"""

import re
import sys
import textwrap
from pathlib import Path

# Default target; can be overridden by the first command-line argument.
FILEPATH = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# Both blocks are stored with their common leading indentation removed.
# Matching ignores the amount of whitespace between tokens, and the
# replacement is re-indented to whatever indentation the target file uses,
# so the exact column of the block in the target does not need to be known.
OLD_BLOCK = '''\
# ModelOpt NVFP4 packed weight fix for MergedColumnParallelLinear.
#
# modelopt exports NVFP4 packed weights as uint8 (2 values/byte
# along the column dim). But MergedColumnParallelLinear creates
# weight as bf16 (not PackedColumnParameter uint8) because
# ModelOptNvFp4Config only handles Linear, not
# MergedColumnParallelLinear.
#
# For compressor fused_wkv_wgate (quant_config=None →
# UnquantizedLinearMethod → bf16 weight):
#   Unpack uint8→bf16 using E2M1 LUT, load into bf16 param.
#
# For fused_wqa_wkv (NVFP4 quant method → uint8 weight):
#   The weight param IS uint8, so no unpacking needed.
#   Just load the packed uint8 weight directly.
#   Scales are loaded separately (no longer skipped).
if (loaded_weight.dtype == torch.uint8
        and param.data.dtype != torch.uint8
        and loaded_weight.shape[-1] * 2
        == param.data.shape[-1]):
    # Unpack NVFP4 (E2M1) → BF16
    # E2M1 LUT: 0→0, 1→0.5, 2→1, 3→1.5, 4→2, 5→3, 6→4, 7→6
    even_idx = (loaded_weight & 0x0F).int()
    odd_idx = ((loaded_weight >> 4) & 0x0F).int()
    even_vals = E2M1_LUT[even_idx]
    odd_vals = E2M1_LUT[odd_idx]
    # Interleave even and odd along the last dim
    out = torch.stack([even_vals, odd_vals], dim=-1)
    out = out.reshape(
        loaded_weight.shape[0], -1
    ).to(torch.bfloat16)
    loaded_weight = out'''

NEW_BLOCK = '''\
# ModelOpt NVFP4 weight handling for stacked params.
#
# The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
# which requires FP8 weights + weight_scale_inv. NVFP4 weights are
# incompatible. We convert NVFP4→bf16→FP8 at load time.
#
# For compressor fused_wkv_wgate (UnquantizedLinearMethod → bf16):
#   Just unpack uint8→bf16 and load into bf16 param.
#
# For fused_wqa_wkv and gate_up_proj (NVFP4 quant → uint8):
#   Collect the uint8 weight + scales, then convert to FP8
#   using the _nvfp4_to_fp8 helper after all sub-weights load.
if (loaded_weight.dtype == torch.uint8
        and param.data.dtype != torch.uint8
        and loaded_weight.shape[-1] * 2
        == param.data.shape[-1]):
    # Compressor path: unpack uint8→bf16, load into bf16 param
    even_idx = (loaded_weight & 0x0F).int()
    odd_idx = ((loaded_weight >> 4) & 0x0F).int()
    even_vals = E2M1_LUT[even_idx]
    odd_vals = E2M1_LUT[odd_idx]
    out = torch.stack([even_vals, odd_vals], dim=-1)
    out = out.reshape(
        loaded_weight.shape[0], -1
    ).to(torch.bfloat16)
    loaded_weight = out'''


def _flexible_pattern(block: str) -> "re.Pattern[str]":
    """Compile *block* into a regex that tolerates any whitespace layout.

    The block is split into whitespace-separated tokens; each token must
    appear literally and in order, separated by one or more whitespace
    characters (spaces, tabs or newlines).
    """
    return re.compile(r"\s+".join(re.escape(tok) for tok in block.split()))


_OLD_PATTERN = _flexible_pattern(OLD_BLOCK)
_NEW_PATTERN = _flexible_pattern(NEW_BLOCK)


def _reindented_new_block(match: "re.Match[str]") -> str:
    """Return NEW_BLOCK indented like the line on which *match* begins."""
    text = match.string
    line_start = text.rfind("\n", 0, match.start()) + 1
    prefix = text[line_start:match.start()]
    indent = prefix[: len(prefix) - len(prefix.lstrip())]
    indented = textwrap.indent(NEW_BLOCK, indent)
    # The first line's indentation sits before the match and is kept as-is,
    # so it must not be emitted a second time.
    return indented[len(indent):]


def patch_content(content: str) -> str:
    """Return *content* with every occurrence of OLD_BLOCK replaced.

    The input is returned unchanged when the old block is not present.
    """
    return _OLD_PATTERN.sub(_reindented_new_block, content)


def main(argv=None) -> int:
    """Patch the target file in place and return a process exit status.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  The optional first item overrides FILEPATH.

    Returns:
        0 if the file was rewritten or already contains the new block,
        1 if the expected old block was not found.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    target = Path(args[0]) if args else Path(FILEPATH)

    # The blocks contain non-ASCII arrows, so never rely on the locale's
    # default encoding.
    content = target.read_text(encoding="utf-8")
    patched = patch_content(content)

    if patched == content:
        if _NEW_PATTERN.search(content):
            print("Stacked params unpack block already updated; nothing to do")
            return 0
        print(
            "ERROR: stacked params unpack block not found in "
            + str(target)
            + "; file left unchanged",
            file=sys.stderr,
        )
        return 1

    print("Updated stacked params unpack block")
    target.write_text(patched, encoding="utf-8")
    print("Written to file")
    return 0


if __name__ == "__main__":
    sys.exit(main())