tmp/fix8_final.py

#!/usr/bin/env python3
"""Add NVFP4->FP8 conversion methods to deepseek_v4.py.

The target file is patched by literal string replacement in two places:

1. ``_convert_nvfp4_attention_to_fp8`` and ``_convert_nvfp4_module_to_fp8``
   are inserted immediately before the ``DeepseekV4ForCausalLM`` class
   header (i.e. appended to the class defined just above it).
2. A call to ``self.model._convert_nvfp4_attention_to_fp8()`` is inserted
   right after ``self.model.finalize_mega_moe_weights()``.

If either anchor text is missing the script raises ``RuntimeError`` and
leaves the target file untouched, instead of silently rewriting it
unchanged and reporting success.
"""

filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# 1. Add conversion methods to DeepseekV4Model
old_finalize = '        return loaded_params\n\n\nclass DeepseekV4ForCausalLM(nn.Module):'

# NOTE(review): the emitted dequantization multiplies the weights by
# `input_scale`, which is normally an activation scale rather than a weight
# scale -- confirm against the checkpoint format before relying on it.
new_finalize = '''        return loaded_params

    def _convert_nvfp4_attention_to_fp8(self):
        """Convert NVFP4 attention weights to FP8 format.
        
        The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum
        which requires FP8 weights + weight_scale_inv. NVFP4 weights are
        incompatible. We dequantize NVFP4->bf16, then re-quantize to FP8.
        """
        # E2M1 codes are 4 bits wide: bit 3 is the sign and bits 0-2 pick
        # the magnitude, so the lookup table needs all 16 entries.
        E2M1_LUT = torch.tensor(
            [0, 0.5, 1, 1.5, 2, 3, 4, 6,
             -0.0, -0.5, -1, -1.5, -2, -3, -4, -6],
            dtype=torch.bfloat16,
        )
        FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
        
        attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}
        shared_expert_names = {"gate_up_proj"}
        
        converted = 0
        for layer_idx, layer in enumerate(self.layers):
            attn = layer.attn
            for proj_name in attn_proj_names:
                if not hasattr(attn, proj_name):
                    continue
                mod = getattr(attn, proj_name)
                if not hasattr(mod, 'weight') or mod.weight.dtype != torch.uint8:
                    continue
                self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
                converted += 1
            
            ffn = layer.ffn
            if hasattr(ffn, 'shared_experts'):
                for proj_name in shared_expert_names:
                    if not hasattr(ffn.shared_experts, proj_name):
                        continue
                    mod = getattr(ffn.shared_experts, proj_name)
                    if not hasattr(mod, 'weight') or mod.weight.dtype != torch.uint8:
                        continue
                    self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
                    converted += 1
        
        if converted > 0:
            logger.info_once(
                "Converted %d NVFP4 attention/shared-expert layers to FP8",
                converted,
            )
    
    def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):
        """Convert a single NVFP4 Linear module to FP8 format."""
        w_uint8 = mod.weight.data
        device = w_uint8.device
        
        # Unpack uint8 -> E2M1 FP4 -> bf16
        even_idx = (w_uint8 & 0x0F).int()
        odd_idx = ((w_uint8 >> 4) & 0x0F).int()
        even_vals = e2m1_lut.to(device)[even_idx]
        odd_vals = e2m1_lut.to(device)[odd_idx]
        w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)
        w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)
        
        # Dequantize: bf16 = fp4 * block_scale * global_scale * input_scale
        if hasattr(mod, 'weight_scale') and hasattr(mod, 'weight_scale_2'):
            block_scale = mod.weight_scale.data.to(torch.float32)
            if block_scale.dim() == 2 and w_bf16.dim() == 2:
                block_size = w_bf16.shape[1] // block_scale.shape[1]
                block_scale_expanded = block_scale.unsqueeze(-1).expand(
                    -1, -1, block_size
                ).reshape(w_bf16.shape)
            else:
                block_scale_expanded = block_scale
            global_scale = mod.weight_scale_2.data.max().item()
            input_scale = mod.input_scale.data.max().item() if hasattr(mod, 'input_scale') else 1.0
            w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
            w_dequant = w_dequant.to(torch.bfloat16)
        else:
            w_dequant = w_bf16
        
        # Re-quantize bf16 -> FP8 e4m3
        w_amax = w_dequant.abs().amax()
        if w_amax == 0:
            w_amax = torch.tensor(1.0, device=device)
        fp8_scale = w_amax / fp8_max
        w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)
        weight_scale_inv = fp8_scale.to(torch.float32)
        
        # Replace weight param
        mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
        mod.weight_scale_inv = torch.nn.Parameter(
            weight_scale_inv.reshape(1), requires_grad=False
        )
        
        # Switch quant method to FP8 linear
        from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
        from vllm.model_executor.layers.quantization.utils.quant_utils import (
            Fp8MMQuantMethod,
        )
        mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())
        
        # Clean up NVFP4 params
        for attr in ('weight_scale', 'weight_scale_2', 'input_scale'):
            if hasattr(mod, attr):
                delattr(mod, attr)


class DeepseekV4ForCausalLM(nn.Module):'''

# 2. Call it from DeepseekV4ForCausalLM.load_weights
old_causal = '''        self.model.finalize_mega_moe_weights()
        return loaded_params'''

new_causal = '''        self.model.finalize_mega_moe_weights()
        # Convert NVFP4 attention weights to FP8 for compatibility with
        # the deepseek_v4_fp8_einsum kernel used in the attention forward
        self.model._convert_nvfp4_attention_to_fp8()
        return loaded_params'''


def _replace_required(text: str, old: str, new: str, label: str) -> str:
    """Replace every occurrence of *old* in *text*, failing if there is none.

    ``str.replace`` is a silent no-op when the anchor is absent, which would
    let the script report success without having changed anything.

    Raises:
        RuntimeError: if *old* does not occur in *text*.
    """
    if old not in text:
        raise RuntimeError(
            f"Anchor for {label} not found; the target may already be "
            "patched or its layout has changed"
        )
    return text.replace(old, new)


def apply_patches(source: str) -> str:
    """Return *source* with both NVFP4->FP8 edits applied.

    Args:
        source: full text of the ``deepseek_v4.py`` file to patch.

    Returns:
        The patched text.

    Raises:
        RuntimeError: if either anchor string is missing from *source*.
    """
    patched = _replace_required(
        source, old_finalize, new_finalize, "the conversion methods"
    )
    return _replace_required(
        patched, old_causal, new_causal, "the load_weights call"
    )


def main(path: str = filepath) -> None:
    """Patch the file at *path* in place and report success.

    Both replacements are computed before anything is written, so a missing
    anchor never leaves the file half-patched.

    Raises:
        RuntimeError: if either anchor string is missing from the file.
    """
    with open(path, 'r', encoding='utf-8') as f:
        original = f.read()

    patched = apply_patches(original)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(patched)

    print("Applied NVFP4->FP8 conversion methods")


if __name__ == "__main__":
    main()
Update MEMORY.md and memory files with vLLM NVFP4 serving progress. Server running on B200 port 8000 with full NVFP4→vLLM bridge. All critical bugs fixed: DeepGEMM scale format, compressor shapes, block scale values. 2026-05-11 02:02:14 +00:00			`#!/usr/bin/env python3`
			`"""Add NVFP4->FP8 conversion methods to deepseek_v4.py"""`

			`filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"`

			`with open(filepath, 'r') as f:`
			`c = f.read()`

			`# 1. Add conversion methods to DeepseekV4Model`
			`old_finalize = ' return loaded_params\n\n\nclass DeepseekV4ForCausalLM(nn.Module):'`

			`new_finalize = ''' return loaded_params`

			`def _convert_nvfp4_attention_to_fp8(self):`
			`"""Convert NVFP4 attention weights to FP8 format.`

			`The vLLM DeepSeekV4 attention forward uses deepseek_v4_fp8_einsum`
			`which requires FP8 weights + weight_scale_inv. NVFP4 weights are`
			`incompatible. We dequantize NVFP4->bf16, then re-quantize to FP8.`
			`"""`
			`E2M1_LUT = torch.tensor(`
			`[0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16`
			`)`
			`FP8_MAX = torch.finfo(torch.float8_e4m3fn).max`

			`attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}`
			`shared_expert_names = {"gate_up_proj"}`

			`converted = 0`
			`for layer_idx, layer in enumerate(self.layers):`
			`attn = layer.attn`
			`for proj_name in attn_proj_names:`
			`if not hasattr(attn, proj_name):`
			`continue`
			`mod = getattr(attn, proj_name)`
			`if not hasattr(mod, 'weight') or mod.weight.dtype != torch.uint8:`
			`continue`
			`self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)`
			`converted += 1`

			`ffn = layer.ffn`
			`if hasattr(ffn, 'shared_experts'):`
			`for proj_name in shared_expert_names:`
			`if not hasattr(ffn.shared_experts, proj_name):`
			`continue`
			`mod = getattr(ffn.shared_experts, proj_name)`
			`if not hasattr(mod, 'weight') or mod.weight.dtype != torch.uint8:`
			`continue`
			`self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)`
			`converted += 1`

			`if converted > 0:`
			`logger.info_once(`
			`"Converted %d NVFP4 attention/shared-expert layers to FP8",`
			`converted,`
			`)`

			`def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):`
			`"""Convert a single NVFP4 Linear module to FP8 format."""`
			`w_uint8 = mod.weight.data`
			`device = w_uint8.device`

			`# Unpack uint8 -> E2M1 FP4 -> bf16`
			`even_idx = (w_uint8 & 0x0F).int()`
			`odd_idx = ((w_uint8 >> 4) & 0x0F).int()`
			`even_vals = e2m1_lut.to(device)[even_idx]`
			`odd_vals = e2m1_lut.to(device)[odd_idx]`
			`w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)`
			`w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)`

			`# Dequantize: bf16 = fp4 * block_scale * global_scale * input_scale`
			`if hasattr(mod, 'weight_scale') and hasattr(mod, 'weight_scale_2'):`
			`block_scale = mod.weight_scale.data.to(torch.float32)`
			`if block_scale.dim() == 2 and w_bf16.dim() == 2:`
			`block_size = w_bf16.shape[1] // block_scale.shape[1]`
			`block_scale_expanded = block_scale.unsqueeze(-1).expand(`
			`-1, -1, block_size`
			`).reshape(w_bf16.shape)`
			`else:`
			`block_scale_expanded = block_scale`
			`global_scale = mod.weight_scale_2.data.max().item()`
			`input_scale = mod.input_scale.data.max().item() if hasattr(mod, 'input_scale') else 1.0`
			`w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale`
			`w_dequant = w_dequant.to(torch.bfloat16)`
			`else:`
			`w_dequant = w_bf16`

			`# Re-quantize bf16 -> FP8 e4m3`
			`w_amax = w_dequant.abs().amax()`
			`if w_amax == 0:`
			`w_amax = torch.tensor(1.0, device=device)`
			`fp8_scale = w_amax / fp8_max`
			`w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)`
			`weight_scale_inv = fp8_scale.to(torch.float32)`

			`# Replace weight param`
			`mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)`
			`mod.weight_scale_inv = torch.nn.Parameter(`
			`weight_scale_inv.reshape(1), requires_grad=False`
			`)`

			`# Switch quant method to FP8 linear`
			`from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod`
			`from vllm.model_executor.layers.quantization.utils.quant_utils import (`
			`Fp8MMQuantMethod,`
			`)`
			`mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())`

			`# Clean up NVFP4 params`
			`for attr in ('weight_scale', 'weight_scale_2', 'input_scale'):`
			`if hasattr(mod, attr):`
			`delattr(mod, attr)`


			`class DeepseekV4ForCausalLM(nn.Module):'''`

			`c = c.replace(old_finalize, new_finalize)`

			`# 2. Call it from DeepseekV4ForCausalLM.load_weights`
			`old_causal = ''' self.model.finalize_mega_moe_weights()`
			`return loaded_params'''`

			`new_causal = ''' self.model.finalize_mega_moe_weights()`
			`# Convert NVFP4 attention weights to FP8 for compatibility with`
			`# the deepseek_v4_fp8_einsum kernel used in the attention forward`
			`self.model._convert_nvfp4_attention_to_fp8()`
			`return loaded_params'''`

			`c = c.replace(old_causal, new_causal)`

			`with open(filepath, 'w') as f:`
			`f.write(c)`

			`print("Applied NVFP4->FP8 conversion methods")`