#!/usr/bin/env python3
"""Fix the FP8 conversion to use a simple no-op quant method for attention layers.

The target file assigns ``Fp8LinearMethod(Fp8MMQuantMethod())`` to
``mod.quant_method`` in more than one place.  This script rewrites every such
site to assign ``UnquantizedLinearMethod()`` instead.

Matching is indentation-agnostic: the statements of the old block are located
regardless of how deeply they are nested, and the replacement is emitted at
the indentation of the site it replaces.  The file is only rewritten, and
success is only reported, when at least one site was actually changed.
"""

import re
import sys

TARGET_PATH = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# Statements of the block to remove, one entry per source line, without
# indentation (the whitespace between them is matched loosely).
OLD_BLOCK_LINES = (
    "# Switch quant method to FP8 linear",
    "from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod",
    "from vllm.model_executor.layers.quantization.utils.quant_utils import (",
    "Fp8MMQuantMethod,",
    ")",
    "mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())",
)

# Lines written in place of the old block; each is prefixed with the
# indentation captured from the first line of the matched site.
NEW_BLOCK_LINES = (
    "# Switch quant method to a no-op. The attention forward uses",
    "# deepseek_v4_fp8_einsum directly (not the quant method), so the",
    "# quant method is irrelevant. We just need process_weights_after_loading",
    "# to not crash. Using UnquantizedLinearMethod as a safe no-op.",
    "from vllm.model_executor.layers.linear import UnquantizedLinearMethod",
    "mod.quant_method = UnquantizedLinearMethod()",
)

# ``^`` + MULTILINE anchors on the start of the comment line so the leading
# indentation can be captured; ``\s+`` between statements tolerates any
# indentation, line-ending style, or blank lines inside the block.
OLD_BLOCK_RE = re.compile(
    r"^(?P<indent>[ \t]*)"
    + r"\s+".join(re.escape(line) for line in OLD_BLOCK_LINES),
    re.MULTILINE,
)


def swap_quant_method(source):
    """Replace every FP8 quant-method block in *source* with the no-op one.

    Args:
        source: Full text of the file being patched.

    Returns:
        A ``(patched_text, count)`` tuple, where ``count`` is the number of
        sites that were rewritten.  When ``count`` is 0 the text is returned
        unchanged.
    """

    def _render(match):
        # Re-emit the replacement at the indentation of the matched site.
        indent = match.group("indent")
        return "\n".join(indent + line for line in NEW_BLOCK_LINES)

    return OLD_BLOCK_RE.subn(_render, source)


def main(path=TARGET_PATH):
    """Patch the file at *path* in place and report what happened.

    Returns:
        The process exit status.  It is 0 even when nothing matched (the
        original script never failed in that case); a warning is printed to
        stderr instead and the file is left untouched.
    """
    with open(path, "r", encoding="utf-8") as handle:
        original = handle.read()

    patched, count = swap_quant_method(original)

    if count == 0:
        # Do not rewrite the file or claim success when nothing was changed.
        print(
            f"warning: no Fp8LinearMethod quant-method block found in {path}; "
            "file left unchanged",
            file=sys.stderr,
        )
        return 0

    with open(path, "w", encoding="utf-8") as handle:
        handle.write(patched)

    print("Replaced Fp8LinearMethod with UnquantizedLinearMethod for attention")
    print(f"  sites rewritten: {count}")
    return 0


if __name__ == "__main__":
    sys.exit(main())