tmp/fix10_quant_method.py

#!/usr/bin/env python3
"""Swap the FP8 quant method for UnquantizedLinearMethod in deepseek_v4.py.

The script edits the patched model file in place. It looks for two exact
text snippets that assign ``Fp8LinearMethod(Fp8MMQuantMethod())`` to
``mod.quant_method`` and replaces each with an assignment of
``UnquantizedLinearMethod()``.

Matching is exact (including leading whitespace), so a snippet that has
already been replaced, or whose indentation differs, is reported as missing
instead of being silently skipped. The file is only rewritten, and the
success message only printed, when at least one snippet was replaced.
"""

filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# Read and write with an explicit encoding so the result does not depend on
# the locale of the machine the script runs on.
with open(filepath, 'r', encoding="utf-8") as f:
    c = f.read()

# Replace the Fp8LinearMethod/Fp8MMQuantMethod imports and usage with
# UnquantizedLinearMethod, which serves as a stand-in quant method.

# Snippet inside _convert_nvfp4_module_to_fp8 (8-space indentation).
old_fp8_convert = '''        # Switch quant method to FP8 linear
        from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
        from vllm.model_executor.layers.quantization.utils.quant_utils import (
            Fp8MMQuantMethod,
        )
        mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())'''

new_fp8_convert = '''        # Switch quant method to a no-op. The attention forward uses
        # deepseek_v4_fp8_einsum directly (not the quant method), so the
        # quant method is irrelevant. We just need process_weights_after_loading
        # to not crash. Using UnquantizedLinearMethod as a safe no-op.
        from vllm.model_executor.layers.linear import UnquantizedLinearMethod
        mod.quant_method = UnquantizedLinearMethod()'''

# Snippet inside the bf16->uint8 handler, o_a_proj case (24-space indentation).
old_oa_fp8 = '''                        # Switch quant method to FP8 linear
                        from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
                        from vllm.model_executor.layers.quantization.utils.quant_utils import (
                            Fp8MMQuantMethod,
                        )
                        mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())'''

new_oa_fp8 = '''                        # Switch quant method to no-op (attention forward bypasses it)
                        from vllm.model_executor.layers.linear import UnquantizedLinearMethod
                        mod.quant_method = UnquantizedLinearMethod()'''

# (label, text to find, replacement) for every snippet this script rewrites.
replacements = (
    ("_convert_nvfp4_module_to_fp8", old_fp8_convert, new_fp8_convert),
    ("bf16->uint8 handler (o_a_proj)", old_oa_fp8, new_oa_fp8),
)

replaced_total = 0
for label, old_snippet, new_snippet in replacements:
    hits = c.count(old_snippet)
    if hits == 0:
        # str.replace would silently do nothing here; make the miss visible.
        print(f"WARNING: snippet for {label} not found; nothing replaced there")
        continue
    c = c.replace(old_snippet, new_snippet)
    replaced_total += hits

if replaced_total:
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(c)
    print("Replaced Fp8LinearMethod with UnquantizedLinearMethod for attention")
else:
    # Nothing matched: leave the file untouched and do not claim success.
    print("No Fp8LinearMethod snippets found; file left unchanged")
Update MEMORY.md and memory files with vLLM NVFP4 serving progress.
Server running on B200 port 8000 with full NVFP4→vLLM bridge. All critical bugs fixed: DeepGEMM scale format, compressor shapes, block scale values.
2026-05-11 02:02:14 +00:00
			`#!/usr/bin/env python3`
			`"""Fix the FP8 conversion to use a simple no-op quant method for attention layers."""`

			`filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"`

			`with open(filepath, 'r') as f:`
			`c = f.read()`

			`# Replace all instances of Fp8LinearMethod/Fp8MMQuantMethod imports and usage`
			`# with a simpler approach: just set quant_method to None and handle it`

			`# In _convert_nvfp4_module_to_fp8`
			`old_fp8_convert = ''' # Switch quant method to FP8 linear`
			`from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod`
			`from vllm.model_executor.layers.quantization.utils.quant_utils import (`
			`Fp8MMQuantMethod,`
			`)`
			`mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())'''`

			`new_fp8_convert = ''' # Switch quant method to a no-op. The attention forward uses`
			`# deepseek_v4_fp8_einsum directly (not the quant method), so the`
			`# quant method is irrelevant. We just need process_weights_after_loading`
			`# to not crash. Using UnquantizedLinearMethod as a safe no-op.`
			`from vllm.model_executor.layers.linear import UnquantizedLinearMethod`
			`mod.quant_method = UnquantizedLinearMethod()'''`

			`c = c.replace(old_fp8_convert, new_fp8_convert)`

			`# In the bf16->uint8 handler (o_a_proj case)`
			`old_oa_fp8 = ''' # Switch quant method to FP8 linear`
			`from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod`
			`from vllm.model_executor.layers.quantization.utils.quant_utils import (`
			`Fp8MMQuantMethod,`
			`)`
			`mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())'''`

			`new_oa_fp8 = ''' # Switch quant method to no-op (attention forward bypasses it)`
			`from vllm.model_executor.layers.linear import UnquantizedLinearMethod`
			`mod.quant_method = UnquantizedLinearMethod()'''`

			`c = c.replace(old_oa_fp8, new_oa_fp8)`

			`with open(filepath, 'w') as f:`
			`f.write(c)`

			`print("Replaced Fp8LinearMethod with UnquantizedLinearMethod for attention")`