Server running on B200 port 8000 with full NVFP4→vLLM bridge. All critical bugs fixed: DeepGEMM scale format, compressor shapes, block scale values.
47 lines
2.1 KiB
Python
47 lines
2.1 KiB
Python
#!/usr/bin/env python3
"""Fix the FP8 conversion to use a simple no-op quant method for attention layers.

Rewrites the patched ``deepseek_v4.py`` in place: every snippet that installs
``Fp8LinearMethod(Fp8MMQuantMethod())`` as ``mod.quant_method`` is replaced by
one that installs ``UnquantizedLinearMethod()`` instead.

Exit status is 0 when at least one site was patched or the file already
contains the replacement, and 1 when the expected snippet is not found.
"""

import re
import sys
from typing import Tuple

filepath = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# The snippet to remove, one entry per source line, without leading
# whitespace. The same text is used for both call sites this script targets
# (_convert_nvfp4_module_to_fp8 and the bf16->uint8 / o_a_proj handler), so a
# single pattern covers them.
OLD_SNIPPET_LINES = (
    "# Switch quant method to FP8 linear",
    "from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod",
    "from vllm.model_executor.layers.quantization.utils.quant_utils import (",
    "Fp8MMQuantMethod,",
    ")",
    "mod.quant_method = Fp8LinearMethod(Fp8MMQuantMethod())",
)

# The snippet to insert, one entry per source line, without leading
# whitespace. Every line is emitted at the indentation of the matched
# snippet's first line.
NEW_SNIPPET_LINES = (
    "# Switch quant method to a no-op. The attention forward uses",
    "# deepseek_v4_fp8_einsum directly (not the quant method), so the",
    "# quant method is irrelevant. We just need process_weights_after_loading",
    "# to not crash. Using UnquantizedLinearMethod as a safe no-op.",
    "from vllm.model_executor.layers.linear import UnquantizedLinearMethod",
    "mod.quant_method = UnquantizedLinearMethod()",
)

# Present in the target only after this script has been applied; used to tell
# "already patched" apart from "snippet not found".
ALREADY_PATCHED_MARKER = "mod.quant_method = UnquantizedLinearMethod()"

# Matching is indentation-agnostic so the script does not depend on the exact
# nesting depth of the snippet in the target file: each line may carry any
# amount of leading spaces/tabs, and the first line's indent is captured so
# the replacement can be emitted at the same depth.
_OLD_SNIPPET_RE = re.compile(
    r"^(?P<indent>[ \t]*)"
    + r"\n[ \t]*".join(re.escape(line) for line in OLD_SNIPPET_LINES),
    re.MULTILINE,
)


def swap_fp8_quant_method(source: str) -> Tuple[str, int]:
    """Replace every FP8 quant-method snippet in *source* with the no-op one.

    Args:
        source: Full text of the module to patch.

    Returns:
        A ``(patched_text, count)`` pair, where ``count`` is the number of
        snippets that were replaced. When ``count`` is 0 the text is returned
        unchanged.
    """

    def _reindented(match):
        indent = match.group("indent")
        return "\n".join(indent + line for line in NEW_SNIPPET_LINES)

    return _OLD_SNIPPET_RE.subn(_reindented, source)


def main() -> int:
    """Patch the file at ``filepath`` in place and return a process exit code."""
    # Python source is UTF-8 by default; be explicit rather than relying on
    # the locale's preferred encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        original = f.read()

    patched, count = swap_fp8_quant_method(original)

    if count == 0:
        # str.replace-style patching fails silently; surface that instead of
        # rewriting the file and reporting success.
        if ALREADY_PATCHED_MARKER in original:
            print(f"Nothing to do: {filepath} is already patched")
            return 0
        print(
            f"error: Fp8LinearMethod snippet not found in {filepath}; "
            "file left unchanged",
            file=sys.stderr,
        )
        return 1

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(patched)

    print("Replaced Fp8LinearMethod with UnquantizedLinearMethod for attention")
    print(f"Patched {count} site(s) in {filepath}")
    return 0


if __name__ == "__main__":
    sys.exit(main())