Files
deepseek-v4-quant/tmp/fix_indent.py
biondizzle 02b8ea536f Update MEMORY.md and memory files with vLLM NVFP4 serving progress
Server running on B200 port 8000 with full NVFP4→vLLM bridge.
All critical bugs fixed: DeepGEMM scale format, compressor shapes, block scale values.
2026-05-11 02:02:49 +00:00

130 lines
5.4 KiB
Python

#!/usr/bin/python3
"""Fix the placement of _convert_nvfp4 methods - move inside DeepseekV4Model.

The script reads the patched ``deepseek_v4.py``, removes the copy of the
``_convert_nvfp4_*`` methods that sits between the first
``def _convert_nvfp4_attention_to_fp8(self):`` line and the
``DeepseekV4ForCausalLM`` class statement, and re-inserts the methods at
4-space (class method) indentation immediately before that class statement.

The result is written back only if it parses as valid Python; otherwise the
file on disk is left untouched and the script exits with status 1.
"""
import ast
import re
import sys
from typing import Optional

FILEPATH = "/root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py"

# Start of the class that follows DeepseekV4Model.  The two leading newlines
# are part of the anchor so that insertion/removal happens before the blank
# line separating the classes.
CLASS_MARKER = "\n\nclass DeepseekV4ForCausalLM(nn.Module):"

# First line of the misplaced method block.  Anchored at the start of a line
# and tolerant of any leading indentation, so the block is found whether it
# was pasted at column 0 or already indented.
_MISPLACED_DEF = re.compile(
    r"^[ \t]*def _convert_nvfp4_attention_to_fp8\(self\):\n", re.MULTILINE
)

# Source text inserted into DeepseekV4Model.  Every ``def`` is indented by
# four spaces so the functions become methods of the class.
# NOTE(review): E2M1_LUT below has 8 entries, while the nibble indices
# (masked with 0x0F) range over 0..15 - confirm how codes 8..15 are meant
# to be handled before relying on this payload.
METHODS = '''

    def _convert_nvfp4_attention_to_fp8(self):
        E2M1_LUT = torch.tensor(
            [0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.bfloat16
        )
        FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
        attn_proj_names = {"fused_wqa_wkv", "wq_b", "wo_a", "wo_b"}
        shared_expert_names = {"gate_up_proj"}
        converted = 0
        for layer_idx, layer in enumerate(self.layers):
            attn = layer.attn
            for proj_name in attn_proj_names:
                if not hasattr(attn, proj_name):
                    continue
                mod = getattr(attn, proj_name)
                if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
                    continue
                self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
                converted += 1
            ffn = layer.ffn
            if hasattr(ffn, "shared_experts"):
                for proj_name in shared_expert_names:
                    if not hasattr(ffn.shared_experts, proj_name):
                        continue
                    mod = getattr(ffn.shared_experts, proj_name)
                    if not hasattr(mod, "weight") or mod.weight.dtype != torch.uint8:
                        continue
                    self._convert_nvfp4_module_to_fp8(mod, E2M1_LUT, FP8_MAX)
                    converted += 1
        if converted > 0:
            logger.info_once(
                "Converted %d NVFP4 attention/shared-expert layers to FP8",
                converted,
            )

    def _convert_nvfp4_module_to_fp8(self, mod, e2m1_lut, fp8_max):
        w_uint8 = mod.weight.data
        device = w_uint8.device
        even_idx = (w_uint8 & 0x0F).int()
        odd_idx = ((w_uint8 >> 4) & 0x0F).int()
        even_vals = e2m1_lut.to(device)[even_idx]
        odd_vals = e2m1_lut.to(device)[odd_idx]
        w_bf16 = torch.stack([even_vals, odd_vals], dim=-1)
        w_bf16 = w_bf16.reshape(w_uint8.shape[0], -1).to(torch.bfloat16)
        if hasattr(mod, "weight_scale") and hasattr(mod, "weight_scale_2"):
            block_scale = mod.weight_scale.data.to(torch.float32)
            if block_scale.dim() == 2 and w_bf16.dim() == 2:
                block_size = w_bf16.shape[1] // block_scale.shape[1]
                block_scale_expanded = block_scale.unsqueeze(-1).expand(
                    -1, -1, block_size
                ).reshape(w_bf16.shape)
            else:
                block_scale_expanded = block_scale
            global_scale = mod.weight_scale_2.data.max().item()
            input_scale = (
                mod.input_scale.data.max().item()
                if hasattr(mod, "input_scale")
                else 1.0
            )
            w_dequant = w_bf16.float() * block_scale_expanded * global_scale * input_scale
            w_dequant = w_dequant.to(torch.bfloat16)
        else:
            w_dequant = w_bf16
        w_amax = w_dequant.abs().amax()
        if w_amax == 0:
            w_amax = torch.tensor(1.0, device=device)
        fp8_scale = w_amax / fp8_max
        w_fp8 = (w_dequant / fp8_scale).to(torch.float8_e4m3fn)
        weight_scale_inv = fp8_scale.to(torch.float32)
        mod.weight = torch.nn.Parameter(w_fp8, requires_grad=False)
        mod.weight_scale_inv = torch.nn.Parameter(
            weight_scale_inv.reshape(1), requires_grad=False
        )
        from vllm.model_executor.layers.linear import UnquantizedLinearMethod
        mod.quant_method = UnquantizedLinearMethod()
        for attr in ("weight_scale", "weight_scale_2", "input_scale"):
            if hasattr(mod, attr):
                delattr(mod, attr)
'''


def strip_misplaced_methods(source: str) -> str:
    """Remove the existing ``_convert_nvfp4_*`` block that precedes the class.

    Everything from the first ``def _convert_nvfp4_attention_to_fp8(self):``
    line (plus the blank lines directly above it) up to ``CLASS_MARKER`` is
    dropped.  A status line is printed either way.

    Args:
        source: Full text of the file being patched.

    Returns:
        The text with the block removed, or ``source`` unchanged when the
        method line or the class marker is missing, or when the method line
        does not come before the class marker.
    """
    match = _MISPLACED_DEF.search(source)
    idx = match.start() if match else -1
    class_idx = source.find(CLASS_MARKER)

    # str.find / the -1 sentinel report "not found"; index 0 is a valid hit.
    if idx < 0 or class_idx < 0 or idx >= class_idx:
        print(
            f"Could not find wrongly placed methods: idx={idx}, class_idx={class_idx}"
        )
        return source

    # Also swallow the newlines directly above the block so no stray blank
    # lines are left behind once it is gone.
    search_start = idx
    while search_start > 0 and source[search_start - 1] == "\n":
        search_start -= 1

    print(f"Removed wrongly placed methods (chars {search_start}-{class_idx})")
    return source[:search_start] + source[class_idx:]


def insert_methods(source: str) -> Optional[str]:
    """Insert ``METHODS`` immediately before ``CLASS_MARKER``.

    Args:
        source: Full text of the file being patched.

    Returns:
        The text with the methods inserted, or ``None`` when the class
        marker is not present in ``source``.
    """
    insert_point = source.find(CLASS_MARKER)
    if insert_point < 0:
        return None
    return source[:insert_point] + METHODS + source[insert_point:]


def main() -> int:
    """Patch ``FILEPATH`` in place.

    Returns:
        0 when the patched file was written, 1 when the class marker was not
        found or the patched text failed the syntax check (in both cases the
        file is not rewritten).
    """
    with open(FILEPATH, "r", encoding="utf-8") as f:
        source = f.read()

    source = strip_misplaced_methods(source)

    patched = insert_methods(source)
    if patched is None:
        print("ERROR: Could not find class marker")
        return 1
    print("Inserted methods at correct indentation level")

    # Validate before touching the file: a patch that does not parse must
    # never replace the version on disk.
    try:
        ast.parse(patched)
    except SyntaxError as e:
        print(f"Syntax error: {e}")
        return 1
    print("Syntax OK")

    with open(FILEPATH, "w", encoding="utf-8") as f:
        f.write(patched)
    return 0


if __name__ == "__main__":
    sys.exit(main())