Files
deepseek-v4-quant/patches/patch_vllm_weights.py
biondizzle 6fd03a0aa0 vLLM serving: patched deepseek_v4.py, disabled mega_moe, updated docs
- Add patches/deepseek_v4.py: patched vllm source file with modelopt NVFP4
  weight name mappings (expert gate_proj→w1, mlp→ffn, self_attn→attn.mla_attn,
  compressor.kv_proj→wkv, etc.), E2M1 FP4→BF16 unpacking for stacked params,
  skip patterns for NVFP4 scale tensors on MergedColumnParallelLinear, and
  resilient loading for unknown params.

- Update docker-compose.yml: copy patched deepseek_v4.py over original at
  container startup, remove --moe-backend=deep_gemm_mega_moe (no NVFP4 kernel).

- Update patches/patch_vllm_weights.py: legacy runtime monkey-patch approach
  (doesn't work with worker processes), kept for reference.

- Update README.md: added vLLM serving run history table (S1-S10), documented
  all open issues (MergedColumnParallelLinear+NVFP4, no mega_moe kernel,
  resilient loading), added vLLM-specific bug list and key notes.

- Update scripts/serve_vllm.py: add WARN comment on mega_moe flag.
2026-05-10 16:14:17 +00:00

136 lines
7.0 KiB
Python

#!/usr/bin/env python3
"""
Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.
modelopt exports weights with naming differences from what vllm's
_make_deepseek_v4_weights_mapper + load_weights code expects:
1. Expert projections: modelopt uses gate_proj/up_proj/down_proj, vllm expects w1/w3/w2
2. Shared expert projections: same gate_proj/up_proj naming, needs w1/w3 for stacking
3. Compressor projections: kv_proj→wkv, gate_proj→wgate for fused stacking
4. Attention projections: self_attn prefix, kv_proj→wkv for fused stacking, etc.
5. Expert NVFP4 scales: weight_scale_2 and input_scale have no matching mega_moe params
CRITICAL: DeepseekV4ForCausalLM.hf_to_vllm_mapper is a CLASS attribute set at
module import time. Simply patching _make_deepseek_v4_weights_mapper doesn't help
because the class already cached the old mapper. We must also update the class
attribute directly. Since expert_dtype=="fp4", __init__ doesn't recreate the mapper.
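Usage (legacy approach, kept for reference): run inside the container as
    python3 /patches/patch_vllm_weights.py
or add it to the docker-compose.yml command before vllm serve.
NOTE: this is a runtime monkey-patch, so it only affects the Python process
in which patch() runs; it does not work with vllm worker processes.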
"""
import re
import sys
# Holds vllm's original _make_deepseek_v4_weights_mapper function.
# patch() assigns it BEFORE replacing the module attribute, so that
# make_patched_mapper() can still call the unpatched implementation.
# Stays None until patch() has run.
_original_make_mapper = None
def make_patched_mapper(expert_dtype: str):
    """Create a WeightsMapper with modelopt NVFP4 naming patches applied.

    Builds the mapper with vllm's original factory (saved in
    ``_original_make_mapper`` by ``patch()``) and then layers the
    modelopt-specific regex and substring renames on top of it.

    Args:
        expert_dtype: Passed through unchanged to the original factory.

    Returns:
        The mapper object returned by the original factory, mutated in
        place: ``orig_to_new_regex`` is replaced by a dict that lists this
        patch's patterns first (followed by any pre-existing ones), and
        ``orig_to_new_substr`` gains the rename entries below.

    Raises:
        RuntimeError: If ``patch()`` has not saved the original factory yet.
    """
    # Use the saved original, not the (possibly patched) module attribute.
    # Fail loudly instead of with "'NoneType' object is not callable".
    if _original_make_mapper is None:
        raise RuntimeError(
            "make_patched_mapper() called before patch(): the original "
            "_make_deepseek_v4_weights_mapper has not been saved yet"
        )
    mapper = _original_make_mapper(expert_dtype)

    # ══════════════════════════════════════════════════════════════════
    # Regex mappings (applied FIRST by WeightsMapper, before substr)
    # Order matters: skip patterns must come before rename patterns.
    # ══════════════════════════════════════════════════════════════════
    regex_rules = (
        # Skip expert NVFP4 scales that have no mega_moe params.
        # MUST come before gate_proj→w1 etc. because after renaming, the key
        # has "w1." not "gate_proj." and these patterns wouldn't match.
        #
        # modelopt's NVFP4 export includes weight_scale_2 (global scale) and
        # input_scale (activation scale) for each expert projection. But the
        # DeepseekV4MegaMoEExperts module only registers w13_weight_scale and
        # w2_weight_scale (E8M0 block scales) — no weight_scale_2 or
        # input_scale. Mapping to None tells WeightsMapper to skip these
        # weights entirely.
        (r"\.experts\.\d+\.\w+_proj\.weight_scale_2$", None),
        (r"\.experts\.\d+\.\w+_proj\.input_scale$", None),
        # Routed expert projections: gate_proj→w1, up_proj→w3, down_proj→w2.
        # Regex (not substr) so that ONLY .experts.N. matches — NOT
        # .shared_experts. A substr rule ".down_proj." → ".w2." would also
        # rewrite shared_experts.down_proj and break shared expert loading
        # (vllm model uses down_proj, not w2, for shared experts).
        (r"(\.experts\.\d+\.)gate_proj\.", r"\1w1."),
        (r"(\.experts\.\d+\.)up_proj\.", r"\1w3."),
        (r"(\.experts\.\d+\.)down_proj\.", r"\1w2."),
    )
    # Dicts keep insertion order, so the patch's rules stay ahead of the
    # original mapper's own regex rules.
    ordered_regexes = {
        re.compile(pattern): replacement for pattern, replacement in regex_rules
    }
    # Preserve any existing regex mappings from the original mapper.
    if mapper.orig_to_new_regex:
        ordered_regexes.update(mapper.orig_to_new_regex)
    mapper.orig_to_new_regex = ordered_regexes

    # ══════════════════════════════════════════════════════════════════
    # Substr mappings (applied AFTER regex by WeightsMapper)
    # ══════════════════════════════════════════════════════════════════
    mapper.orig_to_new_substr.update({
        # 1. Attention: modelopt uses "self_attn" but vllm expects "attn"
        #    (mapped to "attn.mla_attn").
        ".self_attn.q_a_proj.": ".attn.mla_attn.wq_a.",
        ".self_attn.q_b_proj.": ".attn.mla_attn.wq_b.",
        ".self_attn.q_a_norm.": ".attn.mla_attn.q_norm.",
        ".self_attn.o_a_proj.": ".attn.mla_attn.wo_a.",
        ".self_attn.o_b_proj.": ".attn.mla_attn.wo_b.",
        ".self_attn.sinks": ".attn.mla_attn.attn_sink",
        # CRITICAL: kv_proj must map to wkv (not kv_proj) because the
        # stacking code looks for "attn.wkv" to stack into fused_wqa_wkv.
        ".self_attn.kv_proj.": ".attn.mla_attn.wkv.",
        ".self_attn.kv_norm.": ".attn.mla_attn.kv_norm.",
        # Compressor: self_attn.compressor → attn.mla_attn.compressor
        ".self_attn.compressor.": ".attn.mla_attn.compressor.",
        # Compressor projection renaming for stacking: vllm stacks
        # compressor.wkv + compressor.wgate → compressor.fused_wkv_wgate,
        # while modelopt exports compressor.kv_proj / compressor.gate_proj.
        ".compressor.kv_proj.": ".compressor.wkv.",
        ".compressor.gate_proj.": ".compressor.wgate.",
        # 2. Shared expert projections: gate_proj→w1, up_proj→w3.
        #    vllm stacks shared_experts.w1 + shared_experts.w3 into
        #    shared_experts.gate_up_proj. down_proj stays as-is (vllm model
        #    uses down_proj directly).
        ".shared_experts.gate_proj.": ".shared_experts.w1.",
        ".shared_experts.up_proj.": ".shared_experts.w3.",
    })
    return mapper
def patch():
    """Install the modelopt NVFP4 weight-name patches into vllm's deepseek_v4.

    Replaces ``deepseek_v4._make_deepseek_v4_weights_mapper`` with
    ``make_patched_mapper`` and, when the class is present, overwrites
    ``DeepseekV4ForCausalLM.hf_to_vllm_mapper`` with a freshly patched
    mapper. Progress is reported with ``print``.

    Safe to call more than once: the original factory is saved only while
    the module attribute is not already ``make_patched_mapper``. Otherwise a
    second call would store the patched function as its own "original" and
    every later mapper creation would recurse without end.
    """
    global _original_make_mapper
    from vllm.model_executor.models import deepseek_v4

    # 1. Save the original function BEFORE replacing it, and
    # 2. patch the function so __init__ calls also get our mapper.
    current_factory = deepseek_v4._make_deepseek_v4_weights_mapper
    if current_factory is not make_patched_mapper:
        _original_make_mapper = current_factory
        deepseek_v4._make_deepseek_v4_weights_mapper = make_patched_mapper
    print("✓ Patched _make_deepseek_v4_weights_mapper function")

    # 3. CRITICAL: Also update the CLASS attribute directly.
    #    DeepseekV4ForCausalLM.hf_to_vllm_mapper is set at class definition
    #    time (module import). Our function patch above doesn't retroactively
    #    update it. Since expert_dtype=="fp4", __init__ won't recreate it
    #    either. We MUST update the class attribute directly.
    if hasattr(deepseek_v4, 'DeepseekV4ForCausalLM'):
        deepseek_v4.DeepseekV4ForCausalLM.hf_to_vllm_mapper = make_patched_mapper("fp4")
        print("✓ Updated DeepseekV4ForCausalLM.hf_to_vllm_mapper class attribute")
    else:
        print("⚠ DeepseekV4ForCausalLM not found (will be patched at import time)")
    print("✓ All modelopt NVFP4 weight mapping patches applied")
# Script entry point: apply the patches when this file is executed directly
# (importing the module does not call patch()).
if __name__ == "__main__":
    patch()