- Add patches/deepseek_v4.py: patched vllm source file with modelopt NVFP4 weight name mappings (expert gate_proj→w1, mlp→ffn, self_attn→attn.mla_attn, compressor.kv_proj→wkv, etc.), E2M1 FP4→BF16 unpacking for stacked params, skip patterns for NVFP4 scale tensors on MergedColumnParallelLinear, and resilient loading for unknown params. - Update docker-compose.yml: copy patched deepseek_v4.py over original at container startup, remove --moe-backend=deep_gemm_mega_moe (no NVFP4 kernel). - Update patches/patch_vllm_weights.py: legacy runtime monkey-patch approach (doesn't work with worker processes), kept for reference. - Update README.md: added vLLM serving run history table (S1-S10), documented all open issues (MergedColumnParallelLinear+NVFP4, no mega_moe kernel, resilient loading), added vLLM-specific bug list and key notes. - Update scripts/serve_vllm.py: add WARN comment on mega_moe flag.
136 lines
7.0 KiB
Python
136 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.
|
|
|
|
modelopt exports weights with naming differences from what vllm's
|
|
_make_deepseek_v4_weights_mapper + load_weights code expects:
|
|
|
|
1. Expert projections: modelopt uses gate_proj/up_proj/down_proj, vllm expects w1/w3/w2
|
|
2. Shared expert projections: same gate_proj/up_proj naming, needs w1/w3 for stacking
|
|
3. Compressor projections: kv_proj→wkv, gate_proj→wgate for fused stacking
|
|
4. Attention projections: self_attn prefix, kv_proj→wkv for fused stacking, etc.
|
|
5. Expert NVFP4 scales: weight_scale_2 and input_scale have no matching mega_moe params
|
|
|
|
CRITICAL: DeepseekV4ForCausalLM.hf_to_vllm_mapper is a CLASS attribute set at
|
|
module import time. Simply patching _make_deepseek_v4_weights_mapper doesn't help
|
|
because the class already cached the old mapper. We must also update the class
|
|
attribute directly. Since expert_dtype=="fp4", __init__ doesn't recreate the mapper.
|
|
|
|
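Usage note: patch() only modifies the Python process in which it is called.
Running this file as a standalone script inside the container:

    python3 /patches/patch_vllm_weights.py

applies the patch to a process that exits right afterwards. Adding that command
to docker-compose.yml before `vllm serve` is therefore not sufficient on its
own: the separately launched server process (and any worker processes it
starts) imports vllm afresh and does not see the patch. For the patch to take
effect, this module has to be imported and patch() called inside the serving
process itself, before the model weights are loaded.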
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
|
|
|
|
# Original vllm mapper factory, captured by patch() BEFORE the module
# attribute is replaced. Stays None until patch() has run.
_original_make_mapper = None


def make_patched_mapper(expert_dtype: str):
    """Create a WeightsMapper with modelopt NVFP4 naming patches applied.

    The mapper is produced by the original (unpatched) vllm factory saved in
    ``_original_make_mapper`` and then extended with extra regex and substring
    rules that translate modelopt's exported weight names.

    Args:
        expert_dtype: Forwarded unchanged to the original mapper factory.

    Returns:
        The mapper object returned by the original factory, with its
        ``orig_to_new_regex`` attribute replaced and its
        ``orig_to_new_substr`` mapping extended in place.

    Raises:
        RuntimeError: If patch() has not captured the original factory yet.
    """
    if _original_make_mapper is None:
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object is not callable".
        raise RuntimeError(
            "make_patched_mapper() was called before patch() captured the "
            "original _make_deepseek_v4_weights_mapper"
        )

    # Use the saved original, not the (possibly patched) module attribute.
    mapper = _original_make_mapper(expert_dtype)

    # ══════════════════════════════════════════════════════════════════
    # Regex mappings (applied FIRST by WeightsMapper, before substr)
    # Order matters: skip patterns must come before rename patterns.
    # ══════════════════════════════════════════════════════════════════
    ordered_regexes = {
        # Skip expert NVFP4 scales that have no mega_moe params.
        # MUST come before gate_proj→w1 etc. because after renaming,
        # the key has "w1." not "gate_proj." and these patterns wouldn't
        # match.
        #
        # modelopt's NVFP4 export includes weight_scale_2 (global scale) and
        # input_scale (activation scale) for each expert projection. But the
        # DeepseekV4MegaMoEExperts module only registers w13_weight_scale and
        # w2_weight_scale (E8M0 block scales) — no weight_scale_2 or
        # input_scale. Mapping to None tells WeightsMapper to skip these
        # weights entirely.
        re.compile(r"\.experts\.\d+\.\w+_proj\.weight_scale_2$"): None,
        re.compile(r"\.experts\.\d+\.\w+_proj\.input_scale$"): None,
        # Routed expert projections: gate_proj→w1, up_proj→w3, down_proj→w2.
        # Regex (not substr) is used to match ONLY .experts.N. — NOT
        # .shared_experts. A substr rule ".down_proj." → ".w2." would also
        # affect shared_experts.down_proj, breaking shared expert loading
        # (vllm model uses down_proj, not w2, for shared experts).
        re.compile(r"(\.experts\.\d+\.)gate_proj\."): r"\1w1.",
        re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.",
        re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.",
    }

    # Preserve any existing regex mappings from the original mapper; they are
    # appended after ours so the skip/rename rules above keep their position.
    if mapper.orig_to_new_regex:
        ordered_regexes.update(mapper.orig_to_new_regex)

    mapper.orig_to_new_regex = ordered_regexes

    # ══════════════════════════════════════════════════════════════════
    # Substr mappings (applied AFTER regex by WeightsMapper)
    # ══════════════════════════════════════════════════════════════════
    mapper.orig_to_new_substr.update({
        # 1. Attention: self_attn → attn.mla_attn mappings.
        # modelopt uses "self_attn" but vllm expects "attn" (mapped to
        # "attn.mla_attn").
        ".self_attn.q_a_proj.": ".attn.mla_attn.wq_a.",
        ".self_attn.q_b_proj.": ".attn.mla_attn.wq_b.",
        ".self_attn.q_a_norm.": ".attn.mla_attn.q_norm.",
        ".self_attn.o_a_proj.": ".attn.mla_attn.wo_a.",
        ".self_attn.o_b_proj.": ".attn.mla_attn.wo_b.",
        ".self_attn.sinks": ".attn.mla_attn.attn_sink",
        # CRITICAL: kv_proj must map to wkv (not kv_proj) because the
        # stacking code looks for "attn.wkv" to stack into fused_wqa_wkv.
        ".self_attn.kv_proj.": ".attn.mla_attn.wkv.",
        ".self_attn.kv_norm.": ".attn.mla_attn.kv_norm.",
        # Compressor: self_attn.compressor → attn.mla_attn.compressor
        ".self_attn.compressor.": ".attn.mla_attn.compressor.",
        # Compressor projection renaming for stacking:
        # vllm stacks compressor.wkv + compressor.wgate →
        # compressor.fused_wkv_wgate; modelopt exports them as
        # compressor.kv_proj and compressor.gate_proj.
        ".compressor.kv_proj.": ".compressor.wkv.",
        ".compressor.gate_proj.": ".compressor.wgate.",
        # 2. Shared expert projections: gate_proj→w1, up_proj→w3.
        # vllm stacks shared_experts.w1 + shared_experts.w3 into
        # shared_experts.gate_up_proj. modelopt uses gate_proj/up_proj
        # naming. down_proj stays as-is (vllm model uses down_proj directly).
        ".shared_experts.gate_proj.": ".shared_experts.w1.",
        ".shared_experts.up_proj.": ".shared_experts.w3.",
    })

    return mapper
|
|
|
|
|
|
def patch():
    """Install the modelopt NVFP4 weight-name mapper into vllm's DeepSeek V4 module.

    Replaces ``deepseek_v4._make_deepseek_v4_weights_mapper`` with
    ``make_patched_mapper`` and, when the class is present, overwrites
    ``DeepseekV4ForCausalLM.hf_to_vllm_mapper`` with a freshly built "fp4"
    mapper. Progress is reported on stdout.

    Safe to call more than once: the original factory is captured only while
    the module attribute is still the unpatched one, so a repeated call can
    never store ``make_patched_mapper`` as its own "original" (which would
    make it recurse into itself forever).
    """
    global _original_make_mapper
    from vllm.model_executor.models import deepseek_v4

    # 1. Save the original function BEFORE replacing it — but only if the
    #    attribute has not already been replaced by an earlier patch() call.
    current_factory = deepseek_v4._make_deepseek_v4_weights_mapper
    if current_factory is not make_patched_mapper:
        _original_make_mapper = current_factory

    # 2. Patch the function so __init__ calls also get our mapper
    deepseek_v4._make_deepseek_v4_weights_mapper = make_patched_mapper
    print("✓ Patched _make_deepseek_v4_weights_mapper function")

    # 3. CRITICAL: Also update the CLASS attribute directly.
    #    DeepseekV4ForCausalLM.hf_to_vllm_mapper is set at class definition
    #    time (module import). Our function patch above doesn't retroactively
    #    update it. Since expert_dtype=="fp4", __init__ won't recreate it
    #    either. We MUST update the class attribute directly.
    if hasattr(deepseek_v4, 'DeepseekV4ForCausalLM'):
        deepseek_v4.DeepseekV4ForCausalLM.hf_to_vllm_mapper = make_patched_mapper("fp4")
        print("✓ Updated DeepseekV4ForCausalLM.hf_to_vllm_mapper class attribute")
    else:
        # NOTE(review): the module is already fully imported at this point, so
        # nothing in this file will revisit the class attribute later; only
        # the function patch from step 2 is in effect in this branch.
        print("⚠ DeepseekV4ForCausalLM not found (will be patched at import time)")

    print("✓ All modelopt NVFP4 weight mapping patches applied")


if __name__ == "__main__":
    patch()
|