#!/usr/bin/env python3
"""
Patch vllm's DeepSeek V4 weight mapper to handle modelopt's NVFP4 export naming.

modelopt exports weights with naming differences from what vllm's
_make_deepseek_v4_weights_mapper + load_weights code expects:

1. Expert projections: modelopt uses gate_proj/up_proj/down_proj, vllm expects w1/w3/w2
2. Shared expert projections: same gate_proj/up_proj naming, needs w1/w3 for stacking
3. Compressor projections: kv_proj→wkv, gate_proj→wgate for fused stacking
4. Attention projections: self_attn prefix, kv_proj→wkv for fused stacking, etc.
5. Expert NVFP4 scales: weight_scale_2 and input_scale have no matching mega_moe params

CRITICAL: DeepseekV4ForCausalLM.hf_to_vllm_mapper is a CLASS attribute set at
module import time. Simply patching _make_deepseek_v4_weights_mapper doesn't help
because the class already cached the old mapper. We must also update the class
attribute directly. Since expert_dtype=="fp4", __init__ doesn't recreate the mapper.

Drop into container as:
    python3 /patches/patch_vllm_weights.py
Or add to docker-compose.yml command before vllm serve.

NOTE(review): patch() only rebinds attributes on the deepseek_v4 module that is
imported into the *current* Python process. Running this file as a separate
process presumably has no effect on a later `vllm serve` process — confirm that
patch() is actually invoked inside the serving process.
"""

import re
import sys

# Original (unpatched) mapper factory. Saved by patch() BEFORE the module
# attribute is replaced; None until patch() has run.
_original_make_mapper = None


def make_patched_mapper(expert_dtype: str):
    """Create a WeightsMapper with modelopt NVFP4 naming patches applied.

    Calls the original vllm mapper factory saved by ``patch()`` and then
    mutates the returned mapper in place: extra regex rules are placed ahead
    of any existing ones, and extra substring rules are added.

    Args:
        expert_dtype: Forwarded unchanged to the original mapper factory.

    Returns:
        The mapper object produced by the original factory, with the
        additional regex and substring mappings installed.

    Raises:
        RuntimeError: If ``patch()`` has not saved the original factory yet.
    """
    # Use the saved original, not the (possibly patched) module attribute.
    if _original_make_mapper is None:
        raise RuntimeError(
            "make_patched_mapper() called before patch(): the original "
            "_make_deepseek_v4_weights_mapper has not been saved"
        )
    mapper = _original_make_mapper(expert_dtype)

    # ══════════════════════════════════════════════════════════════════
    # Regex mappings (applied FIRST by WeightsMapper, before substr)
    # Order matters: skip patterns must come before rename patterns.
    # ══════════════════════════════════════════════════════════════════
    ordered_regexes = {}

    # Skip expert NVFP4 scales that have no mega_moe params.
    # MUST come before gate_proj→w1 etc. because after renaming,
    # the key has "w1." not "gate_proj." and these patterns wouldn't match.
    #
    # modelopt's NVFP4 export includes weight_scale_2 (global scale) and
    # input_scale (activation scale) for each expert projection. But the
    # DeepseekV4MegaMoEExperts module only registers w13_weight_scale and
    # w2_weight_scale (E8M0 block scales) — no weight_scale_2 or input_scale.
    # Mapping to None tells WeightsMapper to skip these weights entirely.
    ordered_regexes[re.compile(r"\.experts\.\d+\.\w+_proj\.weight_scale_2$")] = None
    ordered_regexes[re.compile(r"\.experts\.\d+\.\w+_proj\.input_scale$")] = None

    # Routed expert projections: gate_proj→w1, up_proj→w3, down_proj→w2
    # We use regex (not substr) to match ONLY .experts.N. — NOT .shared_experts.
    # Using substr ".down_proj." → ".w2." would also affect
    # shared_experts.down_proj, breaking shared expert loading
    # (vllm model uses down_proj, not w2, for shared experts).
    ordered_regexes[re.compile(r"(\.experts\.\d+\.)gate_proj\.")] = r"\1w1."
    ordered_regexes[re.compile(r"(\.experts\.\d+\.)up_proj\.")] = r"\1w3."
    ordered_regexes[re.compile(r"(\.experts\.\d+\.)down_proj\.")] = r"\1w2."

    # Preserve any existing regex mappings from the original mapper.
    # They are appended after ours so the skip/rename rules above run first.
    if mapper.orig_to_new_regex:
        ordered_regexes.update(mapper.orig_to_new_regex)
    mapper.orig_to_new_regex = ordered_regexes

    # ══════════════════════════════════════════════════════════════════
    # Substr mappings (applied AFTER regex by WeightsMapper)
    # ══════════════════════════════════════════════════════════════════

    # 1. Attention: self_attn → attn.mla_attn mappings
    # modelopt uses "self_attn" but vllm expects "attn" (mapped to "attn.mla_attn")
    mapper.orig_to_new_substr[".self_attn.q_a_proj."] = ".attn.mla_attn.wq_a."
    mapper.orig_to_new_substr[".self_attn.q_b_proj."] = ".attn.mla_attn.wq_b."
    mapper.orig_to_new_substr[".self_attn.q_a_norm."] = ".attn.mla_attn.q_norm."
    mapper.orig_to_new_substr[".self_attn.o_a_proj."] = ".attn.mla_attn.wo_a."
    mapper.orig_to_new_substr[".self_attn.o_b_proj."] = ".attn.mla_attn.wo_b."
    mapper.orig_to_new_substr[".self_attn.sinks"] = ".attn.mla_attn.attn_sink"

    # CRITICAL: kv_proj must map to wkv (not kv_proj) because the stacking
    # code looks for "attn.wkv" to stack into fused_wqa_wkv.
    mapper.orig_to_new_substr[".self_attn.kv_proj."] = ".attn.mla_attn.wkv."
    mapper.orig_to_new_substr[".self_attn.kv_norm."] = ".attn.mla_attn.kv_norm."

    # Compressor: self_attn.compressor → attn.mla_attn.compressor
    mapper.orig_to_new_substr[".self_attn.compressor."] = ".attn.mla_attn.compressor."

    # Compressor projection renaming for stacking:
    # vllm stacks compressor.wkv + compressor.wgate → compressor.fused_wkv_wgate
    # modelopt exports as compressor.kv_proj and compressor.gate_proj
    mapper.orig_to_new_substr[".compressor.kv_proj."] = ".compressor.wkv."
    mapper.orig_to_new_substr[".compressor.gate_proj."] = ".compressor.wgate."

    # 2. Shared expert projections: gate_proj→w1, up_proj→w3
    # vllm stacks shared_experts.w1 + shared_experts.w3 into
    # shared_experts.gate_up_proj. modelopt uses gate_proj/up_proj naming.
    # down_proj stays as-is (vllm model uses down_proj directly).
    mapper.orig_to_new_substr[".shared_experts.gate_proj."] = ".shared_experts.w1."
    mapper.orig_to_new_substr[".shared_experts.up_proj."] = ".shared_experts.w3."

    return mapper


def patch() -> None:
    """Install the patched mapper factory into vllm's deepseek_v4 module.

    Saves the original ``_make_deepseek_v4_weights_mapper``, replaces it with
    ``make_patched_mapper``, and overwrites the
    ``DeepseekV4ForCausalLM.hf_to_vllm_mapper`` class attribute when that
    class is present. Progress is reported on stdout.

    Calling ``patch()`` more than once is safe: the saved original is never
    overwritten with the already-patched function.
    """
    global _original_make_mapper

    from vllm.model_executor.models import deepseek_v4

    current_factory = deepseek_v4._make_deepseek_v4_weights_mapper
    if current_factory is not make_patched_mapper:
        # 1. Save the original function BEFORE replacing it.
        _original_make_mapper = current_factory

        # 2. Patch the function so __init__ calls also get our mapper.
        deepseek_v4._make_deepseek_v4_weights_mapper = make_patched_mapper
    # else: already patched. Saving the module attribute again would store
    # make_patched_mapper as its own "original" and make it recurse forever.
    print("✓ Patched _make_deepseek_v4_weights_mapper function")

    # 3. CRITICAL: Also update the CLASS attribute directly.
    # DeepseekV4ForCausalLM.hf_to_vllm_mapper is set at class definition
    # time (module import). Our function patch above doesn't retroactively
    # update it. Since expert_dtype=="fp4", __init__ won't recreate it either.
    # We MUST update the class attribute directly.
    if hasattr(deepseek_v4, 'DeepseekV4ForCausalLM'):
        deepseek_v4.DeepseekV4ForCausalLM.hf_to_vllm_mapper = make_patched_mapper("fp4")
        print("✓ Updated DeepseekV4ForCausalLM.hf_to_vllm_mapper class attribute")
    else:
        print("⚠ DeepseekV4ForCausalLM not found (will be patched at import time)")

    print("✓ All modelopt NVFP4 weight mapping patches applied")


if __name__ == "__main__":
    patch()