diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py index a6d7aa16..94c183db 100644 --- a/vllm/patches/deepseek_v4.py +++ b/vllm/patches/deepseek_v4.py @@ -1270,16 +1270,26 @@ class DeepseekV4Model(nn.Module): ".self_attn.sinks": ".attn.attn_sink", ".self_attn.kv_proj.": ".attn.wkv.", ".self_attn.kv_norm.": ".attn.kv_norm.", + # Indexer: self_attn.compressor.indexer → attn.indexer + # MUST come before the generic .self_attn.compressor. rule + ".self_attn.compressor.indexer.q_b_proj.": ".attn.indexer.wq_b.", + ".self_attn.compressor.indexer.kv_norm.": ".attn.indexer.k_norm.", + ".self_attn.compressor.indexer.position_bias": ".attn.indexer.compressor.ape", + ".self_attn.compressor.indexer.gate_proj.": ".attn.indexer.compressor.wgate.", + ".self_attn.compressor.indexer.kv_proj.": ".attn.indexer.compressor.wkv.", + ".self_attn.compressor.indexer.": ".attn.indexer.", # Compressor: self_attn.compressor → attn.mla_attn.compressor + # Compressor projections for stacking (fused_wkv_wgate) + ".self_attn.compressor.kv_proj.": ".attn.mla_attn.compressor.wkv.", + ".self_attn.compressor.gate_proj.": ".attn.mla_attn.compressor.gate.", ".self_attn.compressor.kv_norm.": ".attn.kv_norm.", ".self_attn.compressor.": ".attn.mla_attn.compressor.", - # Compressor projections for stacking (fused_wkv_wgate) - ".compressor.kv_proj.": ".compressor.wkv.", - ".compressor.gate_proj.": ".compressor.gate.", # Shared expert projections (stacking into gate_up_proj) - # Checkpoint has .shared_experts. but model has .ffn.shared_experts. - ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.", - ".shared_experts.up_proj.": ".ffn.shared_experts.w3.", + # Must include .mlp. prefix since break prevents .mlp.→.ffn. from + # firing on the same key after these patterns match. + ".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.", + ".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.", + ".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.", # modelopt uses mlp, vllm uses ffn internally ".mlp.": ".ffn.", }