From e6ed9facf3fa4c27578085eaf296a4158b3afb35 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 15 May 2026 00:39:37 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20indexer=20+=20shared=5Fexperts=20+=20com?= =?UTF-8?q?pressor=20checkpoint=E2=86=92model=20key=20renames?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three categories of missed renames in CKPT_KEY_SUBST: 1. Shared experts: .shared_experts.gate_proj.→.ffn.shared_experts.w1. fired but break prevented .mlp.→.ffn. from also applying, producing mlp.ffn.shared_experts.w1. (double prefix). Fixed by including .mlp. in the pattern. Added missing .shared_experts.down_proj. rule. 2. Indexer (layers 2+): .self_attn.compressor.indexer.* was caught by the generic .self_attn.compressor.→.attn.mla_attn.compressor. rule, producing wrong path attn.mla_attn.compressor.indexer.* instead of attn.indexer.*. Added indexer-specific patterns (q_b_proj→wq_b, kv_norm→k_norm, position_bias→compressor.ape, gate_proj→compressor.wgate, kv_proj→compressor.wkv) before the generic compressor rule. 3. Compressor kv_proj/gate_proj: old .compressor.kv_proj.→.compressor.wkv. pattern could never fire because .self_attn.compressor. matched first (break). Merged into combined patterns that handle both the self_attn.compressor→attn.mla_attn.compressor path AND the projection rename in one step. --- vllm/patches/deepseek_v4.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py index a6d7aa16..94c183db 100644 --- a/vllm/patches/deepseek_v4.py +++ b/vllm/patches/deepseek_v4.py @@ -1270,16 +1270,26 @@ class DeepseekV4Model(nn.Module): ".self_attn.sinks": ".attn.attn_sink", ".self_attn.kv_proj.": ".attn.wkv.", ".self_attn.kv_norm.": ".attn.kv_norm.", + # Indexer: self_attn.compressor.indexer → attn.indexer + # MUST come before the generic .self_attn.compressor. rule + ".self_attn.compressor.indexer.q_b_proj.": ".attn.indexer.wq_b.", + ".self_attn.compressor.indexer.kv_norm.": ".attn.indexer.k_norm.", + ".self_attn.compressor.indexer.position_bias": ".attn.indexer.compressor.ape", + ".self_attn.compressor.indexer.gate_proj.": ".attn.indexer.compressor.wgate.", + ".self_attn.compressor.indexer.kv_proj.": ".attn.indexer.compressor.wkv.", + ".self_attn.compressor.indexer.": ".attn.indexer.", # Compressor: self_attn.compressor → attn.mla_attn.compressor + # Compressor projections for stacking (fused_wkv_wgate) + ".self_attn.compressor.kv_proj.": ".attn.mla_attn.compressor.wkv.", + ".self_attn.compressor.gate_proj.": ".attn.mla_attn.compressor.gate.", ".self_attn.compressor.kv_norm.": ".attn.kv_norm.", ".self_attn.compressor.": ".attn.mla_attn.compressor.", - # Compressor projections for stacking (fused_wkv_wgate) - ".compressor.kv_proj.": ".compressor.wkv.", - ".compressor.gate_proj.": ".compressor.gate.", # Shared expert projections (stacking into gate_up_proj) - # Checkpoint has .shared_experts. but model has .ffn.shared_experts. - ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.", - ".shared_experts.up_proj.": ".ffn.shared_experts.w3.", + # Must include .mlp. prefix since break prevents .mlp.→.ffn. from + # firing on the same key after these patterns match. + ".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.", + ".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.", + ".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.", # modelopt uses mlp, vllm uses ffn internally ".mlp.": ".ffn.", }