fix: shared_experts missing ffn. prefix in checkpoint→model rename

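Checkpoint keys look like model.layers.N.shared_experts.gate_proj.weight, but the model parameters are named layers.N.ffn.shared_experts.gate_up_proj.weight. The rename tables mapped .shared_experts.gate_proj. and .shared_experts.up_proj. to .shared_experts.w1. and .shared_experts.w3. without the .ffn. segment, so the stacked gate_up_proj name never matched an entry in params_dict. Both rename tables now map to .ffn.shared_experts.w1. and .ffn.shared_experts.w3. instead.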
2026-05-15 00:17:59 +00:00
parent 483046b9d6
commit 21018fca8a
1 changed files with 6 additions and 4 deletions
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -1277,8 +1277,9 @@ class DeepseekV4Model(nn.Module):
            ".compressor.kv_proj.": ".compressor.wkv.",
            ".compressor.gate_proj.": ".compressor.gate.",
            # Shared expert projections (stacking into gate_up_proj)
-            ".shared_experts.gate_proj.": ".shared_experts.w1.",
-            ".shared_experts.up_proj.": ".shared_experts.w3.",
+            # Checkpoint has .shared_experts. but model has .ffn.shared_experts.
+            ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
+            ".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
            # modelopt uses mlp, vllm uses ffn internally
            ".mlp.": ".ffn.",
        }
@@ -2096,8 +2097,9 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
            ".compressor.kv_proj.": ".compressor.wkv.",
            ".compressor.gate_proj.": ".compressor.wgate.",
            # Shared expert projections (stacking into gate_up_proj)
-            ".shared_experts.gate_proj.": ".shared_experts.w1.",
-            ".shared_experts.up_proj.": ".shared_experts.w3.",
+            # Checkpoint has .shared_experts. but model has .ffn.shared_experts.
+            ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
+            ".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
            # modelopt uses mlp, vllm uses ffn internally
            ".mlp.": ".ffn.",
        },