From 21018fca8aa97b3f5a0d668685fcb3d4b7df43df Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Fri, 15 May 2026 00:17:59 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20shared=5Fexperts=20missing=20ffn.=20pref?=
 =?UTF-8?q?ix=20in=20checkpoint=E2=86=92model=20rename?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Checkpoint keys are model.layers.N.shared_experts.gate_proj.weight
but model params are layers.N.ffn.shared_experts.gate_up_proj.weight.
The .ffn. was missing from the rename, so stacked gate_up_proj never
matched params_dict.
---
 vllm/patches/deepseek_v4.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index 3f519bc4..a6d7aa16 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -1277,8 +1277,9 @@ class DeepseekV4Model(nn.Module):
  ".compressor.kv_proj.": ".compressor.wkv.",
  ".compressor.gate_proj.": ".compressor.gate.",
  # Shared expert projections (stacking into gate_up_proj)
- ".shared_experts.gate_proj.": ".shared_experts.w1.",
- ".shared_experts.up_proj.": ".shared_experts.w3.",
+ # Checkpoint has .shared_experts. but model has .ffn.shared_experts.
+ ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
+ ".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
  # modelopt uses mlp, vllm uses ffn internally
  ".mlp.": ".ffn.",
  }
@@ -2096,8 +2097,9 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
  ".compressor.kv_proj.": ".compressor.wkv.",
  ".compressor.gate_proj.": ".compressor.wgate.",
  # Shared expert projections (stacking into gate_up_proj)
- ".shared_experts.gate_proj.": ".shared_experts.w1.",
- ".shared_experts.up_proj.": ".shared_experts.w3.",
+ # Checkpoint has .shared_experts. but model has .ffn.shared_experts.
+ ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
+ ".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
  # modelopt uses mlp, vllm uses ffn internally
  ".mlp.": ".ffn.",
  },