fix: add missing .ffn. segment to shared_experts in checkpoint→model rename

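Checkpoint keys look like model.layers.N.shared_experts.gate_proj.weight,
but the model parameters are named layers.N.ffn.shared_experts.gate_up_proj.weight.
The rename targets (.shared_experts.w1. / .shared_experts.w3.) were missing
the .ffn. segment, so the stacked gate_up_proj name never matched an entry
in params_dict. Both rename tables now map to .ffn.shared_experts.w1. and
.ffn.shared_experts.w3.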
This commit is contained in:
2026-05-15 00:17:59 +00:00
parent 483046b9d6
commit 21018fca8a

View File

@@ -1277,8 +1277,9 @@ class DeepseekV4Model(nn.Module):
".compressor.kv_proj.": ".compressor.wkv.",
".compressor.gate_proj.": ".compressor.gate.",
# Shared expert projections (stacking into gate_up_proj)
".shared_experts.gate_proj.": ".shared_experts.w1.",
".shared_experts.up_proj.": ".shared_experts.w3.",
# Checkpoint has .shared_experts. but model has .ffn.shared_experts.
".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
# modelopt uses mlp, vllm uses ffn internally
".mlp.": ".ffn.",
}
@@ -2096,8 +2097,9 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
".compressor.kv_proj.": ".compressor.wkv.",
".compressor.gate_proj.": ".compressor.wgate.",
# Shared expert projections (stacking into gate_up_proj)
".shared_experts.gate_proj.": ".shared_experts.w1.",
".shared_experts.up_proj.": ".shared_experts.w3.",
# Checkpoint has .shared_experts. but model has .ffn.shared_experts.
".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
# modelopt uses mlp, vllm uses ffn internally
".mlp.": ".ffn.",
},