fix: add missing `.ffn.` prefix to shared_experts in checkpoint→model rename
Checkpoint keys are `model.layers.N.shared_experts.gate_proj.weight`, but the model parameters are `layers.N.ffn.shared_experts.gate_up_proj.weight`. The `.ffn.` segment was missing from the rename target, so the stacked `gate_up_proj` weights never matched an entry in `params_dict`.
This commit is contained in:
@@ -1277,8 +1277,9 @@ class DeepseekV4Model(nn.Module):
|
||||
".compressor.kv_proj.": ".compressor.wkv.",
|
||||
".compressor.gate_proj.": ".compressor.gate.",
|
||||
# Shared expert projections (stacking into gate_up_proj)
|
||||
".shared_experts.gate_proj.": ".shared_experts.w1.",
|
||||
".shared_experts.up_proj.": ".shared_experts.w3.",
|
||||
# Checkpoint has .shared_experts. but model has .ffn.shared_experts.
|
||||
".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
|
||||
".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
|
||||
# modelopt uses mlp, vllm uses ffn internally
|
||||
".mlp.": ".ffn.",
|
||||
}
|
||||
@@ -2096,8 +2097,9 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
|
||||
".compressor.kv_proj.": ".compressor.wkv.",
|
||||
".compressor.gate_proj.": ".compressor.wgate.",
|
||||
# Shared expert projections (stacking into gate_up_proj)
|
||||
".shared_experts.gate_proj.": ".shared_experts.w1.",
|
||||
".shared_experts.up_proj.": ".shared_experts.w3.",
|
||||
# Checkpoint has .shared_experts. but model has .ffn.shared_experts.
|
||||
".shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
|
||||
".shared_experts.up_proj.": ".ffn.shared_experts.w3.",
|
||||
# modelopt uses mlp, vllm uses ffn internally
|
||||
".mlp.": ".ffn.",
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user