fix: hc params dot→underscore + compressor position_bias→ape combined rule
Two fixes:
1. attn_hc.base → hc_attn_base (underscore, not dot, before base/fn/scale). The same applies to fn and scale, and to the ffn_hc variants.
2. The compressor.position_bias → compressor.ape rule was never firing because the more general .self_attn.compressor. rule matched first (the loop breaks on the first match). Added a combined rule, .self_attn.compressor.position_bias → .attn.mla_attn.compressor.ape, listed ahead of the general rule.
This commit is contained in:
@@ -1283,6 +1283,7 @@ class DeepseekV4Model(nn.Module):
|
||||
".self_attn.compressor.kv_proj.": ".attn.mla_attn.compressor.wkv.",
|
||||
".self_attn.compressor.gate_proj.": ".attn.mla_attn.compressor.gate.",
|
||||
".self_attn.compressor.kv_norm.": ".attn.kv_norm.",
|
||||
".self_attn.compressor.position_bias": ".attn.mla_attn.compressor.ape",
|
||||
".self_attn.compressor.": ".attn.mla_attn.compressor.",
|
||||
# Shared expert projections (stacking into gate_up_proj)
|
||||
# Must include .mlp. prefix since break prevents .mlp.→.ffn. from
|
||||
@@ -1290,9 +1291,15 @@ class DeepseekV4Model(nn.Module):
|
||||
".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
|
||||
".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.",
|
||||
".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.",
|
||||
# Hadamard coding params
|
||||
".attn_hc.": ".hc_attn.",
|
||||
".ffn_hc.": ".hc_ffn.",
|
||||
# Hadamard coding params: checkpoint has .attn_hc.base/fn/scale
|
||||
# and .ffn_hc.base/fn/scale; model has hc_attn_base/fn/scale
|
||||
# and hc_ffn_base/fn/scale (underscore not dot before base/fn/scale)
|
||||
".attn_hc.base": "hc_attn_base",
|
||||
".attn_hc.fn": "hc_attn_fn",
|
||||
".attn_hc.scale": "hc_attn_scale",
|
||||
".ffn_hc.base": "hc_ffn_base",
|
||||
".ffn_hc.fn": "hc_ffn_fn",
|
||||
".ffn_hc.scale": "hc_ffn_scale",
|
||||
"hc_head.hc_base": "hc_head_base",
|
||||
"hc_head.hc_fn": "hc_head_fn",
|
||||
"hc_head.hc_scale": "hc_head_scale",
|
||||
|
||||
Reference in New Issue
Block a user