fix: hc params dot→underscore + compressor position_bias→ape combined rule

Two fixes:
1. attn_hc.base → hc_attn_base (underscore not dot before base/fn/scale)
   Same for fn, scale, and ffn_hc variants.
2. The compressor.position_bias → compressor.ape rule never fired because
   the more general .self_attn.compressor. rule matched first and matching
   stops at the first hit (break). Added a combined rule ahead of it:
   .self_attn.compressor.position_bias → .attn.mla_attn.compressor.ape.
This commit is contained in:
2026-05-15 01:29:00 +00:00
parent 44d4b6c225
commit 086f3fa5c5

View File

@@ -1283,6 +1283,7 @@ class DeepseekV4Model(nn.Module):
".self_attn.compressor.kv_proj.": ".attn.mla_attn.compressor.wkv.",
".self_attn.compressor.gate_proj.": ".attn.mla_attn.compressor.gate.",
".self_attn.compressor.kv_norm.": ".attn.kv_norm.",
".self_attn.compressor.position_bias": ".attn.mla_attn.compressor.ape",
".self_attn.compressor.": ".attn.mla_attn.compressor.",
# Shared expert projections (stacking into gate_up_proj)
# Must include .mlp. prefix since break prevents .mlp.→.ffn. from
@@ -1290,9 +1291,15 @@ class DeepseekV4Model(nn.Module):
".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.",
".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.",
# Hadamard coding params
".attn_hc.": ".hc_attn.",
".ffn_hc.": ".hc_ffn.",
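# Hadamard coding params: the checkpoint names them .attn_hc.base/fn/scale
# and .ffn_hc.base/fn/scale; the model names them hc_attn_base/fn/scale
# and hc_ffn_base/fn/scale (underscore, not dot, before base/fn/scale).
# NOTE(review): the keys below start with "." but the replacement values
# do not — if these rules are applied as substring replacements, the
# separator before hc_attn_*/hc_ffn_* is dropped. Confirm this is intended.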
".attn_hc.base": "hc_attn_base",
".attn_hc.fn": "hc_attn_fn",
".attn_hc.scale": "hc_attn_scale",
".ffn_hc.base": "hc_ffn_base",
".ffn_hc.fn": "hc_ffn_fn",
".ffn_hc.scale": "hc_ffn_scale",
"hc_head.hc_base": "hc_head_base",
"hc_head.hc_fn": "hc_head_fn",
"hc_head.hc_scale": "hc_head_scale",