Fix NVFP4 mapper: add attention projection renames, remove norm_gate renames

- Add specific .self_attn.{q_a,kv,q_b,o_a,o_b}_proj → .attn.{wq_a,wkv,wq_b,wo_a,wo_b}
- Remove norm_gate suffix renames (nightly uses 'gate', not 'norm_gate')
- Order substr renames so specific patterns come before general ones (renames are applied in order)
This commit is contained in:
2026-05-18 22:53:09 +00:00
parent ea648a9bc2
commit b039123207

View File

@@ -1650,17 +1650,25 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
suffix_renames = {
"head.weight": "lm_head.weight",
"embed.weight": "embed_tokens.weight",
".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
}
# NOTE: specific renames MUST come before general ones (applied in order)
substr_renames = {
# Attention projections (specific before .self_attn. → .attn.)
".self_attn.q_a_proj.": ".attn.wq_a.",
".self_attn.kv_proj.": ".attn.wkv.",
".self_attn.q_b_proj.": ".attn.wq_b.",
".self_attn.o_a_proj.": ".attn.wo_a.",
".self_attn.o_b_proj.": ".attn.wo_b.",
".self_attn.q_a_norm.": ".attn.q_a_norm.",
".self_attn.kv_norm.": ".attn.kv_norm.",
".self_attn.sinks": ".attn.sinks",
".attn.compressor.": ".attn.mla_attn.compressor.",
# Shared expert projections (specific before .mlp. → .ffn.)
".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.",
".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.",
# General renames
".mlp.": ".ffn.",
".self_attn.": ".attn.",
}