Fix NVFP4 mapper: add attention projection renames, remove norm_gate renames
- Add specific attention projection renames: .self_attn.{q_a,kv,q_b,o_a,o_b}_proj → .attn.{wq_a,wkv,wq_b,wo_a,wo_b}
- Remove norm_gate suffix renames (nightly uses 'gate' not 'norm_gate')
- Order substr renames so that specific patterns come before general ones (renames are applied in order)
This commit is contained in:
@@ -1650,17 +1650,25 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper:
|
||||
suffix_renames = {
|
||||
"head.weight": "lm_head.weight",
|
||||
"embed.weight": "embed_tokens.weight",
|
||||
".ffn_norm.weight": ".ffn.norm_gate.norm.weight",
|
||||
".ffn.gate.weight": ".ffn.norm_gate.gate.weight",
|
||||
".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias",
|
||||
".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid",
|
||||
}
|
||||
|
||||
# NOTE: specific renames MUST come before general ones (applied in order)
|
||||
substr_renames = {
|
||||
# Attention projections (specific before .self_attn. → .attn.)
|
||||
".self_attn.q_a_proj.": ".attn.wq_a.",
|
||||
".self_attn.kv_proj.": ".attn.wkv.",
|
||||
".self_attn.q_b_proj.": ".attn.wq_b.",
|
||||
".self_attn.o_a_proj.": ".attn.wo_a.",
|
||||
".self_attn.o_b_proj.": ".attn.wo_b.",
|
||||
".self_attn.q_a_norm.": ".attn.q_a_norm.",
|
||||
".self_attn.kv_norm.": ".attn.kv_norm.",
|
||||
".self_attn.sinks": ".attn.sinks",
|
||||
".attn.compressor.": ".attn.mla_attn.compressor.",
|
||||
# Shared expert projections (specific before .mlp. → .ffn.)
|
||||
".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
|
||||
".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.",
|
||||
".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.",
|
||||
# General renames
|
||||
".mlp.": ".ffn.",
|
||||
".self_attn.": ".attn.",
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user