fix: hc params dot→underscore + compressor position_bias→ape combined rule

Two fixes:
1. attn_hc.base → hc_attn_base (underscore, not dot, before base/fn/scale). The same applies to fn and scale, and to the ffn_hc variants.
2. The compressor.position_bias → compressor.ape rule was never firing because the more general .self_attn.compressor. rule matched first and the loop breaks after the first match. Added a combined rule, .self_attn.compressor.position_bias → .attn.mla_attn.compressor.ape, placed ahead of the general rule.
2026-05-15 01:29:00 +00:00
parent 44d4b6c225
commit 086f3fa5c5
1 changed files with 10 additions and 3 deletions
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -1283,6 +1283,7 @@ class DeepseekV4Model(nn.Module):
            ".self_attn.compressor.kv_proj.": ".attn.mla_attn.compressor.wkv.",
            ".self_attn.compressor.gate_proj.": ".attn.mla_attn.compressor.gate.",
            ".self_attn.compressor.kv_norm.": ".attn.kv_norm.",
+            ".self_attn.compressor.position_bias": ".attn.mla_attn.compressor.ape",
            ".self_attn.compressor.": ".attn.mla_attn.compressor.",
            # Shared expert projections (stacking into gate_up_proj)
            # Must include .mlp. prefix since break prevents .mlp.→.ffn. from
@@ -1290,9 +1291,15 @@ class DeepseekV4Model(nn.Module):
            ".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.",
            ".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.",
            ".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.",
-            # Hadamard coding params
-            ".attn_hc.": ".hc_attn.",
-            ".ffn_hc.": ".hc_ffn.",
+            # Hadamard coding params: checkpoint has .attn_hc.base/fn/scale
+            # and .ffn_hc.base/fn/scale; model has hc_attn_base/fn/scale
+            # and hc_ffn_base/fn/scale (underscore not dot before base/fn/scale)
+            ".attn_hc.base": "hc_attn_base",
+            ".attn_hc.fn": "hc_attn_fn",
+            ".attn_hc.scale": "hc_attn_scale",
+            ".ffn_hc.base": "hc_ffn_base",
+            ".ffn_hc.fn": "hc_ffn_fn",
+            ".ffn_hc.scale": "hc_ffn_scale",
            "hc_head.hc_base": "hc_head_base",
            "hc_head.hc_fn": "hc_head_fn",
            "hc_head.hc_scale": "hc_head_scale",