diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py index 1854f922..8e2f187b 100644 --- a/vllm/patches/deepseek_v4.py +++ b/vllm/patches/deepseek_v4.py @@ -1283,6 +1283,7 @@ class DeepseekV4Model(nn.Module): ".self_attn.compressor.kv_proj.": ".attn.mla_attn.compressor.wkv.", ".self_attn.compressor.gate_proj.": ".attn.mla_attn.compressor.gate.", ".self_attn.compressor.kv_norm.": ".attn.kv_norm.", + ".self_attn.compressor.position_bias": ".attn.mla_attn.compressor.ape", ".self_attn.compressor.": ".attn.mla_attn.compressor.", # Shared expert projections (stacking into gate_up_proj) # Must include .mlp. prefix since break prevents .mlp.→.ffn. from @@ -1290,9 +1291,15 @@ class DeepseekV4Model(nn.Module): ".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.", ".mlp.shared_experts.up_proj.": ".ffn.shared_experts.w3.", ".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.", - # Hadamard coding params - ".attn_hc.": ".hc_attn.", - ".ffn_hc.": ".hc_ffn.", + # Hadamard coding params: checkpoint has .attn_hc.base/fn/scale + # and .ffn_hc.base/fn/scale; model has .hc_attn_base/fn/scale + # and .hc_ffn_base/fn/scale (underscore, not dot, before base/fn/scale; leading dot kept) + ".attn_hc.base": ".hc_attn_base", + ".attn_hc.fn": ".hc_attn_fn", + ".attn_hc.scale": ".hc_attn_scale", + ".ffn_hc.base": ".hc_ffn_base", + ".ffn_hc.fn": ".hc_ffn_fn", + ".ffn_hc.scale": ".hc_ffn_scale", "hc_head.hc_base": "hc_head_base", "hc_head.hc_fn": "hc_head_fn", "hc_head.hc_scale": "hc_head_scale",