diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py index 3f519bc4..a6d7aa16 100644 --- a/vllm/patches/deepseek_v4.py +++ b/vllm/patches/deepseek_v4.py @@ -1277,8 +1277,9 @@ class DeepseekV4Model(nn.Module): ".compressor.kv_proj.": ".compressor.wkv.", ".compressor.gate_proj.": ".compressor.gate.", # Shared expert projections (stacking into gate_up_proj) - ".shared_experts.gate_proj.": ".shared_experts.w1.", - ".shared_experts.up_proj.": ".shared_experts.w3.", + # Checkpoint has .shared_experts. but model has .ffn.shared_experts. + ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.", + ".shared_experts.up_proj.": ".ffn.shared_experts.w3.", # modelopt uses mlp, vllm uses ffn internally ".mlp.": ".ffn.", } @@ -2096,8 +2097,9 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper: ".compressor.kv_proj.": ".compressor.wkv.", ".compressor.gate_proj.": ".compressor.wgate.", # Shared expert projections (stacking into gate_up_proj) - ".shared_experts.gate_proj.": ".shared_experts.w1.", - ".shared_experts.up_proj.": ".shared_experts.w3.", + # Checkpoint has .shared_experts. but model has .ffn.shared_experts. + ".shared_experts.gate_proj.": ".ffn.shared_experts.w1.", + ".shared_experts.up_proj.": ".ffn.shared_experts.w3.", # modelopt uses mlp, vllm uses ffn internally ".mlp.": ".ffn.", },