def _apply_rope_kv(self, kv, positions):
    """Apply GPT-J (interleaved-pair) RoPE to the KV latent and return the result.

    Method of ``DeepseekV4MultiHeadLatentAttentionWrapper``. Only the columns
    of ``kv`` from ``self.nope_head_dim`` onward are rotated; the leading
    "nope" columns are copied through unchanged. The input is not modified.

    Args:
        kv: 2-D tensor indexed as ``(T, HD)``. The slice
            ``kv[:, self.nope_head_dim:]`` is treated as interleaved
            (even, odd) pairs, so it must hold ``2 * (rope_head_dim // 2)``
            columns for the broadcast against cos/sin to succeed.
        positions: index tensor used to select rows of
            ``self.rotary_emb.cos_sin_cache`` (one row per row of ``kv``).

    Returns:
        A new tensor with the same shape and dtype as ``kv`` in which each
        pair ``(even, odd)`` of the rope portion has been replaced by
        ``(even * cos - odd * sin, even * sin + odd * cos)``.
    """
    half = self.rope_head_dim // 2
    cos_sin_cache = self.rotary_emb.cos_sin_cache
    # The cache row is read as [cos (half columns) | sin (half columns)].
    # The sin slice is bounded explicitly so that any extra trailing cache
    # columns are ignored instead of breaking the broadcast below.
    cos = cos_sin_cache[positions, :half].to(kv.dtype)
    sin = cos_sin_cache[positions, half:2 * half].to(kv.dtype)

    # Read the pairs straight from the input: ``kv`` is never written to, so
    # these views stay valid and no intermediate copy of the slice is needed.
    rope_in = kv[:, self.nope_head_dim:]
    even = rope_in[:, 0::2]
    odd = rope_in[:, 1::2]

    # Single copy of the input; the nope columns are carried over as-is and
    # the rope columns are overwritten through a view of the copy.
    out = kv.clone()
    rope_out = out[:, self.nope_head_dim:]
    rope_out[:, 0::2] = even * cos - odd * sin
    rope_out[:, 1::2] = even * sin + odd * cos
    return out