Fix _apply_rope_kv: compute RoPE inline on the 2D KV tensor instead of calling the 3D apply_gptj_rope helper

2026-05-19 10:36:21 +00:00
parent 8e6721917e
commit dca8bfc3a8
1 changed file with 9 additions and 4 deletions
--- a/vllm/patches/deepseek_v4_attention.py
+++ b/vllm/patches/deepseek_v4_attention.py
@@ -655,12 +655,17 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):

    def _apply_rope_kv(self, kv, positions):
        """Apply GPT-J RoPE to KV latent and return the result."""
-        from vllm.model_executor.layers.csa_attention import apply_gptj_rope
        half = self.rope_head_dim // 2
        cos = self.rotary_emb.cos_sin_cache[positions, :half].to(kv.dtype)
-        sin = self.rotary_emb.cos_sin_cache[positions, half:].to(kv.dtype)
-        kv_rope = apply_gptj_rope(kv.unsqueeze(1), cos, sin, self.nope_head_dim).squeeze(1)
-        return kv_rope
+        sin = self.rotary_emb.cos_sin_cache[positions, half:2*half].to(kv.dtype)
+        # kv: (T, HD) — rotate adjacent (even, odd) column pairs of the rope slice kv[:, nope_head_dim:]; the nope columns are left unchanged
+        kv_rope = kv[:, self.nope_head_dim:].clone()
+        even = kv_rope[:, 0::2]
+        odd = kv_rope[:, 1::2]
+        out = kv.clone()
+        out[:, self.nope_head_dim:][:, 0::2] = even * cos - odd * sin
+        out[:, self.nope_head_dim:][:, 1::2] = even * sin + odd * cos
+        return out

    def _fused_qnorm_rope_kv_insert(
        self,