Patch for vllm/patches/deepseek_v4_attention.py.
Rotates the KV latent with RoPE before the PyTorch SDPA attention call in
DeepseekV4MultiHeadLatentAttentionWrapper, via a new _apply_rope_kv helper;
full_sdpa_attention now receives the rotated kv instead of the raw kv.
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. Line counts match the hunk headers,
but confirm the indentation against the target file before applying.

diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py
index 35446abd..22317957 100644
--- a/vllm/patches/deepseek_v4_attention.py
+++ b/vllm/patches/deepseek_v4_attention.py
@@ -630,8 +630,12 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
             out.zero_()
             return
 
+        # Apply RoPE to KV (required for correct attention scores)
+        # Q already has RoPE from fused_qnorm_rope_kv_insert_py or _apply_rope_q
+        kv_rope = self._apply_rope_kv(kv, positions)
+
         # Attention using PyTorch SDPA (works on Blackwell)
-        o = full_sdpa_attention(q, kv, self.scale)
+        o = full_sdpa_attention(q, kv_rope, self.scale)
 
         # Write into the output buffer (same shape as original path)
         if self.n_local_heads < self.padded_heads:
@@ -649,6 +653,15 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
         q[:, :, self.nope_head_dim:][:, :, 0::2] = q_rope[:, :, 0::2] * cos_q - q_rope[:, :, 1::2] * sin_q
         q[:, :, self.nope_head_dim:][:, :, 1::2] = q_rope[:, :, 0::2] * sin_q + q_rope[:, :, 1::2] * cos_q
 
+    def _apply_rope_kv(self, kv, positions):
+        """Apply GPT-J RoPE to KV latent and return the result."""
+        from vllm.model_executor.layers.csa_attention import apply_gptj_rope
+        half = self.rope_head_dim // 2
+        cos = self.rotary_emb.cos_sin_cache[positions, :half].to(kv.dtype)
+        sin = self.rotary_emb.cos_sin_cache[positions, half:].to(kv.dtype)
+        kv_rope = apply_gptj_rope(kv.unsqueeze(1), cos, sin, self.nope_head_dim).squeeze(1)
+        return kv_rope
+
     def _fused_qnorm_rope_kv_insert(
         self,
         q: torch.Tensor,