Apply RoPE to KV in Blackwell attention path - fix NaN output
This commit is contained in:
@@ -630,8 +630,12 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
|
||||
out.zero_()
|
||||
return
|
||||
|
||||
# Apply RoPE to KV (required for correct attention scores)
|
||||
# Q already has RoPE from fused_qnorm_rope_kv_insert_py or _apply_rope_q
|
||||
kv_rope = self._apply_rope_kv(kv, positions)
|
||||
|
||||
# Attention using PyTorch SDPA (works on Blackwell)
|
||||
o = full_sdpa_attention(q, kv, self.scale)
|
||||
o = full_sdpa_attention(q, kv_rope, self.scale)
|
||||
|
||||
# Write into the output buffer (same shape as original path)
|
||||
if self.n_local_heads < self.padded_heads:
|
||||
@@ -649,6 +653,15 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
|
||||
q[:, :, self.nope_head_dim:][:, :, 0::2] = q_rope[:, :, 0::2] * cos_q - q_rope[:, :, 1::2] * sin_q
|
||||
q[:, :, self.nope_head_dim:][:, :, 1::2] = q_rope[:, :, 0::2] * sin_q + q_rope[:, :, 1::2] * cos_q
|
||||
|
||||
def _apply_rope_kv(self, kv, positions):
    """Rotate the KV latent with GPT-J style RoPE and return the rotated tensor.

    The cos and sin factors are gathered from
    ``self.rotary_emb.cos_sin_cache`` at ``positions``: the first
    ``self.rope_head_dim // 2`` columns are used as cos, the columns from
    that offset onward as sin. Both are cast to ``kv``'s dtype before the
    rotation is applied.

    Args:
        kv: KV latent tensor; a singleton axis is inserted at dim 1 for the
            rotation helper and removed again afterwards.
        positions: index used to select rows of the cos/sin cache
            (presumably per-token position ids -- confirm against caller).

    Returns:
        The output of ``apply_gptj_rope`` with the temporary dim-1 axis
        squeezed away.
    """
    # Kept as a function-scope import, exactly as the original resolves it.
    from vllm.model_executor.layers.csa_attention import apply_gptj_rope

    split = self.rope_head_dim // 2
    cos_part = self.rotary_emb.cos_sin_cache[positions, :split].to(kv.dtype)
    sin_part = self.rotary_emb.cos_sin_cache[positions, split:].to(kv.dtype)

    # The helper is called on kv with an extra axis at dim 1 (presumably a
    # head axis -- TODO confirm); self.nope_head_dim is forwarded unchanged.
    expanded = kv.unsqueeze(1)
    rotated = apply_gptj_rope(expanded, cos_part, sin_part, self.nope_head_dim)
    return rotated.squeeze(1)
|
||||
|
||||
def _fused_qnorm_rope_kv_insert(
|
||||
self,
|
||||
q: torch.Tensor,
|
||||
|
||||
Reference in New Issue
Block a user