diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py index 446dc1c6..19bf6861 100644 --- a/vllm/patches/deepseek_v4_attention.py +++ b/vllm/patches/deepseek_v4_attention.py @@ -742,6 +742,12 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer): kv_rope_prefill = self._apply_rope_kv( kv[num_decode_tokens:], positions[num_decode_tokens:], ) + # Debug: check attention inputs + import sys as _sys + _q_nan = torch.isnan(q_prefill).any().item() + _kv_nan = torch.isnan(kv_rope_prefill).any().item() + if _q_nan or _kv_nan: + print(f"[BLACKWELL] PREFILL INPUTS NaN: q_nan={_q_nan} kv_nan={_kv_nan} cr={self.compress_ratio}", file=_sys.stderr, flush=True) if swa_only: o[num_decode_tokens:] = causal_prefill_attention( q_prefill, kv_rope_prefill, self.scale,