diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py index e90eace6..09364b87 100644 --- a/vllm/patches/deepseek_v4_attention.py +++ b/vllm/patches/deepseek_v4_attention.py @@ -659,6 +659,16 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer): self._swa_inv_scale_cache = torch.zeros( max_slots, 1, dtype=torch.bfloat16, device=kv.device, ) + # Debug: log cache shape info + import sys + print(f"[BLACKWELL] swa_kv_cache shape: {swa_kv_cache.shape}, " + f"block_size: {swa_metadata.block_size}, " + f"num_decode_tokens: {num_decode_tokens}, " + f"num_prefills: {num_prefills}, " + f"compress_ratio: {self.compress_ratio}, " + f"slot_mapping shape: {swa_metadata.slot_mapping.shape}, " + f"positions shape: {positions.shape}, " + f"kv shape: {kv.shape}", file=sys.stderr, flush=True) blackwell_attention_kv_write( kv, positions, swa_kv_cache, self._swa_inv_scale_cache, swa_metadata.slot_mapping, swa_metadata.block_size, @@ -687,6 +697,8 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer): # ── Decode attention ────────────────────────────────────── if num_decode_tokens > 0: + import sys + print(f"[BLACKWELL] DECODE: {num_decode_tokens} tokens, swa_only={swa_only}", file=sys.stderr, flush=True) if swa_only: # SWA-only layers: full decode attention with KV cache q_decode = q[:num_decode_tokens] @@ -722,6 +734,8 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer): # ── Prefill attention ───────────────────────────────────── if num_prefills > 0: + import sys + print(f"[BLACKWELL] PREFILL: {num_prefills} tokens, swa_only={swa_only}", file=sys.stderr, flush=True) q_prefill = q[num_decode_tokens:] kv_rope_prefill = self._apply_rope_kv( kv[num_decode_tokens:], positions[num_decode_tokens:],