CRITICAL FIX: Skip compressor fused attention kernel on Blackwell — it bypasses our attention path

2026-05-19 16:35:07 +00:00
parent b8e2cf61ad
commit 30890b621d
1 changed files with 11 additions and 0 deletions
--- a/vllm/patches/layers/deepseek_compressor.py
+++ b/vllm/patches/layers/deepseek_compressor.py
@@ -337,6 +337,17 @@ class DeepseekCompressor(nn.Module):
        #   second half sin (per-pair, length rope_head_dim // 2 each)
        # - applied to LAST rope_head_dim elements of head_dim
        # - position used: (positions // compress_ratio) * compress_ratio
+        
+        # On Blackwell (SM100+), skip the fused kernel because:
+        # 1. The fused kernel runs attention via FlashMLA, which doesn't work on SM100.
+        # 2. Our Blackwell attention path handles every step separately.
+        # Instead, we only save the state (done above) and let the attention
+        # path handle compression, RoPE, the cache write, and attention itself.
+        cap = current_platform.get_device_capability()
+        if cap is not None and cap.major >= 10:
+            # Blackwell: state is already saved, skip fused kernel
+            return
+
        cos_sin_cache = rotary_emb.cos_sin_cache
        k_cache_metadata = cast(Any, attn_metadata[self.k_cache_prefix])
        kv_cache = self._static_forward_context[self.k_cache_prefix].kv_cache