CRITICAL FIX: Skip compressor fused attention kernel on Blackwell — it bypasses our attention path

This commit is contained in:
2026-05-19 16:35:07 +00:00
parent b8e2cf61ad
commit 30890b621d

View File

@@ -337,6 +337,17 @@ class DeepseekCompressor(nn.Module):
# second half sin (per-pair, length rope_head_dim // 2 each)
# - applied to LAST rope_head_dim elements of head_dim
# - position used: (positions // compress_ratio) * compress_ratio
# On Blackwell (SM100+, i.e. device capability major >= 10), skip the fused kernel because:
# 1. The fused kernel performs attention using FlashMLA, which does not work on SM100.
# 2. Our Blackwell attention path handles everything separately.
# Instead, we only save the state (done above) and let the attention
# path handle compression + RoPE + cache write + attention.
cap = current_platform.get_device_capability()
if cap is not None and cap.major >= 10:
# Blackwell: the state is already saved, so skip the fused kernel.
return
cos_sin_cache = rotary_emb.cos_sin_cache
k_cache_metadata = cast(Any, attn_metadata[self.k_cache_prefix])
kv_cache = self._static_forward_context[self.k_cache_prefix].kv_cache