CRITICAL FIX: Skip compressor fused attention kernel on Blackwell — it bypasses our attention path
This commit is contained in:
@@ -337,6 +337,17 @@ class DeepseekCompressor(nn.Module):
|
||||
# second half sin (per-pair, length rope_head_dim // 2 each)
|
||||
# - applied to LAST rope_head_dim elements of head_dim
|
||||
# - position used: (positions // compress_ratio) * compress_ratio
|
||||
|
||||
# On Blackwell (SM100+), skip the fused kernel because:
|
||||
# 1. The fused kernel runs attention through FlashMLA, which doesn't work on SM100
|
||||
# 2. Our Blackwell attention path handles everything separately
|
||||
# Instead, we just save the state (done above) and let the attention
|
||||
# path handle compression + RoPE + cache write + attention.
|
||||
cap = current_platform.get_device_capability()
|
||||
if cap is not None and cap.major >= 10:
|
||||
# Blackwell: state is already saved above, so skip the fused kernel
|
||||
return
|
||||
|
||||
cos_sin_cache = rotary_emb.cos_sin_cache
|
||||
k_cache_metadata = cast(Any, attn_metadata[self.k_cache_prefix])
|
||||
kv_cache = self._static_forward_context[self.k_cache_prefix].kv_cache
|
||||
|
||||
Reference in New Issue
Block a user