diff --git a/vllm/patches/layers/deepseek_compressor.py b/vllm/patches/layers/deepseek_compressor.py index 181ac58d..f7d2a391 100644 --- a/vllm/patches/layers/deepseek_compressor.py +++ b/vllm/patches/layers/deepseek_compressor.py @@ -337,6 +337,17 @@ class DeepseekCompressor(nn.Module): # second half sin (per-pair, length rope_head_dim // 2 each) # - applied to LAST rope_head_dim elements of head_dim # - position used: (positions // compress_ratio) * compress_ratio + + # On Blackwell (SM100+), skip the fused kernel because: + # 1. The fused kernel does attention using FlashMLA which doesn't work on SM100 + # 2. Our Blackwell attention path handles everything separately + # Instead, we just save the state (done above) and let the attention + # path handle compression + RoPE + cache write + attention. + cap = current_platform.get_device_capability() + if cap is not None and cap.major >= 10: + # Blackwell: state is already saved, skip fused kernel + return + cos_sin_cache = rotary_emb.cos_sin_cache k_cache_metadata = cast(Any, attn_metadata[self.k_cache_prefix]) kv_cache = self._static_forward_context[self.k_cache_prefix].kv_cache