diff --git a/vllm/patches/layers/deepseek_compressor.py b/vllm/patches/layers/deepseek_compressor.py
index f7d2a391..4bf89163 100644
--- a/vllm/patches/layers/deepseek_compressor.py
+++ b/vllm/patches/layers/deepseek_compressor.py
@@ -15,6 +15,32 @@ from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
 )
 from vllm.platforms import current_platform
+
+
+def _detect_blackwell() -> bool:
+    """Return True when the device reports compute capability >= 10.x.
+
+    Runs once at module load so the compressor can branch on a plain bool.
+    Detection is best-effort: a failed or empty capability query selects
+    the non-Blackwell path.
+    """
+    try:
+        capability = current_platform.get_device_capability()
+    except Exception:
+        # Best-effort probe: keep the default path, but leave a trace rather
+        # than hiding the failure.
+        import logging
+
+        logging.getLogger(__name__).warning(
+            "Could not query device capability; assuming non-Blackwell.",
+            exc_info=True,
+        )
+        return False
+    return capability is not None and capability.major >= 10
+
+
+_IS_BLACKWELL = _detect_blackwell()
+
 from vllm.triton_utils import tl, triton
 from vllm.v1.attention.backend import (
     AttentionBackend,
@@ -343,8 +369,7 @@ class DeepseekCompressor(nn.Module):
         # 2. Our Blackwell attention path handles everything separately
         # Instead, we just save the state (done above) and let the attention
         # path handle compression + RoPE + cache write + attention.
-        cap = current_platform.get_device_capability()
-        if cap is not None and cap.major >= 10:
+        if _IS_BLACKWELL:
             # Blackwell: state is already saved, skip fused kernel
             return
 