diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index d091a4e96..96429e29b 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -572,6 +572,10 @@ class TritonAttentionImpl(AttentionImpl):
         kv_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
     ):
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            # For encoder attention,
+            # we use direct Q, K, V tensors without caching
+            return
         # For decoder and cross-attention, use KV cache as before
         key_cache, value_cache = kv_cache.unbind(1)