[v1] Add encoder-only/cross attention support to Triton Attention backend (#31406)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Author: Isotr0py
Date: 2026-01-06 00:00:23 +08:00
Committed by: GitHub
Parent: 911d38ed99
Commit: 6aa5b18e1d
6 changed files with 627 additions and 14 deletions
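For context (not part of the diff below): the commit title names two non-causal attention patterns that the Triton Unified Attention backend now covers, which is why the ROCm platform no longer needs to fall back to FlexAttention for encoder-only models. The following is a minimal plain-PyTorch sketch of those patterns using `torch.nn.functional.scaled_dot_product_attention`; it illustrates the attention semantics only and is not the Triton kernel added by this PR.

```python
import torch
import torch.nn.functional as F

# Shapes: (batch, num_heads, seq_len, head_dim)
B, H, S_q, S_kv, D = 1, 2, 4, 6, 8
q = torch.randn(B, H, S_q, D)

# Encoder-only self-attention: queries and keys/values come from the same
# sequence and every token may attend to every other token (no causal mask).
k = v = torch.randn(B, H, S_q, D)
encoder_only_out = F.scaled_dot_product_attention(q, k, v, is_causal=False)

# Decoder self-attention, for contrast: a causal mask restricts each token
# to attending only to positions at or before it.
causal_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# Cross-attention: queries come from one sequence (e.g. the decoder) while
# keys/values come from another (e.g. the encoder output), so query and
# key/value lengths can differ and no causal mask is applied.
k_enc = torch.randn(B, H, S_kv, D)
v_enc = torch.randn(B, H, S_kv, D)
cross_out = F.scaled_dot_product_attention(q, k_enc, v_enc, is_causal=False)

print(encoder_only_out.shape, causal_out.shape, cross_out.shape)
```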


@@ -8,7 +8,6 @@ from typing import TYPE_CHECKING, Optional
 import torch
 import vllm.envs as envs
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.utils.torch_utils import cuda_device_count_stateless
@@ -289,14 +288,6 @@ class RocmPlatform(Platform):
             logger.info("Using Aiter Flash Attention backend.")
             return AttentionBackendEnum.ROCM_AITER_FA.get_path()
-        # Priority 5: If model is Encoder-only self-attention type
-        if (
-            attn_selector_config.attn_type is not None
-            and attn_selector_config.attn_type == AttentionType.ENCODER_ONLY
-        ):
-            logger.info("Using FlexAttention backend.")
-            return AttentionBackendEnum.FLEX_ATTENTION.get_path()
         # Default: Triton Unified Attention
         logger.info("Using Triton Attention backend.")
         return AttentionBackendEnum.TRITON_ATTN.get_path()
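The net effect of this hunk is that encoder-only models on ROCm no longer branch to FlexAttention and instead take the Triton default. A simplified stand-in for that selection change is sketched below; the `Backend` enum and `select_backend_*` helpers are hypothetical illustrations, not vLLM API.

```python
from enum import Enum, auto
from typing import Optional


class Backend(Enum):
    FLEX_ATTENTION = auto()
    TRITON_ATTN = auto()


def select_backend_before(attn_type: Optional[str]) -> Backend:
    # Old behavior: encoder-only attention was special-cased to FlexAttention.
    if attn_type == "encoder_only":
        return Backend.FLEX_ATTENTION
    return Backend.TRITON_ATTN


def select_backend_after(attn_type: Optional[str]) -> Backend:
    # New behavior: the special case is removed, so encoder-only (and cross)
    # attention falls through to the Triton Unified Attention default.
    return Backend.TRITON_ATTN


assert select_backend_before("encoder_only") is Backend.FLEX_ATTENTION
assert select_backend_after("encoder_only") is Backend.TRITON_ATTN
```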