[V1] Enable Triton(ROCm) Attention backend for Nvidia GPUs (#14071)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Authored by Isotr0py on 2025-03-21 11:14:19 +08:00, committed by GitHub
parent b15fd2be2a
commit f8a08cb90d
5 changed files with 23 additions and 16 deletions

vllm/platforms/rocm.py

@@ -120,8 +120,9 @@ class RocmPlatform(Platform):
         selected_backend = (_Backend.ROCM_FLASH if selected_backend
                             == _Backend.FLASH_ATTN else selected_backend)
         if envs.VLLM_USE_V1:
-            logger.info("Using ROCm Attention backend on V1 engine.")
-            return "vllm.v1.attention.backends.rocm_attn.ROCmAttentionBackend"
+            logger.info("Using Triton Attention backend on V1 engine.")
+            return ("vllm.v1.attention.backends."
+                    "triton_attn.TritonAttentionBackend")
         if selected_backend == _Backend.ROCM_FLASH:
             if not cls.has_device_capability(90):
                 # not Instinct series GPUs.
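
For reference, the string returned above is a fully qualified class path rather than a class object; the engine later imports it to obtain the backend class. Below is a minimal sketch of that resolution step, written with importlib purely for illustration (the helper vLLM actually uses for this lookup is not part of the hunk shown, so the function name here is hypothetical):

    import importlib

    def resolve_backend_cls(qualname: str):
        # Split "package.module.ClassName" into the module path and the
        # class name, import the module, and return the class attribute.
        module_name, _, class_name = qualname.rpartition(".")
        return getattr(importlib.import_module(module_name), class_name)

    # Usage with the path returned by the patched RocmPlatform (requires a
    # vLLM build that includes this commit; any other importable dotted
    # class path resolves the same way).
    backend_cls = resolve_backend_cls(
        "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend")

The commit title indicates that the remaining changed files wire this same Triton backend up for NVIDIA GPUs on the V1 engine; those hunks are not shown here.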