[V1] Enable Triton(ROCm) Attention backend for Nvidia GPUs (#14071)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Isotr0py
Date: 2025-03-21 11:14:19 +08:00 (committed by GitHub)
Parent: b15fd2be2a
Commit: f8a08cb90d
5 changed files with 23 additions and 16 deletions


@@ -213,9 +213,14 @@ class CudaPlatformBase(Platform):
             return ("vllm.attention.backends."
                     "flashmla.FlashMLABackend")
         if use_v1:
-            logger.info_once("Using Flash Attention backend on V1 engine.")
-            return ("vllm.v1.attention.backends.flash_attn."
-                    "FlashAttentionBackend")
+            if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
+                logger.info_once("Using Triton backend on V1 engine.")
+                return ("vllm.v1.attention.backends."
+                        "triton_attn.TritonAttentionBackend")
+            if cls.has_device_capability(80):
+                logger.info_once("Using Flash Attention backend on V1 engine.")
+                return ("vllm.v1.attention.backends."
+                        "flash_attn.FlashAttentionBackend")
         if selected_backend == _Backend.FLASHINFER:
             logger.info("Using FlashInfer backend.")
             return "vllm.attention.backends.flashinfer.FlashInferBackend"


@@ -29,6 +29,7 @@ def in_wsl() -> bool:
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
     FLASH_ATTN_VLLM_V1 = enum.auto()
+    TRITON_ATTN_VLLM_V1 = enum.auto()
     XFORMERS = enum.auto()
     ROCM_FLASH = enum.auto()
     TORCH_SDPA = enum.auto()


@@ -120,8 +120,9 @@ class RocmPlatform(Platform):
         selected_backend = (_Backend.ROCM_FLASH if selected_backend
                             == _Backend.FLASH_ATTN else selected_backend)
         if envs.VLLM_USE_V1:
-            logger.info("Using ROCm Attention backend on V1 engine.")
-            return "vllm.v1.attention.backends.rocm_attn.ROCmAttentionBackend"
+            logger.info("Using Triton Attention backend on V1 engine.")
+            return ("vllm.v1.attention.backends."
+                    "triton_attn.TritonAttentionBackend")
         if selected_backend == _Backend.ROCM_FLASH:
             if not cls.has_device_capability(90):
                 # not Instinct series GPUs.
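Each branch above returns the backend as a dotted path string rather than a class, so both platforms can now point at the same triton_attn module. A rough illustration of how such a string resolves to a backend class; vLLM uses its own resolver internally, this only conveys the general idea.

    # Illustration only: turn a dotted backend path, as returned above, into a class.
    import importlib

    path = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
    module_name, class_name = path.rsplit(".", 1)
    backend_cls = getattr(importlib.import_module(module_name), class_name)
    print(backend_cls.__name__)  # TritonAttentionBackend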