[Misc] Log the reason for falling back to FlexAttention (#20699)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-14 19:16:51 +08:00
parent a4851cfe68
commit e8cc53af5e
10 changed files with 105 additions and 33 deletions
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -37,6 +37,10 @@ logger = init_logger(__name__)
 class TorchSDPABackend(AttentionBackend):
    accept_output_buffer: bool = False

+    @classmethod
+    def get_supported_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16, torch.float32]
+
    @classmethod
    def validate_head_size(cls, head_size: int) -> None:
        attn_impl = _get_paged_attn_impl()
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -44,6 +44,10 @@ class FlashAttentionBackend(AttentionBackend):

    accept_output_buffer: bool = True

+    @classmethod
+    def get_supported_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [32, 64, 96, 128, 160, 192, 224, 256]
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -42,6 +42,10 @@ class FlashInferBackend(AttentionBackend):
    accept_output_buffer: bool = True
    cached_sm100a_supported: Optional[bool] = None

+    @classmethod
+    def get_supported_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -42,6 +42,10 @@ def _offsets_to_doc_ids_tensor(offsets: torch.Tensor) -> torch.Tensor:
 class FlexAttentionBackend(AttentionBackend):
    accept_output_buffer: bool = True

+    @classmethod
+    def get_supported_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16, torch.float32]
+
    @classmethod
    def validate_head_size(cls, head_size: int) -> None:
        return  # FlexAttention supports any head size
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -262,6 +262,10 @@ class MLACommonBackend(AttentionBackend):
    ) -> tuple[int, ...]:
        return (num_blocks, block_size, head_size)

+    @classmethod
+    def get_supported_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [576]
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -314,6 +314,10 @@ class AiterFlashAttentionBackend(AttentionBackend):

    accept_output_buffer: bool = True

+    @classmethod
+    def get_supported_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [32, 64, 96, 128, 160, 192, 224, 256]
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -190,6 +190,10 @@ class TritonAttentionBackend(AttentionBackend):

    accept_output_buffer: bool = True

+    @classmethod
+    def get_supported_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [32, 64, 96, 128, 160, 192, 224, 256]