[Attention] Refactor FA block_size limitations to hybrid models only (#29084)
Signed-off-by: NickLucche <nlucches@redhat.com>
@@ -154,7 +154,6 @@ class TritonAttentionBackend(AttentionBackend):
         torch.bfloat16,
         torch.float32,
     ]
-    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)]
     supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
         "auto",
         "fp8",
@@ -162,6 +161,10 @@ class TritonAttentionBackend(AttentionBackend):
         "fp8_e5m2",
     ]
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        return [MultipleOf(16)]
+
     @staticmethod
     def get_name() -> str:
         return "TRITON_ATTN"
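For context, the pattern in this diff moves the backend's supported kernel block sizes from a class attribute to a static method that callers can query. Below is a minimal, self-contained sketch of that pattern, not vLLM's actual implementation: `MultipleOf` is a simplified stand-in for vLLM's helper, and `is_block_size_supported` is a hypothetical caller added purely for illustration.

    # Sketch only: simplified stand-ins for the types named in the diff.
    from dataclasses import dataclass


    @dataclass(frozen=True)
    class MultipleOf:
        """Marks every positive multiple of `base` as a supported block size."""
        base: int


    class TritonAttentionBackend:
        @staticmethod
        def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
            # As in the diff: the Triton kernel accepts any multiple of 16.
            return [MultipleOf(16)]

        @staticmethod
        def get_name() -> str:
            return "TRITON_ATTN"


    def is_block_size_supported(backend, block_size: int) -> bool:
        # Hypothetical helper: check a concrete block size against the
        # backend's declared constraints (exact ints or MultipleOf specs).
        for spec in backend.get_supported_kernel_block_sizes():
            if isinstance(spec, MultipleOf):
                if block_size % spec.base == 0:
                    return True
            elif block_size == spec:
                return True
        return False


    if __name__ == "__main__":
        print(is_block_size_supported(TritonAttentionBackend, 32))  # True
        print(is_block_size_supported(TritonAttentionBackend, 24))  # False

One design note: exposing the constraint through a method rather than a ClassVar lets a backend compute supported block sizes dynamically (e.g., per model or hardware configuration), which fits this commit's goal of scoping the block-size limitation to hybrid models only.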