[Bugfix][Kernel] Add head size check for attention backend selection (#4944)

2024-05-22 03:33:25 +08:00
parent 14772eeb8e
commit 99eff67ba9
2 changed files with 21 additions and 7 deletions
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -34,11 +34,21 @@ def get_attn_backend(
                                 sliding_window, dtype, kv_cache_dtype,
                                 block_size)
    if backend == _Backend.FLASH_ATTN:
-        logger.info("Using FlashAttention-2 backend.")
        from vllm.attention.backends.flash_attn import (  # noqa: F401
            FlashAttentionBackend)
-        return FlashAttentionBackend
-    elif backend == _Backend.XFORMERS:
+
+        # We check it here, not in _which_attn_to_use, because the supported
+        # head sizes are not known until FlashAttentionBackend is imported.
+        supported_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
+        if head_size in supported_head_sizes:
+            logger.info("Using FlashAttention-2 backend.")
+            return FlashAttentionBackend
+        logger.info(
+            "Cannot use FlashAttention-2 backend for head size %d. "
+            "Using XFormers backend instead.", head_size)
+        backend = _Backend.XFORMERS
+
+    if backend == _Backend.XFORMERS:
        logger.info("Using XFormers backend.")
        from vllm.attention.backends.xformers import (  # noqa: F401
            XFormersBackend)