[Kernel] Build flash-attn from source (#8245)

Author: Luka Govedič
Date: 2024-09-21 02:27:10 -04:00
Committed by: GitHub
Parent: 0faab90eb0
Commit: 71c60491f2
9 changed files with 124 additions and 41 deletions


@@ -244,8 +244,7 @@ def which_attn_to_use(
     # FlashAttn is valid for the model, checking if the package is installed.
     if selected_backend == _Backend.FLASH_ATTN:
         try:
-            import vllm_flash_attn  # noqa: F401
+            import vllm.vllm_flash_attn  # noqa: F401
             from vllm.attention.backends.flash_attn import (  # noqa: F401
                 FlashAttentionBackend)
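
The probe is a bare import marked `# noqa: F401`: the module is imported only to confirm it exists, not to use any of its names. Because this commit builds flash-attn from source and ships it inside the vllm package, the module now lives at `vllm.vllm_flash_attn` rather than in the standalone `vllm_flash_attn` distribution. Below is a minimal sketch of an equivalent availability check; the helper name `flash_attn_available` is hypothetical and not part of this diff:

import importlib.util

def flash_attn_available() -> bool:
    # Hypothetical helper, not from the diff: find_spec locates the
    # module without executing it (it does import the parent "vllm"
    # package as a side effect of resolving the dotted name).
    return importlib.util.find_spec("vllm.vllm_flash_attn") is not None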
@@ -258,8 +257,9 @@ def which_attn_to_use(
         except ImportError:
             logger.info(
                 "Cannot use FlashAttention-2 backend because the "
-                "vllm_flash_attn package is not found. "
-                "`pip install vllm-flash-attn` for better performance.")
+                "vllm.vllm_flash_attn package is not found. "
+                "Make sure that vllm_flash_attn was built and installed "
+                "(on by default).")
             selected_backend = _Backend.XFORMERS
     return selected_backend
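
The two hunks together implement a probe-and-fall-back pattern: attempt the import, and on ImportError log the reason and silently downgrade to the XFORMERS backend. A self-contained sketch of that pattern is below; `_Backend`, `logger`, and the import mirror the names in this diff, but the surrounding scaffolding is illustrative, not the actual vllm implementation:

import enum
import logging

logger = logging.getLogger(__name__)

class _Backend(enum.Enum):
    FLASH_ATTN = enum.auto()
    XFORMERS = enum.auto()

def which_attn_to_use(selected_backend: _Backend) -> _Backend:
    if selected_backend == _Backend.FLASH_ATTN:
        try:
            # Presence check only; the name is intentionally unused.
            import vllm.vllm_flash_attn  # noqa: F401
        except ImportError:
            logger.info(
                "Cannot use FlashAttention-2 backend because the "
                "vllm.vllm_flash_attn package is not found. "
                "Make sure that vllm_flash_attn was built and installed "
                "(on by default).")
            selected_backend = _Backend.XFORMERS
    return selected_backend

Falling back to XFORMERS rather than raising keeps startup working on installs where the flash-attn kernel was not built, at the cost of slower attention.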