[Bugfix][Kernel] Fix CUDA 11.8 being broken by FA3 build (#12375)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Author: Lucas Wilkinson
Date: 2025-01-24 10:27:59 -05:00
Committed by: GitHub
Parent: 3bb8e2c9a2
Commit: ab5bbf5ae3
6 changed files with 42 additions and 22 deletions

tests/kernels/test_cascade_flash_attn.py Normal file → Executable file

@@ -6,7 +6,9 @@ import torch
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import (cascade_attention,
                                                     merge_attn_states)
-from vllm.vllm_flash_attn import flash_attn_varlen_func
+from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
+                                  flash_attn_varlen_func,
+                                  is_fa_version_supported)
 
 NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
 HEAD_SIZES = [128, 192, 256]
@@ -91,10 +93,9 @@ def test_cascade(
     fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
-    if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6)
-                            or torch.cuda.get_device_capability() == (8, 9)):
-        pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to "
-                    "insufficient shared memory for some shapes")
+    if not is_fa_version_supported(fa_version):
+        pytest.skip(f"Flash attention version {fa_version} not supported due "
+                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
 
     current_platform.seed_everything(0)
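
For context, the diff replaces the hard-coded compute-capability check with the new version-support helpers exported by vllm.vllm_flash_attn, so tests skip cleanly on builds where FA3 is unavailable (e.g. CUDA 11.8). Below is a minimal sketch of that skip pattern in an isolated test; the helper names, signatures, and skip message come from the diff, while the test name, parametrization, and placeholder body are illustrative assumptions:

```python
import pytest

from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
                                  flash_attn_varlen_func,
                                  is_fa_version_supported)


@pytest.mark.parametrize("fa_version", [2, 3])
def test_fa_version_gate(fa_version: int) -> None:
    # Skip rather than fail when this vllm_flash_attn build was not compiled
    # with the requested FA version (e.g. FA3 missing on a CUDA 11.8 build).
    if not is_fa_version_supported(fa_version):
        pytest.skip(f"Flash attention version {fa_version} not supported due "
                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")

    # ... exercise flash_attn_varlen_func with the requested fa_version here ...
```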