Revert "[Kernel] Use flash-attn for decoding (#3648)" (#4820)

The LoRA 3 & 4 tests appear to hit an illegal memory access failure after this commit:

[2024-05-14 23:51:18,182 E 22 22] logging.cc:101: Unhandled exception: N3c105ErrorE. what(): CUDA error: an illegal memory access was encountered
Example: https://buildkite.com/vllm/ci/builds/7382#018f793d-1527-4e1c-ab59-c3a34ec55241

This reverts commit 1356df5.

Author: SangBin Cho
Date: 2024-05-15 11:52:45 +09:00 (committed by GitHub)
Parent: 29bc01bf3b
Commit: 8a7cc254a0
6 changed files with 65 additions and 313 deletions


@@ -93,20 +93,6 @@ def _which_attn_to_use(
"torch.float16 or torch.bfloat16.")
return _Backend.XFORMERS
if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
logger.info("Cannot use FlashAttention-2 backend for FP8 KV cache.")
return _Backend.XFORMERS
if block_size % 16 != 0:
logger.info("Cannot use FlashAttention-2 backend for block size not "
"divisible by 16.")
return _Backend.XFORMERS
if sliding_window is not None:
logger.info(
"Cannot use FlashAttention-2 backend due to sliding window.")
return _Backend.XFORMERS
try:
import vllm_flash_attn # noqa: F401
except ImportError:
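
For context, below is a minimal, self-contained sketch (not the actual vllm source) of the FlashAttention-2 eligibility checks that this revert removes from `_which_attn_to_use`. The `_Backend` enum stub and the simplified function signature are assumptions for illustration only; the individual checks and log messages follow the diff above.

```python
# Sketch of the fallback checks removed by this revert (assumed structure,
# not the real vllm module). Each check routes the selector to xFormers
# when the FlashAttention-2 decode path cannot handle the configuration.
import logging
from enum import Enum, auto
from typing import Optional

logger = logging.getLogger(__name__)


class _Backend(Enum):
    FLASH_ATTN = auto()
    XFORMERS = auto()


def select_backend_sketch(kv_cache_dtype: Optional[str],
                          block_size: int,
                          sliding_window: Optional[int]) -> _Backend:
    """Fall back to xFormers whenever FlashAttention-2 cannot be used."""
    if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
        logger.info("Cannot use FlashAttention-2 backend for FP8 KV cache.")
        return _Backend.XFORMERS
    if block_size % 16 != 0:
        logger.info("Cannot use FlashAttention-2 backend for block size not "
                    "divisible by 16.")
        return _Backend.XFORMERS
    if sliding_window is not None:
        logger.info(
            "Cannot use FlashAttention-2 backend due to sliding window.")
        return _Backend.XFORMERS
    try:
        import vllm_flash_attn  # noqa: F401
    except ImportError:
        logger.info("Cannot use FlashAttention-2 backend because the "
                    "vllm_flash_attn package is not found.")
        return _Backend.XFORMERS
    return _Backend.FLASH_ATTN
```

With the revert applied, these fallback checks disappear along with the flash-attn decode path they guarded, restoring the selection behavior from before #3648.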