Revert "[Core][Kernels] Use FlashInfer backend for FP8 KV Cache when available." (#7982)

2024-08-28 21:27:06 -07:00
parent 74d5543ec5
commit ef99a78760
3 changed files with 12 additions and 249 deletions
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -226,10 +226,6 @@ def which_attn_to_use(
        elif kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
            logger.info(
                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
-            logger.warning(
-                "Please use FlashInfer backend with FP8 KV Cache for "
-                "better performance by set environment "
-                "VLLM_ATTENTION_BACKEND=FLASHINFER")
            selected_backend = _Backend.XFORMERS
        elif block_size % 16 != 0:
            logger.info(