[Core][Kernels] Enable FP8 KV Cache with FlashInfer backend + BugFix for kv_cache_dtype=auto (#7985)

Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-08-29 11:53:11 -07:00
parent 3f60f2244e
commit 6b3421567d
3 changed files with 250 additions and 12 deletions
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -226,6 +226,10 @@ def which_attn_to_use(
        elif kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
            logger.info(
                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
+            logger.warning(
+                "Please use FlashInfer backend with FP8 KV Cache for "
+                "better performance by setting environment variable  "
+                "VLLM_ATTENTION_BACKEND=FLASHINFER")
            selected_backend = _Backend.XFORMERS
        elif block_size % 16 != 0:
            logger.info(