[Core][Kernels] Use FlashInfer backend for FP8 KV Cache when available. (#7798)

Co-authored-by: Simon Mo <simon.mo@hey.com>
Pavani Majety
2024-08-28 10:01:22 -07:00
committed by GitHub
parent ef9baee3c5
commit b98cc28f91
3 changed files with 249 additions and 12 deletions


@@ -226,6 +226,10 @@ def which_attn_to_use(
     elif kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
         logger.info(
             "Cannot use FlashAttention-2 backend for FP8 KV cache.")
+        logger.warning(
+            "Please use FlashInfer backend with FP8 KV Cache for "
+            "better performance by setting the environment variable "
+            "VLLM_ATTENTION_BACKEND=FLASHINFER")
         selected_backend = _Backend.XFORMERS
     elif block_size % 16 != 0:
         logger.info(
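
For reference, a minimal usage sketch (not part of this commit) of what the new warning points at: opting into the FlashInfer backend via the VLLM_ATTENTION_BACKEND environment variable when running with an FP8 KV cache. The model name is a placeholder.

# Minimal sketch, assuming vLLM's public LLM API; the model name below is a
# placeholder, not from this commit. The backend must be chosen via the
# environment before vLLM selects an attention backend.
import os

os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM, SamplingParams

# kv_cache_dtype="fp8" enables the FP8 KV cache this commit targets.
llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8")
print(llm.generate(["Hello, world"], SamplingParams(max_tokens=16)))

Without the environment variable set, the code path changed here falls back to the XFORMERS backend and logs the warning above.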