[Kernel] Flashinfer for prefill & decode, with Cudagraph support for decode (#4628)

Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com>, bong-furiosa <bongwon.jang@furiosa.ai>
This commit is contained in:
Lily Liu
2024-06-28 15:28:49 -07:00
committed by GitHub
parent 6a62cb82cc
commit 7041de4384
7 changed files with 313 additions and 117 deletions

View File

@@ -77,8 +77,9 @@ def get_attn_backend(
return IpexAttnBackend
elif backend == _Backend.FLASHINFER:
logger.info("Using Flashinfer backend.")
logger.warning("Eager mode is required for the Flashinfer backend. "
"Please make sure --enforce-eager is set.")
logger.warning(("Flashinfer will be stuck on llama-2-7b,"
" please avoid using Flashinfer as the"
" backend when running on llama-2-7b."))
from vllm.attention.backends.flashinfer import FlashInferBackend
return FlashInferBackend
elif backend == _Backend.PALLAS: