[Kernel][Model] logits_soft_cap for Gemma2 with flashinfer (#6051)

Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-04 16:35:51 -07:00
parent 81d7a50f24
commit 69ec3ca14c
6 changed files with 279 additions and 20 deletions
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -77,9 +77,9 @@ def get_attn_backend(
        return IpexAttnBackend
    elif backend == _Backend.FLASHINFER:
        logger.info("Using Flashinfer backend.")
-        logger.warning(("Flashinfer will be stuck on llma-2-7b,"
-                        " please avoid using Flashinfer as the"
-                        "backend when running on llma-2-7b."))
+        logger.warning(("Flashinfer will be stuck on llama-2-7b,"
+                        " please avoid using Flashinfer as the "
+                        "backend when running on llama-2-7b."))
        from vllm.attention.backends.flashinfer import FlashInferBackend
        return FlashInferBackend
    elif backend == _Backend.PALLAS: