[Kernel][Model] logits_soft_cap for Gemma2 with flashinfer (#6051)

Co-authored-by: Simon Mo <simon.mo@hey.com>
This commit is contained in:
Lily Liu
2024-07-04 16:35:51 -07:00
committed by GitHub
parent 81d7a50f24
commit 69ec3ca14c
6 changed files with 279 additions and 20 deletions

View File

@@ -77,9 +77,9 @@ def get_attn_backend(
return IpexAttnBackend
elif backend == _Backend.FLASHINFER:
logger.info("Using Flashinfer backend.")
-logger.warning(("Flashinfer will be stuck on llma-2-7b,"
-" please avoid using Flashinfer as the"
-"backend when running on llma-2-7b."))
+logger.warning(("Flashinfer will be stuck on llama-2-7b,"
+" please avoid using Flashinfer as the "
+"backend when running on llama-2-7b."))
from vllm.attention.backends.flashinfer import FlashInferBackend
return FlashInferBackend
elif backend == _Backend.PALLAS: