[Kernel][Model] logits_soft_cap for Gemma2 with flashinfer (#6051)
Co-authored-by: Simon Mo <simon.mo@hey.com>
This commit is contained in:
@@ -77,9 +77,9 @@ def get_attn_backend(
         return IpexAttnBackend
     elif backend == _Backend.FLASHINFER:
         logger.info("Using Flashinfer backend.")
-        logger.warning(("Flashinfer will be stuck on llma-2-7b,"
-                        " please avoid using Flashinfer as the"
-                        "backend when running on llma-2-7b."))
+        logger.warning(("Flashinfer will be stuck on llama-2-7b,"
+                        " please avoid using Flashinfer as the "
+                        "backend when running on llama-2-7b."))
         from vllm.attention.backends.flashinfer import FlashInferBackend
         return FlashInferBackend
     elif backend == _Backend.PALLAS:
Reference in New Issue
Block a user