[Misc] Use vllm-flash-attn instead of flash-attn (#4686)
@@ -76,11 +76,12 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
         return _Backend.XFORMERS
 
     try:
-        import flash_attn  # noqa: F401
+        import vllm_flash_attn  # noqa: F401
     except ImportError:
         logger.info(
-            "Cannot use FlashAttention-2 backend because the flash_attn "
-            "package is not found. Please install it for better performance.")
+            "Cannot use FlashAttention-2 backend because the vllm_flash_attn "
+            "package is not found. `pip install vllm-flash-attn` for better "
+            "performance.")
         return _Backend.XFORMERS
 
     backend_by_env_var = envs.VLLM_ATTENTION_BACKEND
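For context, the change keeps the existing fallback behavior: when the new vllm_flash_attn package cannot be imported, the backend selector logs a hint and still returns _Backend.XFORMERS. Below is a minimal, self-contained sketch of that import-probe pattern; the helper name pick_attention_backend and the plain string return values are illustrative assumptions, not vLLM's actual API.

import importlib.util


def pick_attention_backend() -> str:
    # Probe for the vllm-flash-attn wheel without importing it eagerly;
    # fall back to the xFormers backend when it is not installed.
    # (Hypothetical helper for illustration, not vLLM code.)
    if importlib.util.find_spec("vllm_flash_attn") is not None:
        return "FLASH_ATTN"
    return "XFORMERS"


if __name__ == "__main__":
    # Prints "FLASH_ATTN" only when `pip install vllm-flash-attn`
    # has been run in the current environment.
    print(pick_attention_backend())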