The LoRA 3 & 4 tests appear to fail with an illegal memory access after this commit:
[2024-05-14 23:51:18,182 E 22 22] logging.cc:101: Unhandled exception: N3c105ErrorE. what(): CUDA error: an illegal memory access was encountered
Example: https://buildkite.com/vllm/ci/builds/7382#018f793d-1527-4e1c-ab59-c3a34ec55241
This reverts commit 1356df5.
```diff
@@ -93,20 +93,6 @@ def _which_attn_to_use(
                     "torch.float16 or torch.bfloat16.")
         return _Backend.XFORMERS
 
-    if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
-        logger.info("Cannot use FlashAttention-2 backend for FP8 KV cache.")
-        return _Backend.XFORMERS
-
-    if block_size % 16 != 0:
-        logger.info("Cannot use FlashAttention-2 backend for block size not "
-                    "divisible by 16.")
-        return _Backend.XFORMERS
-
-    if sliding_window is not None:
-        logger.info(
-            "Cannot use FlashAttention-2 backend due to sliding window.")
-        return _Backend.XFORMERS
-
     try:
         import vllm_flash_attn  # noqa: F401
     except ImportError:
```
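For context, the lines removed by this revert implement an early-return fallback: each configuration that FlashAttention-2 cannot handle logs the reason and selects the xFormers backend instead. Below is a minimal, self-contained sketch of that pattern; the simplified signature, the standalone `which_attn_to_use` name, and the final `FLASH_ATTN` return are assumptions for illustration, not the actual vLLM selector.

```python
import enum
import logging

logger = logging.getLogger(__name__)


class _Backend(enum.Enum):
    FLASH_ATTN = enum.auto()
    XFORMERS = enum.auto()


def which_attn_to_use(kv_cache_dtype, block_size, sliding_window):
    # Hypothetical, simplified selector mirroring the reverted checks:
    # every unsupported case logs why and falls back to xFormers.
    if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
        logger.info("Cannot use FlashAttention-2 backend for FP8 KV cache.")
        return _Backend.XFORMERS
    if block_size % 16 != 0:
        logger.info("Cannot use FlashAttention-2 backend for block size not "
                    "divisible by 16.")
        return _Backend.XFORMERS
    if sliding_window is not None:
        logger.info("Cannot use FlashAttention-2 backend due to sliding window.")
        return _Backend.XFORMERS
    try:
        # Import only to verify the package is installed.
        import vllm_flash_attn  # noqa: F401
    except ImportError:
        logger.info("vllm_flash_attn is not installed; falling back to xFormers.")
        return _Backend.XFORMERS
    return _Backend.FLASH_ATTN
```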