Disable Cascade Attention for Batch Invariance (#32561)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com> Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
@@ -7,7 +7,6 @@ import pytest

import torch

from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer
from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla

skip_unsupported = pytest.mark.skipif(
@@ -22,8 +21,10 @@ BACKENDS: list[str] = [
    "TRITON_MLA",
]

if has_flashinfer():
    BACKENDS.append("FLASHINFER")
# FlashInfer temporarily disabled due to invariant CTA sizes.
# See FlashInfer issue #2424
# if has_flashinfer():
#     BACKENDS.append("FLASHINFER")

if flash_attn_supports_mla():
    BACKENDS.append("FLASH_ATTN_MLA")
@@ -78,9 +79,10 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
        # For longer prompts, repeat context
        padding_text = (
            " This is an interesting topic that deserves more explanation. "
            # TODO: Update to * (target_words // 10) to better align with word ratio
            * (target_words // 50)
        )
        base_prompt = base_prompt + padding_text
        base_prompt = padding_text + base_prompt

    return base_prompt
Reference in New Issue
Block a user