Revert "[Attention][MLA] Make FLASHINFER_MLA the default MLA backend" (#32484)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
This commit is contained in:
@@ -450,6 +450,7 @@ def use_flashinfer_prefill() -> bool:
        not vllm_config.attention_config.disable_flashinfer_prefill
        and flashinfer_available
        and not vllm_config.attention_config.use_cudnn_prefill
        and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
        and current_platform.is_device_capability_family(100)
    )
@@ -1323,27 +1324,25 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
if use_trtllm_ragged_deepseek_prefill():
|
||||
logger.info_once(
|
||||
"Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local"
|
||||
)
|
||||
if use_flashinfer_prefill():
|
||||
logger.debug_once("Using FlashInfer prefill for MLA")
|
||||
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
|
||||
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
|
||||
self._pad_v = False
|
||||
elif use_trtllm_ragged_deepseek_prefill():
|
||||
logger.debug_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
|
||||
self._run_prefill_context_chunk = (
|
||||
self._run_prefill_context_chunk_trtllm_ragged
|
||||
)
|
||||
self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
|
||||
self._pad_v = False
|
||||
elif use_flashinfer_prefill():
|
||||
logger.info_once("Using FlashInfer prefill for MLA")
|
||||
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
|
||||
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
|
||||
self._pad_v = False
|
||||
elif use_cudnn_prefill():
|
||||
logger.info_once("Using CUDNN prefill for MLA", scope="local")
|
||||
logger.debug_once("Using CUDNN prefill for MLA")
|
||||
self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
|
||||
self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
|
||||
self._pad_v = False
|
||||
else: # Use FlashAttention
|
||||
logger.info_once("Using FlashAttention prefill for MLA", scope="local")
|
||||
logger.debug_once("Using FlashAttention prefill for MLA")
|
||||
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
|
||||
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa
|
||||
|
||||
|
||||
Reference in New Issue
Block a user