Disable Cascade Attention for Batch Invariance (#32561)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
@@ -959,6 +959,18 @@ class VllmConfig:
|
||||
"when cudagraph_mode piecewise cudagraphs is used, "
|
||||
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
|
||||
)
|
||||
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
|
||||
|
||||
if (
|
||||
self.model_config
|
||||
and vllm_is_batch_invariant()
|
||||
and not self.model_config.disable_cascade_attn
|
||||
):
|
||||
self.model_config.disable_cascade_attn = True
|
||||
logger.warning_once(
|
||||
"Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
if self.parallel_config.use_ubatching:
|
||||
a2a_backend = self.parallel_config.all2all_backend
|
||||
|
||||
Reference in New Issue
Block a user