Disable Cascade Attention for Batch Invariance (#32561)

Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
Frank Wang
2026-01-30 07:00:46 -08:00
committed by GitHub
parent ae5b7aff2b
commit 8f5d51203b
6 changed files with 60 additions and 9 deletions

View File

@@ -959,6 +959,18 @@ class VllmConfig:
"when cudagraph_mode piecewise cudagraphs is used, "
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
)
# Batch-invariance guard: force cascade attention off whenever batch-invariant
# mode is reported as active.
# NOTE(review): the import sits inside the body rather than at file top —
# presumably to avoid an import cycle with the config module; confirm.
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
# Act only when (1) a model config is present, (2) batch-invariant mode is on,
# and (3) cascade attention has not already been disabled. Condition (3) keeps
# the warning from being emitted for configs that already opted out.
if (
self.model_config
and vllm_is_batch_invariant()
and not self.model_config.disable_cascade_attn
):
# Overrides the setting on the model config in place.
self.model_config.disable_cascade_attn = True
# Tell the user their setting was overridden.
# NOTE(review): `warning_once` / scope="local" presumably de-duplicate the
# message per process — verify against the logger implementation.
logger.warning_once(
"Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
scope="local",
)
if self.parallel_config.use_ubatching:
a2a_backend = self.parallel_config.all2all_backend