Disable Cascade Attention for Batch Invariance (#32561)

Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-01-30 07:00:46 -08:00
parent ae5b7aff2b
commit 8f5d51203b
6 changed files with 60 additions and 9 deletions
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -959,6 +959,18 @@ class VllmConfig:
                    "when cudagraph_mode piecewise cudagraphs is used, "
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                )
+        from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+
+        if (
+            self.model_config
+            and vllm_is_batch_invariant()
+            and not self.model_config.disable_cascade_attn
+        ):
+            self.model_config.disable_cascade_attn = True
+            logger.warning_once(
+                "Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
+                scope="local",
+            )

        if self.parallel_config.use_ubatching:
            a2a_backend = self.parallel_config.all2all_backend