[compile] Enable sequence parallelism for full cuda graph without specifying compile sizes (#26681)

Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-13 18:15:34 -07:00
parent 3e051bda82
commit b59dd19b55
5 changed files with 34 additions and 5 deletions
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -71,9 +71,11 @@ class PostGradPassManager(CustomGraphPass):

        shape = get_pass_context().runtime_shape
        for pass_ in self.passes:
-            if pass_.is_applicable_for_shape(shape):
+            if pass_.is_applicable(shape):
                pass_(graph)
                VllmInductorPass.dump_prefix += 1
+            else:
+                logger.debug("Skipping %s with shape %s", pass_, shape)

        # post-cleanup goes before fix_functionalization
        # because it requires a functional graph