[compile] Enable sequence parallelism for full CUDA graph without specifying compile sizes (#26681)

Signed-off-by: angelayi <yiangela7@gmail.com>
Author: Angela Yi
Date: 2025-10-13 18:15:34 -07:00 (committed by GitHub)
Parent: 3e051bda82
Commit: b59dd19b55
5 changed files with 34 additions and 5 deletions

@@ -482,7 +482,25 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
         ).register(self.patterns)
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable_for_shape(self, shape: int | None) -> bool:
+    def is_applicable(self, shape: int | None) -> bool:
+        # When sequence parallelism is enabled, the residual tensor from RMSNorm
+        # needs to be split along the sequence dimension. However, this dimension
+        # is symbolic during piecewise compilation, and splitting symbolic shapes
+        # is not supported.
+        #
+        # This pass is therefore only applied when the sequence dimension is
+        # concrete:
+        # 1. In full-graph compilation mode (no Dynamo splitting ops are used).
+        #    In this case num_tokens is always padded to a multiple of
+        #    tensor_parallel_size, so there is no need to check
+        #    shape % tp_size == 0.
+        # 2. For a specific shape provided at compile time (e.g., from
+        #    `compile_sizes`), which must be divisible by the tensor-parallel
+        #    size.
+        if (
+            not self.compilation_config.splitting_ops
+            or self.compilation_config.use_inductor_graph_partition
+        ):
+            return True
+        tp_size = get_tensor_model_parallel_world_size()
+        return shape is not None and shape % tp_size == 0
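
As a rough illustration (not part of this commit), the sketch below shows why the concrete-shape requirement exists: sequence parallelism scatters the residual tensor along the token dimension in equal chunks, one per tensor-parallel rank, so the token count must divide evenly by the tensor-parallel size; full-graph mode sidesteps the check by padding num_tokens up front. The helper names here (`split_residual`, `pad_num_tokens`) are hypothetical, not the pass's actual code.

import torch

def split_residual(residual: torch.Tensor, tp_size: int) -> list[torch.Tensor]:
    # Hypothetical sketch: one equal-sized chunk of the residual per
    # tensor-parallel rank. An uneven split would leave ranks with
    # mismatched shapes, hence the shape % tp_size == 0 guard above.
    num_tokens = residual.shape[0]
    assert num_tokens % tp_size == 0, "num_tokens must divide evenly by tp_size"
    return list(torch.chunk(residual, tp_size, dim=0))

def pad_num_tokens(num_tokens: int, tp_size: int) -> int:
    # In full-graph mode the token count is padded to a multiple of the
    # tensor-parallel size up front, making the divisibility check moot.
    return -(-num_tokens // tp_size) * tp_size  # ceil to a multiple of tp_size

residual = torch.randn(pad_num_tokens(14, 4), 1024)  # 14 tokens padded to 16
chunks = split_residual(residual, tp_size=4)
assert all(c.shape == (4, 1024) for c in chunks)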