[Feature] support sequence parallelism using compilation pass (#16155)

Signed-off-by: cascade812 <cascade812@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-04-27 06:29:35 -07:00
parent ed7a29d9f8
commit 690fe019f0
21 changed files with 1072 additions and 44 deletions
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -15,6 +15,8 @@ import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.utils import is_torch_equal_or_newer

+from .inductor_pass import pass_context
+

 class CompilerInterface:
    """
@@ -312,11 +314,12 @@ class InductorAdaptor(CompilerInterface):
                    torch._functorch.config.patch(
                        enable_remote_autograd_cache=False))

-            compiled_graph = compile_fx(
-                graph,
-                example_inputs,
-                inner_compile=hijacked_compile_fx_inner,
-                config_patches=current_config)
+            with pass_context(runtime_shape):
+                compiled_graph = compile_fx(
+                    graph,
+                    example_inputs,
+                    inner_compile=hijacked_compile_fx_inner,
+                    config_patches=current_config)

        # We treat VLLM_DISABLE_COMPILE_CACHE as the overall switch for torch
        # compilation cache. So turn off the checks if we disable the