[Feature] support sequence parallelism using compilation pass (#16155)

Signed-off-by: cascade812 <cascade812@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
cascade
2025-04-27 06:29:35 -07:00
committed by GitHub
parent ed7a29d9f8
commit 690fe019f0
21 changed files with 1072 additions and 44 deletions

View File

@@ -4,6 +4,7 @@ import hashlib
import inspect
import json
import types
from contextlib import contextmanager
from typing import Any, Callable, Dict, Optional, Union
import torch
@@ -18,6 +19,34 @@ else:
from .torch25_custom_graph_pass import ( # noqa: yapf
Torch25CustomGraphPass as CustomGraphPass)
# Module-level slot for the context visible to the currently running passes.
_pass_context = None


class PassContext:
    """State made available to compilation passes while they execute.

    Attributes:
        runtime_shape: The runtime shape the current compilation is
            specialized for, or ``None`` when no specific shape is
            being specialized.
    """

    def __init__(self, runtime_shape: Optional[int]):
        self.runtime_shape = runtime_shape


def get_pass_context() -> PassContext:
    """Return the currently active :class:`PassContext`.

    Only valid while inside a :func:`pass_context` block; asserts if
    no context is installed.
    """
    assert _pass_context is not None
    return _pass_context


@contextmanager
def pass_context(runtime_shape: Optional[int]):
    """Install a :class:`PassContext` for the duration of the block.

    Contexts nest: on exit the previously active context (possibly
    ``None``) is restored, even if the body raises.
    """
    global _pass_context
    saved = _pass_context
    _pass_context = PassContext(runtime_shape)
    try:
        yield
    finally:
        _pass_context = saved
class InductorPass(CustomGraphPass):
"""
@@ -62,6 +91,9 @@ class InductorPass(CustomGraphPass):
encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
return hashlib.sha256(encoded).hexdigest()
def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
    """Whether this pass should run when compiling for ``shape``.

    The base implementation accepts every shape, including ``None``.
    NOTE(review): presumably shape-specific passes override this to
    opt out of some shapes — confirm against subclasses.
    """
    return True
class CallableInductorPass(InductorPass):
"""