[FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)

Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-13 18:47:16 -04:00
parent 7200a21cd1
commit e3fdb627d9
8 changed files with 153 additions and 86 deletions
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -15,6 +15,7 @@ from pydantic.dataclasses import dataclass
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname

 if TYPE_CHECKING:
@@ -187,7 +188,8 @@ class CompilationConfig:
    backend: str = ""
    """The backend for compilation. It needs to be a string:

-    - "" (empty string): use the default backend.
+    - "" (empty string): use the default backend ("inductor" on CUDA-alike
+    platforms).
    - "eager"/"openxla"/...: use the specified backend registered in PyTorch.
    - "full.module.name": a qualified name which can be used to import the

@@ -196,7 +198,12 @@ class CompilationConfig:
    distributed setting. When the compilation level is 1 or 2, the backend is
    used for the compilation directly (it sees the whole graph). When the
    compilation level is 3, the backend is used for the piecewise compilation
-    (it sees a part of the graph)."""
+    (it sees a part of the graph). The backend cannot be custom for compilation
+    level 3, i.e. the backend must be either eager or inductor. Furthermore,
+    compilation is only piecewise if splitting ops is set accordingly and
+    use_inductor_cudagraphs_partition is off. Note that the default options for
+    splitting ops are sufficient for piecewise compilation.
+    """
    custom_ops: list[str] = field(default_factory=list)
    """Fine-grained control over which custom ops to enable/disable. Use 'all'
    to enable all, 'none' to disable all. Also specify a list of custom op
@@ -229,8 +236,12 @@ class CompilationConfig:
    If empty list [], no ops are excluded (suitable for full cudagraphs)."""

    # Inductor capture
-    use_inductor: bool = True
-    """Whether to use inductor compilation:
+    use_inductor: bool | None = None
+    """
+    Whether to use inductor compilation.
+
+    This flag is deprecated and will be removed in the next release 0.12.0.
+    Please use the 'backend' option instead.

    - False: inductor compilation is not used. graph runs in eager
        (custom_ops enabled by default).
@@ -238,7 +249,11 @@ class CompilationConfig:
        One graph for symbolic shape and one graph per size in compile_sizes
        are compiled using configurations in inductor_compile_config.

-    This setting is ignored if level<PIECEWISE."""
+    This setting is ignored if level<PIECEWISE.
+
+    For future compatibility:
+    If use_inductor is True, backend="inductor" otherwise backend="eager".
+    """
    compile_sizes: list[int | str] | None = None
    """Sizes to compile for inductor. In addition
    to integers, it also supports "cudagraph_capture_sizes" to
@@ -545,7 +560,43 @@ class CompilationConfig:
                    "(where 'op' is the registered op name)"
                )

+        # Currently only the eager and inductor backends are supported
+        # for piecewise compilation; custom backends are not supported.
+        # Update this check when more backends are supported.
+        if self.level == CompilationLevel.PIECEWISE and self.backend not in [
+            "",
+            "eager",
+            "inductor",
+        ]:
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}"
+            )
+
+        if self.use_inductor is not None:
+            logger.warning_once(
+                "The 'use_inductor' flag is deprecated and will be "
+                "removed in the next release (v0.12.0). "
+                "Please use the 'backend' option instead.",
+            )
+            self.backend = "inductor" if self.use_inductor else "eager"
+
+        if self.backend == "":
+            self.backend = current_platform.simple_compile_backend
+
    def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
+        """
+        Initialize the backend for the compilation config from a vllm config.
+        Arguments:
+            vllm_config: The vllm config to initialize the backend from.
+        Returns:
+            The backend for the compilation config.
+        """
+        if self.level is None:
+            raise ValueError(
+                "No compilation level is set. This method should only be \
+                called via vllm config where the level is set if none is \
+                provided."
+            )
        if self.level == CompilationLevel.NO_COMPILATION:
            raise ValueError("No compilation level is set.")

@@ -553,15 +604,15 @@ class CompilationConfig:

        torch_backends = list_backends(exclude_tags=tuple())
        if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
-            if self.backend == "":
-                return "eager"
            if self.backend in torch_backends:
                return self.backend
            return resolve_obj_by_qualname(self.backend)

-        # TODO: pass user-specified backend to piecewise compilation
-        # merge with the config use_inductor
        assert self.level == CompilationLevel.PIECEWISE
+        if self.backend not in ["eager", "inductor"]:
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}"
+            )

        from vllm.compilation.backends import VllmBackend

@@ -710,7 +761,9 @@ class CompilationConfig:
            return self.level == CompilationLevel.PIECEWISE

        # Inductor partition case
-        return self.level > CompilationLevel.NO_COMPILATION and self.use_inductor
+        return (
+            self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION
+        )

    def custom_op_log_check(self):
        """