[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355)

Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-14 22:51:16 -04:00
parent e66d787bce
commit 96b9aa5aa0
42 changed files with 270 additions and 248 deletions
--- a/vllm/config/init.py
+++ b/vllm/config/init.py
@@ -4,7 +4,7 @@
 from vllm.config.cache import CacheConfig
 from vllm.config.compilation import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    CUDAGraphMode,
    PassConfig,
 )
@@ -49,7 +49,7 @@ __all__ = [
    "CacheConfig",
    # From vllm.config.compilation
    "CompilationConfig",
-    "CompilationLevel",
+    "CompilationMode",
    "CUDAGraphMode",
    "PassConfig",
    # From vllm.config.device
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -26,12 +26,20 @@ else:
 logger = init_logger(__name__)


-class CompilationLevel:
-    # constants for the levels of the compilation process
-    NO_COMPILATION = 0
-    DYNAMO_AS_IS = 1
-    DYNAMO_ONCE = 2
-    PIECEWISE = 3
+class CompilationMode:
+    """The compilation approach used for torch.compile-based compilation of the
+    model."""
+
+    NONE = 0
+    """No torch.compile compilation is applied; the model runs as-is in fully
+    eager PyTorch mode."""
+    STOCK_TORCH_COMPILE = 1
+    """The standard `torch.compile` compilation pipeline."""
+    DYNAMO_TRACE_ONCE = 2
+    """Single Dynamo trace through the model, avoiding recompilation."""
+    VLLM_COMPILE = 3
+    """Custom vLLM Inductor-based backend with caching, piecewise compilation,
+    shape specialization, and custom passes."""


 class CUDAGraphMode(enum.Enum):
@@ -134,7 +142,7 @@ class CompilationConfig:
    """Configuration for compilation. It has three parts:

    - Top-level Compilation control:
-        - [`level`][vllm.config.CompilationConfig.level]
+        - [`mode`][vllm.config.CompilationConfig.mode]
        - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
        - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
        - [`backend`][vllm.config.CompilationConfig.backend]
@@ -171,14 +179,26 @@ class CompilationConfig:

    # Top-level Compilation control
    level: int | None = None
-    """The level of compilation:
+    """
+    Level is deprecated and will be removed in the next release,
+    either 0.12.0 or 0.11.2, whichever comes first.
+    Please use mode instead. Currently, a given level is used as the mode as-is.
+    """
+    # Top-level Compilation control
+    mode: int | None = None
+    """The compilation approach used for torch.compile-based compilation of the
+    model.

-    - None: If None, we will select the default compilation level.
-      For V1 engine this is 3, for V0 engine this is 0.
-    - 0: no compilation.
-    - 1: dynamo as is.
-    - 2: dynamo once.
-    - 3: piecewise compilation."""
+    - None: If None, we will select the default compilation mode.
+      For V1 engine this is 3.
+    - 0: NONE: No torch.compile compilation is applied, model runs in fully
+         eager pytorch mode. The model runs as-is.
+    - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline.
+    - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding
+         recompilation by removing guards.
+         Requires no dynamic-shape-dependent control-flow.
+    - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching,
+         piecewise compilation, shape specialization, and custom passes."""
    debug_dump_path: Path | None = None
    """The path to dump the debug information."""
    cache_dir: str = ""
@@ -195,11 +215,11 @@ class CompilationConfig:

    backend function.
    We use string to avoid serialization issues when using compilation in a
-    distributed setting. When the compilation level is 1 or 2, the backend is
+    distributed setting. When the compilation mode is 1 or 2, the backend is
    used for the compilation directly (it sees the whole graph). When the
-    compilation level is 3, the backend is used for the piecewise compilation
+    compilation mode is 3, the backend is used for the piecewise compilation
    (it sees a part of the graph). The backend can not be custom for compilation
-    level 3, i.e. the backend must be either eager or inductor. Furthermore,
+    mode 3, i.e. the backend must be either eager or inductor. Furthermore,
    compilation is only piecewise if splitting ops is set accordingly and
    use_inductor_graph_partition is off. Note that the default options for
    splitting ops are sufficient for piecewise compilation.
@@ -214,7 +234,7 @@ class CompilationConfig:
    - 'none,+op1,+op2' to enable only op1 and op2

    By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
+    disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
    Inductor generates (fused) Triton kernels for disabled custom ops."""
    splitting_ops: list[str] | None = None
    """A list of ops to exclude from cudagraphs, used in piecewise compilation.
@@ -249,7 +269,7 @@ class CompilationConfig:
        One graph for symbolic shape and one graph per size in compile_sizes
        are compiled using configurations in inductor_compile_config.

-    This setting is ignored if level<PIECEWISE.
+    This setting is ignored if mode<VLLM_COMPILE.

    For future compatibility:
    If use_inductor is True, backend="inductor" otherwise backend="eager".
@@ -299,7 +319,7 @@ class CompilationConfig:
    Currently, the cudagraph mode is only used for the v1 engine.
    Note that the cudagraph logic is generally orthogonal to the 
    compilation logic. While piecewise cudagraphs require piecewise 
-    compilation (level=PIECEWISE and non-empty splitting_ops), full
+    compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full
    cudagraphs are supported with and without compilation.
    
    Warning: This flag is new and subject to change in addition 
@@ -312,7 +332,7 @@ class CompilationConfig:
        that all input buffers have fixed addresses, and all
        splitting ops write their outputs to input buffers.
    In the vLLM V1 Engine, this flag only applies for
-    CompilationLevel.PIECEWISE (aka -O3).
+    CompilationMode.VLLM_COMPILE (aka -O3).
    Note that this is orthogonal to the cudagraph capture logic
    outside of compilation.
    Warning: This flag is deprecated and will be removed in the next major or
@@ -426,7 +446,7 @@ class CompilationConfig:
        the final hidden states.
        """
        factors: list[Any] = []
-        factors.append(self.level)
+        factors.append(self.mode)
        factors.append(self.backend)
        factors.append(self.custom_ops)
        factors.append(self.splitting_ops)
@@ -477,6 +497,17 @@ class CompilationConfig:
        return value

    def __post_init__(self) -> None:
+        if self.level is not None:
+            logger.warning(
+                "Level is deprecated and will be removed in the next release, "
+                "either 0.12.0 or 0.11.2 whichever is soonest. "
+                "Use mode instead. "
+                "If both level and mode are given, "
+                "only mode will be used."
+            )
+            if self.mode is None:  # an explicitly given mode takes precedence
+                self.mode = self.level
+
        count_none = self.custom_ops.count("none")
        count_all = self.custom_ops.count("all")
        assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
@@ -574,7 +605,7 @@ class CompilationConfig:
        # Currently only eager and inductor backend are supported.
        # for piecewise compilation. Custom backends are not suppported for
        # piecewise compilation. Update when more backends are supported.
-        if self.level == CompilationLevel.PIECEWISE and self.backend not in [
+        if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
            "",
            "eager",
            "inductor",
@@ -602,24 +633,27 @@ class CompilationConfig:
        Returns:
            The backend for the compilation config.
        """
-        if self.level is None:
+        if self.mode is None:
            raise ValueError(
-                "No compilation level is set. This method should only be \
+                "No compilation mode is set. This method should only be \
                called via vllm config where the level is set if none is \
                provided."
            )
-        if self.level == CompilationLevel.NO_COMPILATION:
-            raise ValueError("No compilation level is set.")
+        if self.mode == CompilationMode.NONE:
+            raise ValueError("No compilation mode is set.")

        from torch._dynamo.backends.registry import list_backends

        torch_backends = list_backends(exclude_tags=tuple())
-        if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
+        if self.mode in [
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+        ]:
            if self.backend in torch_backends:
                return self.backend
            return resolve_obj_by_qualname(self.backend)

-        assert self.level == CompilationLevel.PIECEWISE
+        assert self.mode == CompilationMode.VLLM_COMPILE
        if self.backend not in ["eager", "inductor"]:
            raise ValueError(
                f"Invalid backend for piecewise compilation: {self.backend}"
@@ -684,11 +718,11 @@ class CompilationConfig:
        self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size

    def set_splitting_ops_for_v1(self):
-        # NOTE: this function needs to be called only when level is
-        # CompilationLevel.PIECEWISE
-        assert self.level == CompilationLevel.PIECEWISE, (
+        # NOTE: this function needs to be called only when mode is
+        # CompilationMode.VLLM_COMPILE
+        assert self.mode == CompilationMode.VLLM_COMPILE, (
            "set_splitting_ops_for_v1 should only be called when "
-            "level is CompilationLevel.PIECEWISE"
+            "mode is CompilationMode.VLLM_COMPILE"
        )

        if self.use_inductor_graph_partition:
@@ -769,12 +803,10 @@ class CompilationConfig:

        if not self.use_inductor_graph_partition:
            # Dynamo-level FX split case
-            return self.level == CompilationLevel.PIECEWISE
+            return self.mode == CompilationMode.VLLM_COMPILE

        # Inductor partition case
-        return (
-            self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION
-        )
+        return self.backend == "inductor" and self.mode > CompilationMode.NONE

    def custom_op_log_check(self):
        """
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -22,7 +22,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid

 from .cache import CacheConfig
-from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode
+from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
 from .device import DeviceConfig
 from .kv_events import KVEventsConfig
 from .kv_transfer import KVTransferConfig
@@ -84,17 +84,11 @@ class VllmConfig:
    compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
    """`torch.compile` and cudagraph capture configuration for the model.

-    As a shorthand, `-O<n>` can be used to directly specify the compilation
-    level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
-    Currently, -O <n> and -O=<n> are supported as well but this will likely be
-    removed in favor of clearer -O<n> syntax in the future.
-
-    NOTE: level 0 is the default level without any optimization. level 1 and 2
-    are for internal testing only. level 3 is the recommended level for
-    production, also default in V1.
+    As a shorthand, one can append compilation arguments via
+    `-O.parameter=argument`, such as `-O.mode=3` (same as `-O='{"mode":3}'`).

    You can specify the full compilation config like so:
-    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
+    `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
    kv_transfer_config: KVTransferConfig | None = None
    """The configurations for distributed KV cache transfer."""
@@ -305,33 +299,33 @@ class VllmConfig:
                "precision for chunked prefill triton kernels."
            )

-        # If the user does not explicitly set a compilation level, then
-        # we use the default level. The default level depends on other
+        # If the user does not explicitly set a compilation mode, then
+        # we use the default mode. The default mode depends on other
        # settings (see the below code).
-        if self.compilation_config.level is None:
+        if self.compilation_config.mode is None:
            if envs.VLLM_USE_V1:
                if (
                    self.model_config is not None
                    and not self.model_config.enforce_eager
                ):
-                    self.compilation_config.level = CompilationLevel.PIECEWISE
+                    self.compilation_config.mode = CompilationMode.VLLM_COMPILE
                else:
-                    self.compilation_config.level = CompilationLevel.NO_COMPILATION
+                    self.compilation_config.mode = CompilationMode.NONE

            else:
-                # NB: Passing both --enforce-eager and a compilation level
-                # in V0 means the compilation level wins out.
-                self.compilation_config.level = CompilationLevel.NO_COMPILATION
+                # NB: Passing both --enforce-eager and a compilation mode
+                # in V0 means the compilation mode wins out.
+                self.compilation_config.mode = CompilationMode.NONE
        else:
-            assert self.compilation_config.level >= CompilationLevel.NO_COMPILATION
-            assert self.compilation_config.level <= CompilationLevel.PIECEWISE
+            assert self.compilation_config.mode >= CompilationMode.NONE
+            assert self.compilation_config.mode <= CompilationMode.VLLM_COMPILE

        # If user does not set custom ops via none or all set it here based on
-        # compilation level and backend.
+        # compilation mode and backend.
        if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
            if (
                self.compilation_config.backend == "inductor"
-                and self.compilation_config.level > CompilationLevel.NO_COMPILATION
+                and self.compilation_config.mode > CompilationMode.NONE
            ):
                self.compilation_config.custom_ops.append("none")
            else:
@@ -350,7 +344,7 @@ class VllmConfig:
            if self.compilation_config.cudagraph_mode is None:
                if (
                    envs.VLLM_USE_V1
-                    and self.compilation_config.level == CompilationLevel.PIECEWISE
+                    and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                ):
                    # default to full and piecewise for most models
                    self.compilation_config.cudagraph_mode = (
@@ -486,10 +480,10 @@ class VllmConfig:
            )
        current_platform.check_and_update_config(self)

-        # Do this after all the updates to compilation_config.level
+        # Do this after all the updates to compilation_config.mode
        if (
            envs.VLLM_USE_V1
-            and self.compilation_config.level == CompilationLevel.PIECEWISE
+            and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
        ):
            self.compilation_config.set_splitting_ops_for_v1()

@@ -508,8 +502,8 @@ class VllmConfig:
                )

            if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
-                assert self.compilation_config.level == CompilationLevel.PIECEWISE, (
-                    "Compilation level should be CompilationLevel.PIECEWISE "
+                assert self.compilation_config.mode == CompilationMode.VLLM_COMPILE, (
+                    "Compilation mode should be CompilationMode.VLLM_COMPILE "
                    "when cudagraph_mode piecewise cudagraphs is used, "
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                )
@@ -837,7 +831,7 @@ def set_current_vllm_config(

        if (
            check_compile
-            and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
+            and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
            and compilation_counter.num_models_seen == num_models_seen
        ):
            # If the model supports compilation,