Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: adabeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
commit 0838b52e2e
parent 00d3310d2d
@@ -8,7 +8,7 @@ from dataclasses import asdict, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
-from pydantic import TypeAdapter, field_validator
+from pydantic import Field, TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
@@ -97,19 +97,25 @@ class PassConfig:
 
     This is separate from general `CompilationConfig` so that inductor passes
     don't all have access to full configuration - that would create a cycle as
-    the `PassManager` is set as a property of config."""
+    the `PassManager` is set as a property of config.
 
-    enable_fusion: bool = False
+    You must pass PassConfig to the VllmConfig constructor via the
+    CompilationConfig constructor. VllmConfig's post_init does further
+    initialization. If used outside of a VllmConfig, some fields may be
+    left in an improper state.
+    """
+
+    enable_fusion: bool = Field(default=None)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
-    enable_attn_fusion: bool = False
+    enable_attn_fusion: bool = Field(default=None)
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = False
+    enable_noop: bool = Field(default=None)
     """Whether to enable the custom no-op elimination pass."""
-    enable_sequence_parallelism: bool = False
+    enable_sequence_parallelism: bool = Field(default=None)
     """Whether to enable sequence parallelism."""
-    enable_async_tp: bool = False
+    enable_async_tp: bool = Field(default=None)
     """Whether to enable async TP."""
-    enable_fi_allreduce_fusion: bool = False
+    enable_fi_allreduce_fusion: bool = Field(default=None)
     """Whether to enable flashinfer allreduce fusion."""
     fi_allreduce_fusion_max_size_mb: float | None = None
     """The threshold of the communicated tensor sizes under which
@@ -167,6 +173,22 @@ class PassConfig:
         """
         return InductorPass.hash_dict(asdict(self))
 
+    @field_validator(
+        "enable_fusion",
+        "enable_attn_fusion",
+        "enable_noop",
+        "enable_sequence_parallelism",
+        "enable_async_tp",
+        "enable_fi_allreduce_fusion",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if not self.enable_noop:
            if self.enable_fusion:
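The `Field(default=None)` plus wrap-validator combination is what makes the delayed initialization work: `None` slips past per-field validation and is resolved later in post-init. A minimal, self-contained sketch of that pattern (`ExampleConfig` is hypothetical, not vLLM code):

from collections.abc import Callable
from typing import Any

from pydantic import Field, field_validator
from pydantic.dataclasses import dataclass


@dataclass
class ExampleConfig:
    # `None` means "not decided yet"; a later post-init step fills it in.
    enable_fusion: bool = Field(default=None)

    @field_validator("enable_fusion", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        if value is None:
            return value  # bypass bool validation while the field is unset
        return handler(value)  # normal validation for real values


ExampleConfig()                    # enable_fusion stays None
ExampleConfig(enable_fusion=True)  # validated as a normal bool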
@@ -243,7 +265,13 @@ class DynamicShapesConfig:
 @config
 @dataclass
 class CompilationConfig:
-    """Configuration for compilation. It has three parts:
+    """Configuration for compilation.
+
+    You must pass CompilationConfig to the VllmConfig constructor.
+    VllmConfig's post_init does further initialization. If used outside of a
+    VllmConfig, some fields will be left in an improper state.
+
+    It has three parts:
 
     - Top-level Compilation control:
         - [`mode`][vllm.config.CompilationConfig.mode]
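Concretely, the intended construction path now runs through `VllmConfig`; building a `CompilationConfig` on its own leaves the `Field(default=None)` fields unresolved. A hedged usage sketch, assuming `vllm.config` exports these names (paths may differ across versions):

from vllm.config import CompilationConfig, VllmConfig

# VllmConfig's post_init finalizes the delayed (None) compilation fields.
vllm_config = VllmConfig(compilation_config=CompilationConfig())

# Standalone construction is still possible, but some fields remain None
# until post-init runs, per the docstring's warning.
standalone = CompilationConfig()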
@@ -282,14 +310,14 @@ class CompilationConfig:
     """
 
     # Top-level Compilation control
-    level: int | None = None
+    level: int = Field(default=None)
     """
     Level is deprecated and will be removed in the next release,
     either 0.12.0 or 0.11.2, whichever comes first.
     Please use `mode` instead. Currently all levels are mapped to mode.
     """
     # Top-level Compilation control
-    mode: CompilationMode | None = None
+    mode: CompilationMode = Field(default=None)
     """The compilation approach used for torch.compile-based compilation of the
     model.
 
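The migration the deprecation note asks for is mechanical. A sketch, assuming `CompilationMode` is exported alongside `CompilationConfig` and that `VLLM_COMPILE` is the full-compilation member (both names are assumptions here):

from vllm.config import CompilationConfig, CompilationMode

# Preferred: set `mode` directly.
cfg = CompilationConfig(mode=CompilationMode.VLLM_COMPILE)

# Deprecated: `level` still validates, but __post_init__ logs a warning
# and maps it onto `mode`.
legacy = CompilationConfig(level=3)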
@@ -390,7 +418,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode | None = None
+    cudagraph_mode: CUDAGraphMode = Field(default=None)
     """
     The mode of the cudagraph:
 
@@ -452,7 +480,7 @@ class CompilationConfig:
     When `enable_lora` is False, this option has no effect.
     """
 
-    use_inductor_graph_partition: bool = False
+    use_inductor_graph_partition: bool = Field(default=None)
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps
@@ -648,6 +676,20 @@ class CompilationConfig:
         )
         return value
 
+    @field_validator(
+        "level",
+        "mode",
+        "cudagraph_mode",
+        "use_inductor_graph_partition",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if self.level is not None:
             logger.warning(
@@ -948,6 +990,13 @@ class CompilationConfig:
             op,
         )
 
+    def is_custom_op_enabled(self, op: str) -> bool:
+        if "all" in self.custom_ops:
+            return f"-{op}" not in self.custom_ops
+
+        assert "none" in self.custom_ops
+        return f"+{op}" in self.custom_ops
+
     def adjust_cudagraph_sizes_for_spec_decode(
         self, uniform_decode_query_len: int, tensor_parallel_size: int
     ):
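The new helper's matching rules are compact enough to restate standalone; a sketch (not vLLM's API surface, just the same logic over a plain list) covering the four interesting cases:

def is_custom_op_enabled(custom_ops: list[str], op: str) -> bool:
    # "all" turns every op on by default; "-op" opts a specific op out.
    if "all" in custom_ops:
        return f"-{op}" not in custom_ops
    # Otherwise "none" turns every op off; "+op" opts a specific op in.
    assert "none" in custom_ops
    return f"+{op}" in custom_ops


assert is_custom_op_enabled(["all", "-rms_norm"], "rms_norm") is False
assert is_custom_op_enabled(["all", "-rms_norm"], "silu_mul") is True
assert is_custom_op_enabled(["none", "+rms_norm"], "rms_norm") is True
assert is_custom_op_enabled(["none", "+rms_norm"], "silu_mul") is False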