[Frontend][torch.compile] CompilationConfig Overhaul (#20283): Set up -O infrastructure (#26847)

Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: adabeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Morrison Turnansky authored 2025-11-27 04:55:58 -05:00; committed by GitHub
commit 0838b52e2e (parent 00d3310d2d)
13 changed files with 735 additions and 64 deletions

@@ -8,7 +8,7 @@ from dataclasses import asdict, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
-from pydantic import TypeAdapter, field_validator
+from pydantic import Field, TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
@@ -97,19 +97,25 @@ class PassConfig:
     This is separate from general `CompilationConfig` so that inductor passes
     don't all have access to full configuration - that would create a cycle as
-    the `PassManager` is set as a property of config."""
+    the `PassManager` is set as a property of config.
+
+    You must pass `PassConfig` to the `VllmConfig` constructor via the
+    `CompilationConfig` constructor. `VllmConfig`'s post-init does further
+    initialization. If used outside of `VllmConfig`, some fields may be left
+    in an improper state.
+    """
 
-    enable_fusion: bool = False
+    enable_fusion: bool = Field(default=None)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
-    enable_attn_fusion: bool = False
+    enable_attn_fusion: bool = Field(default=None)
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = False
+    enable_noop: bool = Field(default=None)
     """Whether to enable the custom no-op elimination pass."""
-    enable_sequence_parallelism: bool = False
+    enable_sequence_parallelism: bool = Field(default=None)
     """Whether to enable sequence parallelism."""
-    enable_async_tp: bool = False
+    enable_async_tp: bool = Field(default=None)
     """Whether to enable async TP."""
-    enable_fi_allreduce_fusion: bool = False
+    enable_fi_allreduce_fusion: bool = Field(default=None)
     """Whether to enable flashinfer allreduce fusion."""
     fi_allreduce_fusion_max_size_mb: float | None = None
     """The threshold of the communicated tensor sizes under which
@@ -167,6 +173,22 @@ class PassConfig:
         """
         return InductorPass.hash_dict(asdict(self))
 
+    @field_validator(
+        "enable_fusion",
+        "enable_attn_fusion",
+        "enable_noop",
+        "enable_sequence_parallelism",
+        "enable_async_tp",
+        "enable_fi_allreduce_fusion",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if not self.enable_noop:
             if self.enable_fusion:
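
Taken together, `Field(default=None)` and the `mode="wrap"` validator implement a delayed-default pattern: `None` is a "not yet initialized" sentinel that bypasses pydantic's bool validation, while real values still go through the wrapped handler. A self-contained sketch of just this mechanism (`Flags` is an illustrative stand-in, not vLLM's API):

    from collections.abc import Callable
    from typing import Any

    from pydantic import Field, ValidationError, field_validator
    from pydantic.dataclasses import dataclass


    @dataclass
    class Flags:
        enable_fusion: bool = Field(default=None)

        @field_validator("enable_fusion", mode="wrap")
        @classmethod
        def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
            # "wrap" mode runs around pydantic's own bool validation;
            # returning early lets the None sentinel through untouched
            if value is None:
                return value
            return handler(value)


    assert Flags().enable_fusion is None                    # sentinel default
    assert Flags(enable_fusion=None).enable_fusion is None  # explicit None ok
    assert Flags(enable_fusion=True).enable_fusion is True  # real values pass
    try:
        Flags(enable_fusion="not-a-bool")                   # still rejected
    except ValidationError:
        pass

Without the wrap validator, the explicit-`None` construction would raise, since `None` is not a valid `bool`.
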
@@ -243,7 +265,13 @@ class DynamicShapesConfig:
 @config
 @dataclass
 class CompilationConfig:
-    """Configuration for compilation. It has three parts:
+    """Configuration for compilation.
+
+    You must pass `CompilationConfig` to the `VllmConfig` constructor.
+    `VllmConfig`'s post-init does further initialization. If used outside of
+    `VllmConfig`, some fields will be left in an improper state.
+
+    It has three parts:
 
     - Top-level Compilation control:
       - [`mode`][vllm.config.CompilationConfig.mode]
@@ -282,14 +310,14 @@ class CompilationConfig:
     """
 
     # Top-level Compilation control
-    level: int | None = None
+    level: int = Field(default=None)
     """
     Level is deprecated and will be removed in the next release,
     either 0.12.0 or 0.11.2, whichever is sooner.
     Please use mode. Currently all levels are mapped to mode.
     """
 
     # Top-level Compilation control
-    mode: CompilationMode | None = None
+    mode: CompilationMode = Field(default=None)
     """The compilation approach used for torch.compile-based compilation of the
     model.
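
Both `level` and `mode` now start as `None` sentinels, and the deprecation text says legacy levels are mapped onto modes. A self-contained sketch of that two-phase resolution; `Mode`, `CompileCfg`, and `EngineCfg` are illustrative stand-ins, and the member names/values of the real `CompilationMode` are assumptions, since the diff does not show them:

    from enum import IntEnum

    from pydantic import Field
    from pydantic.dataclasses import dataclass


    class Mode(IntEnum):  # stand-in for CompilationMode; members assumed
        NONE = 0
        STOCK_TORCH_COMPILE = 1
        DYNAMO_TRACE_ONCE = 2
        VLLM_COMPILE = 3


    @dataclass
    class CompileCfg:  # stand-in for CompilationConfig
        level: int = Field(default=None)  # deprecated; mapped onto mode below
        mode: Mode = Field(default=None)  # resolved by the parent config


    @dataclass
    class EngineCfg:  # stand-in for VllmConfig
        compilation_config: CompileCfg = Field(default_factory=CompileCfg)

        def __post_init__(self) -> None:
            cc = self.compilation_config
            if cc.mode is None:
                # legacy -O levels map one-to-one onto modes; otherwise the
                # real default is chosen only now, with full engine context
                cc.mode = Mode(cc.level) if cc.level is not None else Mode.VLLM_COMPILE


    assert EngineCfg().compilation_config.mode is Mode.VLLM_COMPILE
    assert EngineCfg(CompileCfg(level=1)).compilation_config.mode is Mode.STOCK_TORCH_COMPILE
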
@@ -390,7 +418,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode | None = None
+    cudagraph_mode: CUDAGraphMode = Field(default=None)
     """
     The mode of the cudagraph:
@@ -452,7 +480,7 @@ class CompilationConfig:
     When `enable_lora` is False, this option has no effect.
     """
 
-    use_inductor_graph_partition: bool = False
+    use_inductor_graph_partition: bool = Field(default=None)
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps
@@ -648,6 +676,20 @@ class CompilationConfig:
         )
         return value
 
+    @field_validator(
+        "level",
+        "mode",
+        "cudagraph_mode",
+        "use_inductor_graph_partition",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if self.level is not None:
             logger.warning(
@@ -948,6 +990,13 @@ class CompilationConfig:
             op,
         )
 
+    def is_custom_op_enabled(self, op: str) -> bool:
+        if "all" in self.custom_ops:
+            return f"-{op}" not in self.custom_ops
+        assert "none" in self.custom_ops
+        return f"+{op}" in self.custom_ops
+
     def adjust_cudagraph_sizes_for_spec_decode(
         self, uniform_decode_query_len: int, tensor_parallel_size: int
     ):
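
The new `is_custom_op_enabled` helper codifies the existing `custom_ops` list syntax: the list carries one base policy (`"all"` or `"none"`) plus per-op overrides spelled `-op` and `+op`. A standalone sketch of the same decision logic (op names such as `rms_norm` are used here only for illustration):

    def is_custom_op_enabled(custom_ops: list[str], op: str) -> bool:
        # base policy "all": every op is on unless explicitly disabled
        if "all" in custom_ops:
            return f"-{op}" not in custom_ops
        # otherwise the base policy must be "none": only "+op" opts in
        assert "none" in custom_ops
        return f"+{op}" in custom_ops


    assert is_custom_op_enabled(["all", "-rms_norm"], "rms_norm") is False
    assert is_custom_op_enabled(["all", "-rms_norm"], "silu_and_mul") is True
    assert is_custom_op_enabled(["none", "+rms_norm"], "rms_norm") is True
    assert is_custom_op_enabled(["none"], "silu_and_mul") is False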