[Frontend][torch.compile] CompilationConfig Overhaul (#20283): Set up -O infrastructure (#26847)

Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: adabeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Morrison Turnansky
Date: 2025-11-27 04:55:58 -05:00
Committed by: GitHub
Parent: 00d3310d2d
Commit: 0838b52e2e
13 changed files with 735 additions and 64 deletions

View File

@@ -8,7 +8,7 @@ from dataclasses import asdict, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
-from pydantic import TypeAdapter, field_validator
+from pydantic import Field, TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 import vllm.envs as envs
@@ -97,19 +97,25 @@ class PassConfig:
     This is separate from general `CompilationConfig` so that inductor passes
     don't all have access to full configuration - that would create a cycle as
-    the `PassManager` is set as a property of config."""
+    the `PassManager` is set as a property of config.
 
-    enable_fusion: bool = False
+    You must pass PassConfig to the VllmConfig constructor via the
+    CompilationConfig constructor. VllmConfig's post_init does further
+    initialization. If used outside of the VllmConfig, some fields may be
+    left in an improper state.
+    """
+
+    enable_fusion: bool = Field(default=None)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
-    enable_attn_fusion: bool = False
+    enable_attn_fusion: bool = Field(default=None)
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = False
+    enable_noop: bool = Field(default=None)
     """Whether to enable the custom no-op elimination pass."""
-    enable_sequence_parallelism: bool = False
+    enable_sequence_parallelism: bool = Field(default=None)
     """Whether to enable sequence parallelism."""
-    enable_async_tp: bool = False
+    enable_async_tp: bool = Field(default=None)
     """Whether to enable async TP."""
-    enable_fi_allreduce_fusion: bool = False
+    enable_fi_allreduce_fusion: bool = Field(default=None)
     """Whether to enable flashinfer allreduce fusion."""
     fi_allreduce_fusion_max_size_mb: float | None = None
     """The threshold of the communicated tensor sizes under which
@@ -167,6 +173,22 @@ class PassConfig:
         """
         return InductorPass.hash_dict(asdict(self))
 
+    @field_validator(
+        "enable_fusion",
+        "enable_attn_fusion",
+        "enable_noop",
+        "enable_sequence_parallelism",
+        "enable_async_tp",
+        "enable_fi_allreduce_fusion",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if not self.enable_noop:
             if self.enable_fusion:
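
A minimal, self-contained sketch of the delayed-initialization pattern this validator implements (illustrative code, not from the commit): a pydantic `mode="wrap"` validator receives the raw value plus the inner handler, so it can pass `None` through untouched while still validating real values against the `bool` annotation.

    from collections.abc import Callable
    from typing import Any

    from pydantic import Field, field_validator
    from pydantic.dataclasses import dataclass


    @dataclass
    class Example:
        flag: bool = Field(default=None)  # None means "decide later"

        @field_validator("flag", mode="wrap")
        @classmethod
        def _skip_none(cls, value: Any, handler: Callable) -> Any:
            # Let None bypass validation until defaults are applied later.
            if value is None:
                return value
            # Otherwise run normal pydantic validation/coercion.
            return handler(value)


    print(Example().flag)             # None, despite the bool annotation
    print(Example(flag="true").flag)  # True, coerced by the wrapped handler

This is what lets the PassConfig fields above stay None until VllmConfig's post-init fills them from the optimization-level defaults.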
@@ -243,7 +265,13 @@ class DynamicShapesConfig:
 @config
 @dataclass
 class CompilationConfig:
-    """Configuration for compilation. It has three parts:
+    """Configuration for compilation.
+
+    You must pass CompilationConfig to the VllmConfig constructor.
+    VllmConfig's post_init does further initialization. If used outside of the
+    VllmConfig, some fields will be left in an improper state.
+
+    It has three parts:
 
     - Top-level Compilation control:
         - [`mode`][vllm.config.CompilationConfig.mode]
@@ -282,14 +310,14 @@ class CompilationConfig:
     """
 
     # Top-level Compilation control
-    level: int | None = None
+    level: int = Field(default=None)
     """
     Level is deprecated and will be removed in the next release,
     either 0.12.0 or 0.11.2, whichever is soonest.
     Please use mode. Currently all levels are mapped to mode.
     """
     # Top-level Compilation control
-    mode: CompilationMode | None = None
+    mode: CompilationMode = Field(default=None)
     """The compilation approach used for torch.compile-based compilation of the
     model.
@@ -390,7 +418,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode | None = None
+    cudagraph_mode: CUDAGraphMode = Field(default=None)
     """
     The mode of the cudagraph:
@@ -452,7 +480,7 @@ class CompilationConfig:
     When `enable_lora` is False, this option has no effect.
     """
 
-    use_inductor_graph_partition: bool = False
+    use_inductor_graph_partition: bool = Field(default=None)
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps
@@ -648,6 +676,20 @@ class CompilationConfig:
         )
         return value
 
+    @field_validator(
+        "level",
+        "mode",
+        "cudagraph_mode",
+        "use_inductor_graph_partition",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if self.level is not None:
             logger.warning(
@@ -948,6 +990,13 @@ class CompilationConfig:
                 op,
             )
 
+    def is_custom_op_enabled(self, op: str) -> bool:
+        if "all" in self.custom_ops:
+            return f"-{op}" not in self.custom_ops
+        assert "none" in self.custom_ops
+        return f"+{op}" in self.custom_ops
+
     def adjust_cudagraph_sizes_for_spec_decode(
         self, uniform_decode_query_len: int, tensor_parallel_size: int
     ):
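
The new `is_custom_op_enabled` assumes `custom_ops` always carries a base policy, `"all"` or `"none"`, plus optional per-op `+op`/`-op` overrides. A standalone sketch of the same matching rule and its behavior:

    def is_custom_op_enabled(custom_ops: list[str], op: str) -> bool:
        if "all" in custom_ops:
            # Ops are on by default; "-op" opts a specific op out.
            return f"-{op}" not in custom_ops
        assert "none" in custom_ops
        # Ops are off by default; "+op" opts a specific op in.
        return f"+{op}" in custom_ops


    print(is_custom_op_enabled(["all", "-rms_norm"], "rms_norm"))     # False
    print(is_custom_op_enabled(["none", "+quant_fp8"], "quant_fp8"))  # True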

View File

@@ -1752,6 +1752,14 @@ class ModelConfig:
         logger.info("Using max model len %s", max_model_len)
         return max_model_len
 
+    def is_model_moe(
+        self,
+    ) -> bool:
+        return self.get_num_experts() > 1
+
+    def is_quantized(self) -> bool:
+        return getattr(self.hf_config, "quantization_config", None) is not None
+
 
 def get_served_model_name(model: str, served_model_name: str | list[str] | None):
     """

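These two predicates are the intended inputs for the `IS_QUANTIZED`/`IS_DENSE` hooks in the optimization-level tables below. Per the commented-out code there, the wiring would look roughly like this once https://github.com/vllm-project/vllm/issues/25689 is resolved (hypothetical; both constants remain False in this commit):

    IS_QUANTIZED = lambda c: c.model_config.is_quantized()
    IS_DENSE = lambda c: not c.model_config.is_model_moe()
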
View File

@@ -9,8 +9,9 @@ import tempfile
 import threading
 import time
 from contextlib import contextmanager
-from dataclasses import replace
+from dataclasses import is_dataclass, replace
 from datetime import datetime
+from enum import IntEnum
 from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar, get_args
@@ -57,6 +58,103 @@ else:
 
 logger = init_logger(__name__)
 
+
+class OptimizationLevel(IntEnum):
+    """Optimization level enum."""
+
+    O0 = 0
+    """O0: No optimization; no compilation, no cudagraphs, and no other
+    optimizations, just starting up immediately."""
+    O1 = 1
+    """O1: Quick optimizations. Dynamo+Inductor compilation and piecewise
+    cudagraphs."""
+    O2 = 2
+    """O2: Full optimizations. -O1 as well as full and piecewise cudagraphs."""
+    O3 = 3
+    """O3: Currently the same as -O2."""
+
+
+IS_QUANTIZED = False
+IS_DENSE = False
+# The optimizations that depend on these properties are currently set to
+# False in all cases.
+# if model_config is not None:
+#     IS_QUANTIZED = lambda c: c.model_config.is_quantized()
+#     IS_DENSE = lambda c: not c.model_config.is_model_moe()
+# See https://github.com/vllm-project/vllm/issues/25689.
+
+
+def enable_fusion(cfg: "VllmConfig") -> bool:
+    """Returns True if the rms_norm or quant_fp8 custom op is enabled."""
+    return cfg.compilation_config.is_custom_op_enabled(
+        "rms_norm"
+    ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8")
+
+
+OPTIMIZATION_LEVEL_00 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": False,
+            "enable_fusion": False,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": False,
+            "enable_sequence_parallelism": False,
+            "enable_async_tp": False,
+        },
+        "cudagraph_mode": CUDAGraphMode.NONE,
+        "use_inductor_graph_partition": False,
+    },
+}
+
+OPTIMIZATION_LEVEL_01 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": True,
+            "enable_fusion": enable_fusion,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": False,
+            "enable_sequence_parallelism": False,
+            "enable_async_tp": False,
+        },
+        "cudagraph_mode": CUDAGraphMode.PIECEWISE,
+        "use_inductor_graph_partition": False,
+    },
+}
+
+OPTIMIZATION_LEVEL_02 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": True,
+            "enable_fusion": enable_fusion,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": IS_QUANTIZED,
+            "enable_sequence_parallelism": IS_DENSE,
+            "enable_async_tp": IS_DENSE,
+        },
+        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
+        "use_inductor_graph_partition": False,
+    },
+}
+
+OPTIMIZATION_LEVEL_03 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": True,
+            "enable_fusion": enable_fusion,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": IS_QUANTIZED,
+            "enable_sequence_parallelism": IS_DENSE,
+            "enable_async_tp": IS_DENSE,
+        },
+        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
+        "use_inductor_graph_partition": False,
+    },
+}
+
+OPTIMIZATION_LEVEL_TO_CONFIG = {
+    OptimizationLevel.O0: OPTIMIZATION_LEVEL_00,
+    OptimizationLevel.O1: OPTIMIZATION_LEVEL_01,
+    OptimizationLevel.O2: OPTIMIZATION_LEVEL_02,
+    OptimizationLevel.O3: OPTIMIZATION_LEVEL_03,
+}
+
+
 @config
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class VllmConfig:
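
Each level maps to a dict of defaults that mirrors the config tree. A minimal sketch (illustrative, not from the commit) of how one entry is consumed: static values fill fields that are still None, while callables such as `enable_fusion` are resolved against the live config first. Here `vllm_config` is assumed to be an already-constructed VllmConfig:

    defaults = OPTIMIZATION_LEVEL_TO_CONFIG[OptimizationLevel.O1]
    pass_defaults = defaults["compilation_config"]["pass_config"]

    for name, value in pass_defaults.items():
        # Callables are deferred decisions; resolve them at run time.
        resolved = value(vllm_config) if callable(value) else value
        print(name, "->", resolved)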
@@ -116,6 +214,11 @@ class VllmConfig:
     you are using. Contents must be hashable."""
     instance_id: str = ""
     """The ID of the vLLM instance."""
+    optimization_level: OptimizationLevel = OptimizationLevel.O2
+    """The optimization level. These levels trade startup time for
+    performance, with -O0 having the best startup time and -O3 having the
+    best performance. -O2 is used by default. See OptimizationLevel for a
+    full description."""
 
     def compute_hash(self) -> str:
         """
@@ -297,6 +400,50 @@ class VllmConfig:
 
         return replace(self, model_config=model_config)
 
+    def _set_config_default(self, config_obj: Any, key: str, value: Any) -> None:
+        """Set a config attribute to a default if not already set by the user.
+
+        Args:
+            config_obj: Configuration object to update.
+            key: Attribute name.
+            value: Default value (static or callable).
+        """
+        if getattr(config_obj, key) is None:
+            # Some config values are known before initialization and are
+            # hard coded. Other values depend on the user-given configuration,
+            # so they are implemented with lambda functions and decided at
+            # run time.
+            setattr(config_obj, key, value(self) if callable(value) else value)
+
+    def _apply_optimization_level_defaults(self, defaults: dict[str, Any]) -> None:
+        """Apply optimization level defaults using self as root.
+
+        Recursively applies values from defaults into nested config objects.
+        Only fields present in defaults are overwritten. If the user
+        configuration does not specify a value for a default field, i.e. the
+        field is still None after all user selections are applied, then the
+        default value is applied to that field. User-specified fields are
+        never overridden by the defaults.
+
+        Args:
+            defaults: Dictionary of default values to apply.
+        """
+
+        def apply_recursive(config_obj: Any, config_defaults: dict[str, Any]) -> None:
+            """Recursively apply defaults to config_obj, using self as root."""
+            for key, value in config_defaults.items():
+                if not hasattr(config_obj, key):
+                    continue
+                current = getattr(config_obj, key)
+                if isinstance(value, dict) and is_dataclass(current):
+                    apply_recursive(current, value)
+                else:
+                    self._set_config_default(config_obj, key, value)
+
+        apply_recursive(self, defaults)
+
     def _post_init_kv_transfer_config(self) -> None:
         """Update KVTransferConfig based on top-level configs in VllmConfig.
@@ -434,17 +581,47 @@ class VllmConfig:
                 "precision for chunked prefill triton kernels."
             )
 
-        # If the user does not explicitly set a compilation mode, then
-        # we use the default mode. The default mode depends on other
-        # settings (see the below code).
+        if (
+            self.optimization_level > OptimizationLevel.O0
+            and self.model_config is not None
+            and self.model_config.enforce_eager
+        ):
+            logger.warning("Enforce eager set, overriding optimization level to -O0")
+            self.optimization_level = OptimizationLevel.O0
+
+        if self.compilation_config.backend == "eager" or (
+            self.compilation_config.mode is not None
+            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
+        ):
+            logger.warning(
+                "Inductor compilation was disabled by user settings; "
+                "optimization settings that are only active during "
+                "Inductor compilation will be ignored."
+            )
+
+        def has_blocked_weights():
+            if self.quant_config is not None:
+                if hasattr(self.quant_config, "weight_block_size"):
+                    return self.quant_config.weight_block_size is not None
+                elif hasattr(self.quant_config, "has_blocked_weights"):
+                    return self.quant_config.has_blocked_weights()
+            return False
+
+        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
+        # On H100 the CUDA kernel is faster than the native implementation.
+        # https://github.com/vllm-project/vllm/issues/25094
+        if has_blocked_weights():
+            custom_ops = self.compilation_config.custom_ops
+            if "-quant_fp8" not in custom_ops:
+                custom_ops.append("+quant_fp8")
+
+        if self.compilation_config.mode is None:
+            if self.model_config is not None and not self.model_config.enforce_eager:
+                if self.optimization_level > OptimizationLevel.O0:
+                    self.compilation_config.mode = CompilationMode.VLLM_COMPILE
+                else:
+                    self.compilation_config.mode = CompilationMode.NONE
+
         # If the user does not set custom ops via none or all, set it here
         # based on compilation mode and backend.
         if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
             if (
                 self.compilation_config.backend == "inductor"
@@ -454,23 +631,33 @@ class VllmConfig:
else:
self.compilation_config.custom_ops.append("all")
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
self._apply_optimization_level_defaults(default_config)
if (
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
):
logger.info(
"Cudagraph mode %s is not compatible with compilation mode %s."
"Overriding to NONE.",
self.compilation_config.cudagraph_mode,
self.compilation_config.mode,
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
# async tp is built on top of sequence parallelism
# and requires it to be enabled.
if self.compilation_config.pass_config.enable_async_tp:
self.compilation_config.pass_config.enable_sequence_parallelism = True
if self.compilation_config.pass_config.enable_sequence_parallelism:
if "-rms_norm" in self.compilation_config.custom_ops:
logger.warning(
"RMS norm force disabled, sequence parallelism might break"
)
else:
self.compilation_config.custom_ops.append("+rms_norm")
if current_platform.support_static_graph_mode():
# if cudagraph_mode is not explicitly set by users, set default
# value
if self.compilation_config.cudagraph_mode is None:
if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
# default to full and piecewise for most models
self.compilation_config.cudagraph_mode = (
CUDAGraphMode.FULL_AND_PIECEWISE
)
else:
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
# if cudagraph_mode has full cudagraphs, we need to check support
if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
# decode context parallel does not support full cudagraphs