[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
e66d787bce
commit
96b9aa5aa0
@@ -4,7 +4,7 @@
|
||||
from vllm.config.cache import CacheConfig
|
||||
from vllm.config.compilation import (
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
CUDAGraphMode,
|
||||
PassConfig,
|
||||
)
|
||||
@@ -49,7 +49,7 @@ __all__ = [
|
||||
"CacheConfig",
|
||||
# From vllm.config.compilation
|
||||
"CompilationConfig",
|
||||
"CompilationLevel",
|
||||
"CompilationMode",
|
||||
"CUDAGraphMode",
|
||||
"PassConfig",
|
||||
# From vllm.config.device
|
||||
|
||||
@@ -26,12 +26,20 @@ else:
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class CompilationLevel:
|
||||
# constants for the levels of the compilation process
|
||||
NO_COMPILATION = 0
|
||||
DYNAMO_AS_IS = 1
|
||||
DYNAMO_ONCE = 2
|
||||
PIECEWISE = 3
|
||||
class CompilationMode:
|
||||
"""The compilation approach used for torch.compile-based compilation of the
|
||||
model."""
|
||||
|
||||
NONE = 0
|
||||
"""No torch.compile compilation is applied, model runs in fully eager pytorch mode.
|
||||
The model runs as-is."""
|
||||
STOCK_TORCH_COMPILE = 1
|
||||
"""The standard `torch.compile` compilation pipeline."""
|
||||
DYNAMO_TRACE_ONCE = 2
|
||||
"""Single Dynamo trace through the model, avoiding recompilation."""
|
||||
VLLM_COMPILE = 3
|
||||
"""Custom vLLM Inductor-based backend with caching, piecewise compilation,
|
||||
shape specialization, and custom passes."""
|
||||
|
||||
|
||||
class CUDAGraphMode(enum.Enum):
|
||||
@@ -134,7 +142,7 @@ class CompilationConfig:
|
||||
"""Configuration for compilation. It has three parts:
|
||||
|
||||
- Top-level Compilation control:
|
||||
- [`level`][vllm.config.CompilationConfig.level]
|
||||
- [`mode`][vllm.config.CompilationConfig.mode]
|
||||
- [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
|
||||
- [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
|
||||
- [`backend`][vllm.config.CompilationConfig.backend]
|
||||
@@ -171,14 +179,26 @@ class CompilationConfig:
|
||||
|
||||
# Top-level Compilation control
|
||||
level: int | None = None
|
||||
"""The level of compilation:
|
||||
"""
|
||||
Level is deprecated and will be removed in the next release,
|
||||
either 0.12.0 or 0.11.2 whichever is soonest.
|
||||
Please use mode. Currently all levels are mapped to mode.
|
||||
"""
|
||||
# Top-level Compilation control
|
||||
mode: int | None = None
|
||||
"""The compilation approach used for torch.compile-based compilation of the
|
||||
model.
|
||||
|
||||
- None: If None, we will select the default compilation level.
|
||||
For V1 engine this is 3, for V0 engine this is 0.
|
||||
- 0: no compilation.
|
||||
- 1: dynamo as is.
|
||||
- 2: dynamo once.
|
||||
- 3: piecewise compilation."""
|
||||
- None: If None, we will select the default compilation mode.
|
||||
For V1 engine this is 3.
|
||||
- 0: NONE: No torch.compile compilation is applied, model runs in fully
|
||||
eager pytorch mode. The model runs as-is.
|
||||
- 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline.
|
||||
- 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding
|
||||
recompilation by removing guards.
|
||||
Requires no dynamic-shape-dependent control-flow.
|
||||
- 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching,
|
||||
piecewise compilation, shape specialization, and custom passes."""
|
||||
debug_dump_path: Path | None = None
|
||||
"""The path to dump the debug information."""
|
||||
cache_dir: str = ""
|
||||
@@ -195,11 +215,11 @@ class CompilationConfig:
|
||||
|
||||
backend function.
|
||||
We use string to avoid serialization issues when using compilation in a
|
||||
distributed setting. When the compilation level is 1 or 2, the backend is
|
||||
distributed setting. When the compilation mode is 1 or 2, the backend is
|
||||
used for the compilation directly (it sees the whole graph). When the
|
||||
compilation level is 3, the backend is used for the piecewise compilation
|
||||
compilation mode is 3, the backend is used for the piecewise compilation
|
||||
(it sees a part of the graph). The backend can not be custom for compilation
|
||||
level 3, i.e. the backend must be either eager or inductor. Furthermore,
|
||||
mode 3, i.e. the backend must be either eager or inductor. Furthermore,
|
||||
compilation is only piecewise if splitting ops is set accordingly and
|
||||
use_inductor_graph_partition is off. Note that the default options for
|
||||
splitting ops are sufficient for piecewise compilation.
|
||||
@@ -214,7 +234,7 @@ class CompilationConfig:
|
||||
- 'none,+op1,+op2' to enable only op1 and op2
|
||||
|
||||
By default, all custom ops are enabled when running without Inductor and
|
||||
disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
|
||||
disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
|
||||
Inductor generates (fused) Triton kernels for disabled custom ops."""
|
||||
splitting_ops: list[str] | None = None
|
||||
"""A list of ops to exclude from cudagraphs, used in piecewise compilation.
|
||||
@@ -249,7 +269,7 @@ class CompilationConfig:
|
||||
One graph for symbolic shape and one graph per size in compile_sizes
|
||||
are compiled using configurations in inductor_compile_config.
|
||||
|
||||
This setting is ignored if level<PIECEWISE.
|
||||
This setting is ignored if mode<VLLM_COMPILE.
|
||||
|
||||
For future compatibility:
|
||||
If use_inductor is True, backend="inductor" otherwise backend="eager".
|
||||
@@ -299,7 +319,7 @@ class CompilationConfig:
|
||||
Currently, the cudagraph mode is only used for the v1 engine.
|
||||
Note that the cudagraph logic is generally orthogonal to the
|
||||
compilation logic. While piecewise cudagraphs require piecewise
|
||||
compilation (level=PIECEWISE and non-empty splitting_ops), full
|
||||
compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full
|
||||
cudagraphs are supported with and without compilation.
|
||||
|
||||
Warning: This flag is new and subject to change in addition
|
||||
@@ -312,7 +332,7 @@ class CompilationConfig:
|
||||
that all input buffers have fixed addresses, and all
|
||||
splitting ops write their outputs to input buffers.
|
||||
In the vLLM V1 Engine, this flag only applies for
|
||||
CompilationLevel.PIECEWISE (aka -O3).
|
||||
CompilationMode.VLLM_COMPILE (aka -O3).
|
||||
Note that this is orthogonal to the cudagraph capture logic
|
||||
outside of compilation.
|
||||
Warning: This flag is deprecated and will be removed in the next major or
|
||||
@@ -426,7 +446,7 @@ class CompilationConfig:
|
||||
the final hidden states.
|
||||
"""
|
||||
factors: list[Any] = []
|
||||
factors.append(self.level)
|
||||
factors.append(self.mode)
|
||||
factors.append(self.backend)
|
||||
factors.append(self.custom_ops)
|
||||
factors.append(self.splitting_ops)
|
||||
@@ -477,6 +497,17 @@ class CompilationConfig:
|
||||
return value
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.level is not None:
|
||||
logger.warning(
|
||||
"Level is deprecated and will be removed in the next release,"
|
||||
"either 0.12.0 or 0.11.2 whichever is soonest."
|
||||
"Use mode instead."
|
||||
"If both level and mode are given,"
|
||||
"only mode will be used."
|
||||
)
|
||||
if self.mode is None:
|
||||
self.mode = self.level
|
||||
|
||||
count_none = self.custom_ops.count("none")
|
||||
count_all = self.custom_ops.count("all")
|
||||
assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
|
||||
@@ -574,7 +605,7 @@ class CompilationConfig:
|
||||
# Currently only the eager and inductor backends are supported
|
||||
# for piecewise compilation. Custom backends are not supported for
|
||||
# piecewise compilation. Update when more backends are supported.
|
||||
if self.level == CompilationLevel.PIECEWISE and self.backend not in [
|
||||
if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
|
||||
"",
|
||||
"eager",
|
||||
"inductor",
|
||||
@@ -602,24 +633,27 @@ class CompilationConfig:
|
||||
Returns:
|
||||
The backend for the compilation config.
|
||||
"""
|
||||
if self.level is None:
|
||||
if self.mode is None:
|
||||
raise ValueError(
|
||||
"No compilation level is set. This method should only be \
|
||||
"No compilation mode is set. This method should only be \
|
||||
called via vllm config where the level is set if none is \
|
||||
provided."
|
||||
)
|
||||
if self.level == CompilationLevel.NO_COMPILATION:
|
||||
raise ValueError("No compilation level is set.")
|
||||
if self.mode == CompilationMode.NONE:
|
||||
raise ValueError("No compilation mode is set.")
|
||||
|
||||
from torch._dynamo.backends.registry import list_backends
|
||||
|
||||
torch_backends = list_backends(exclude_tags=tuple())
|
||||
if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
|
||||
if self.mode in [
|
||||
CompilationMode.STOCK_TORCH_COMPILE,
|
||||
CompilationMode.DYNAMO_TRACE_ONCE,
|
||||
]:
|
||||
if self.backend in torch_backends:
|
||||
return self.backend
|
||||
return resolve_obj_by_qualname(self.backend)
|
||||
|
||||
assert self.level == CompilationLevel.PIECEWISE
|
||||
assert self.mode == CompilationMode.VLLM_COMPILE
|
||||
if self.backend not in ["eager", "inductor"]:
|
||||
raise ValueError(
|
||||
f"Invalid backend for piecewise compilation: {self.backend}"
|
||||
@@ -684,11 +718,11 @@ class CompilationConfig:
|
||||
self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size
|
||||
|
||||
def set_splitting_ops_for_v1(self):
|
||||
# NOTE: this function needs to be called only when level is
|
||||
# CompilationLevel.PIECEWISE
|
||||
assert self.level == CompilationLevel.PIECEWISE, (
|
||||
# NOTE: this function needs to be called only when mode is
|
||||
# CompilationMode.VLLM_COMPILE
|
||||
assert self.mode == CompilationMode.VLLM_COMPILE, (
|
||||
"set_splitting_ops_for_v1 should only be called when "
|
||||
"level is CompilationLevel.PIECEWISE"
|
||||
"mode is CompilationMode.VLLM_COMPILE"
|
||||
)
|
||||
|
||||
if self.use_inductor_graph_partition:
|
||||
@@ -769,12 +803,10 @@ class CompilationConfig:
|
||||
|
||||
if not self.use_inductor_graph_partition:
|
||||
# Dynamo-level FX split case
|
||||
return self.level == CompilationLevel.PIECEWISE
|
||||
return self.mode == CompilationMode.VLLM_COMPILE
|
||||
|
||||
# Inductor partition case
|
||||
return (
|
||||
self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION
|
||||
)
|
||||
return self.backend == "inductor" and self.mode > CompilationMode.NONE
|
||||
|
||||
def custom_op_log_check(self):
|
||||
"""
|
||||
|
||||
@@ -22,7 +22,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
from .cache import CacheConfig
|
||||
from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode
|
||||
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
|
||||
from .device import DeviceConfig
|
||||
from .kv_events import KVEventsConfig
|
||||
from .kv_transfer import KVTransferConfig
|
||||
@@ -84,17 +84,11 @@ class VllmConfig:
|
||||
compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
|
||||
"""`torch.compile` and cudagraph capture configuration for the model.
|
||||
|
||||
As a shorthand, `-O<n>` can be used to directly specify the compilation
|
||||
level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
|
||||
Currently, -O <n> and -O=<n> are supported as well but this will likely be
|
||||
removed in favor of clearer -O<n> syntax in the future.
|
||||
|
||||
NOTE: level 0 is the default level without any optimization. level 1 and 2
|
||||
are for internal testing only. level 3 is the recommended level for
|
||||
production, also default in V1.
|
||||
As a shorthand, one can append compilation arguments via
|
||||
-O.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`).
|
||||
|
||||
You can specify the full compilation config like so:
|
||||
`{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
||||
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
||||
"""
|
||||
kv_transfer_config: KVTransferConfig | None = None
|
||||
"""The configurations for distributed KV cache transfer."""
|
||||
@@ -305,33 +299,33 @@ class VllmConfig:
|
||||
"precision for chunked prefill triton kernels."
|
||||
)
|
||||
|
||||
# If the user does not explicitly set a compilation level, then
|
||||
# we use the default level. The default level depends on other
|
||||
# If the user does not explicitly set a compilation mode, then
|
||||
# we use the default mode. The default mode depends on other
|
||||
# settings (see the below code).
|
||||
if self.compilation_config.level is None:
|
||||
if self.compilation_config.mode is None:
|
||||
if envs.VLLM_USE_V1:
|
||||
if (
|
||||
self.model_config is not None
|
||||
and not self.model_config.enforce_eager
|
||||
):
|
||||
self.compilation_config.level = CompilationLevel.PIECEWISE
|
||||
self.compilation_config.mode = CompilationMode.VLLM_COMPILE
|
||||
else:
|
||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
self.compilation_config.mode = CompilationMode.NONE
|
||||
|
||||
else:
|
||||
# NB: Passing both --enforce-eager and a compilation level
|
||||
# in V0 means the compilation level wins out.
|
||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
# NB: Passing both --enforce-eager and a compilation mode
|
||||
# in V0 means the compilation mode wins out.
|
||||
self.compilation_config.mode = CompilationMode.NONE
|
||||
else:
|
||||
assert self.compilation_config.level >= CompilationLevel.NO_COMPILATION
|
||||
assert self.compilation_config.level <= CompilationLevel.PIECEWISE
|
||||
assert self.compilation_config.mode >= CompilationMode.NONE
|
||||
assert self.compilation_config.mode <= CompilationMode.VLLM_COMPILE
|
||||
|
||||
# If user does not set custom ops via none or all set it here based on
|
||||
# compilation level and backend.
|
||||
# compilation mode and backend.
|
||||
if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
|
||||
if (
|
||||
self.compilation_config.backend == "inductor"
|
||||
and self.compilation_config.level > CompilationLevel.NO_COMPILATION
|
||||
and self.compilation_config.mode > CompilationMode.NONE
|
||||
):
|
||||
self.compilation_config.custom_ops.append("none")
|
||||
else:
|
||||
@@ -350,7 +344,7 @@ class VllmConfig:
|
||||
if self.compilation_config.cudagraph_mode is None:
|
||||
if (
|
||||
envs.VLLM_USE_V1
|
||||
and self.compilation_config.level == CompilationLevel.PIECEWISE
|
||||
and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
||||
):
|
||||
# default to full and piecewise for most models
|
||||
self.compilation_config.cudagraph_mode = (
|
||||
@@ -486,10 +480,10 @@ class VllmConfig:
|
||||
)
|
||||
current_platform.check_and_update_config(self)
|
||||
|
||||
# Do this after all the updates to compilation_config.level
|
||||
# Do this after all the updates to compilation_config.mode
|
||||
if (
|
||||
envs.VLLM_USE_V1
|
||||
and self.compilation_config.level == CompilationLevel.PIECEWISE
|
||||
and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
||||
):
|
||||
self.compilation_config.set_splitting_ops_for_v1()
|
||||
|
||||
@@ -508,8 +502,8 @@ class VllmConfig:
|
||||
)
|
||||
|
||||
if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
|
||||
assert self.compilation_config.level == CompilationLevel.PIECEWISE, (
|
||||
"Compilation level should be CompilationLevel.PIECEWISE "
|
||||
assert self.compilation_config.mode == CompilationMode.VLLM_COMPILE, (
|
||||
"Compilation mode should be CompilationMode.VLLM_COMPILE "
|
||||
"when cudagraph_mode piecewise cudagraphs is used, "
|
||||
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
|
||||
)
|
||||
@@ -837,7 +831,7 @@ def set_current_vllm_config(
|
||||
|
||||
if (
|
||||
check_compile
|
||||
and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
|
||||
and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
||||
and compilation_counter.num_models_seen == num_models_seen
|
||||
):
|
||||
# If the model supports compilation,
|
||||
|
||||
Reference in New Issue
Block a user