2025-08-09 00:34:25 +01:00
|
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
import enum
|
2025-08-09 00:34:25 +01:00
|
|
|
|
from collections import Counter
|
|
|
|
|
|
from collections.abc import Callable
|
2025-12-04 04:15:04 -05:00
|
|
|
|
from dataclasses import field
|
2025-09-28 00:09:00 +08:00
|
|
|
|
from pathlib import Path
|
2025-11-03 08:13:51 -08:00
|
|
|
|
from typing import TYPE_CHECKING, Any, ClassVar, Literal
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
2026-02-03 17:40:59 +00:00
|
|
|
|
from pydantic import Field, TypeAdapter, field_validator
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
2025-11-03 08:13:51 -08:00
|
|
|
|
import vllm.envs as envs
|
2026-02-06 07:19:49 -05:00
|
|
|
|
from vllm.compilation.passes.inductor_pass import CallableInductorPass, InductorPass
|
2025-12-05 19:17:32 +01:00
|
|
|
|
from vllm.config.utils import (
|
|
|
|
|
|
Range,
|
|
|
|
|
|
config,
|
|
|
|
|
|
get_hash_factors,
|
|
|
|
|
|
hash_factors,
|
|
|
|
|
|
)
|
2025-08-09 00:34:25 +01:00
|
|
|
|
from vllm.logger import init_logger
|
2025-10-13 18:47:16 -04:00
|
|
|
|
from vllm.platforms import current_platform
|
2025-10-17 08:48:59 +08:00
|
|
|
|
from vllm.utils.import_utils import resolve_obj_by_qualname
|
2025-11-17 09:41:22 -05:00
|
|
|
|
from vllm.utils.math_utils import round_up
|
2025-10-19 00:48:22 +08:00
|
|
|
|
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
2025-08-09 16:33:46 +01:00
|
|
|
|
from vllm.config import VllmConfig
|
2025-08-09 00:34:25 +01:00
|
|
|
|
else:
|
|
|
|
|
|
VllmConfig = object
|
|
|
|
|
|
|
|
|
|
|
|
logger = init_logger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-11-11 16:46:18 -08:00
|
|
|
|
class CompilationMode(enum.IntEnum):
    """Selects which torch.compile-based compilation approach is applied to
    the model.

    Members are plain integers (0-3), so they can be compared and ordered
    like the numeric levels they stand for.
    """

    NONE = 0
    """No torch.compile compilation is applied, model runs in fully eager pytorch mode.
    The model runs as-is."""

    STOCK_TORCH_COMPILE = 1
    """The standard `torch.compile` compilation pipeline."""

    DYNAMO_TRACE_ONCE = 2
    """Single Dynamo trace through the model, avoiding recompilation."""

    VLLM_COMPILE = 3
    """Custom vLLM Inductor-based backend with caching, piecewise compilation,
    shape specialization, and custom passes."""
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
|
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
class CUDAGraphMode(enum.Enum):
    """CUDA graph modes accepted by `CompilationConfig`.

    `NONE`, `PIECEWISE` and `FULL` carry a plain integer value and also serve
    as the concrete runtime modes used for cudagraph dispatching. The other
    members carry a `(decode_mode, mixed_mode)` tuple built from those
    integers, i.e. they describe a separate routine per batch kind.
    """

    NONE = 0
    PIECEWISE = 1
    FULL = 2
    # Composite modes: (value used by decode_mode(), value used by mixed_mode()).
    FULL_DECODE_ONLY = (FULL, NONE)
    FULL_AND_PIECEWISE = (FULL, PIECEWISE)

    def decode_mode(self) -> "CUDAGraphMode":
        """Return the first component of a composite mode, else `self`."""
        if not self.separate_routine():
            return self
        decode_value, _ = self.value
        return CUDAGraphMode(decode_value)

    def mixed_mode(self) -> "CUDAGraphMode":
        """Return the second component of a composite mode, else `self`."""
        if not self.separate_routine():
            return self
        _, mixed_value = self.value
        return CUDAGraphMode(mixed_value)

    def has_mode(self, mode: "CUDAGraphMode") -> bool:
        """Tell whether the non-composite `mode` is this mode or one of its
        components.

        `mode` itself must not be a composite mode (checked with an assert).
        """
        assert not mode.separate_routine()
        if not self.separate_routine():
            return self == mode
        return mode.value in self.value

    def requires_piecewise_compilation(self) -> bool:
        """Tell whether `PIECEWISE` is this mode or one of its components."""
        return self.has_mode(CUDAGraphMode.PIECEWISE)

    def max_cudagraph_mode(self) -> "CUDAGraphMode":
        """Return the component with the largest value, or `self` when the
        mode is not composite."""
        if self.separate_routine():
            return CUDAGraphMode(max(self.value))
        return self

    def has_full_cudagraphs(self) -> bool:
        """Tell whether the largest component of this mode is `FULL`."""
        largest = self.max_cudagraph_mode()
        return largest == CUDAGraphMode.FULL

    def has_piecewise_cudagraphs(self) -> bool:
        """Alias of `requires_piecewise_compilation()`."""
        return self.requires_piecewise_compilation()

    def separate_routine(self) -> bool:
        """Tell whether this is a composite mode (its value is a tuple)."""
        return isinstance(self.value, tuple)

    @classmethod
    def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
        """Return the set of modes usable as concrete runtime modes."""
        return frozenset((cls.NONE, cls.PIECEWISE, cls.FULL))

    def is_valid_runtime_mode(self) -> bool:
        """Tell whether this mode belongs to `valid_runtime_modes()`."""
        runtime_modes = CUDAGraphMode.valid_runtime_modes()
        return self in runtime_modes

    def __str__(self) -> str:
        """Render the mode as its bare member name."""
        return self.name
|
|
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
@config
class PassConfig:
    """Settings for the custom Inductor passes.

    Kept apart from the general `CompilationConfig` so that inductor passes
    do not all see the full configuration; that would create a cycle, since
    the `PassManager` is set as a property of the config.

    A PassConfig must reach the VLLMConfig constructor through the
    CompilationConfig constructor, because VLLMConfig's post_init performs
    further initialization. Used outside of VLLMConfig, some fields may be
    left in an improper state.
    """

    # Pass toggles. Fields declared with `Field(default=None)` may stay None
    # at construction; `_skip_none_validation` lets None through unvalidated.
    fuse_norm_quant: bool = Field(default=None)
    """Fuse the custom RMSNorm + quant ops."""
    fuse_act_quant: bool = Field(default=None)
    """Fuse the custom SiluMul + quant ops."""
    fuse_attn_quant: bool = Field(default=None)
    """Fuse the custom attention + quant ops."""
    eliminate_noops: bool = Field(default=True)
    """Eliminate no-op ops."""
    enable_sp: bool = Field(default=None)
    """Enable sequence parallelism. Requires TP>1. Automatically disabled
    if the model's hidden_size is too small for SP to be beneficial
    (threshold is device-capability dependent)."""
    fuse_gemm_comms: bool = Field(default=None)
    """Enable async TP."""
    fuse_allreduce_rms: bool = Field(default=None)
    """Enable flashinfer allreduce fusion."""
    enable_qk_norm_rope_fusion: bool = False
    """Enable fused Q/K RMSNorm + RoPE pass."""

    # ROCm/AITER specific fusions
    fuse_act_padding: bool = Field(default=None)
    """Fuse the custom RMSNorm + padding ops."""
    fuse_rope_kvcache: bool = Field(default=None)
    """Fuse the QK rope + KV cache ops."""

    rope_kvcache_fusion_max_token_num: int = 256
    """The threshold for ROCm AITER RoPE+KVCache fusion e.g. for small batch decode.
    Larger batch sizes e.g. during prefill will use the unfused kernels.
    """

    fi_allreduce_fusion_max_size_mb: float | None = None
    """The threshold of the communicated tensor sizes under which
    vllm should use flashinfer fused allreduce. Specified as a
    float in MB.
    Unspecified will fallback to default values
    which are compute capability and world size dependent.
        FI_ALLREDUCE_FUSION_MAX_SIZE_MB = {
            90: {
                2: 64,  # 64MB
                4: 2,  # 2MB
                8: 1,  # 1MB
            },
            100: {
                2: 64,  # 64MB
                4: 32,  # 32MB
                8: 1,  # 1MB
            },
        }, where key is the device capability"""
    sp_min_token_num: int | None = None
    """The minimum number of tokens above which vllm should use
    sequence parallelism. Specified as an integer token count.
    Unspecified will fallback to default values which are compute
    capability and world size dependent."""

    # TODO(luka) better pass enabling system.

    def flashinfer_max_size(self, world_size: int) -> int | None:
        """Give the flashinfer allreduce fusion size limit in bytes.

        The explicit `fi_allreduce_fusion_max_size_mb` wins when it is set;
        otherwise the per-world-size default is looked up.

        Returns:
            The limit in bytes for `world_size`, or None when the world size
            is not one of the supported ones (2, 4, 8) or no limit is known.
        """
        supported_world_sizes = (2, 4, 8)
        if world_size not in supported_world_sizes:
            return None

        size_mb = self.fi_allreduce_fusion_max_size_mb
        if size_mb is None:
            defaults = self.default_fi_allreduce_fusion_max_size_mb()
            size_mb = defaults.get(world_size)
        if size_mb is None:
            return None

        bytes_per_mib = 1024 * 1024
        return int(size_mb * bytes_per_mib)

    @staticmethod
    def default_fi_allreduce_fusion_max_size_mb() -> dict[int, float]:
        """Look up the default size limits (in MB) keyed by world size.

        Returns an empty dict when the current platform is not CUDA or when
        the table has no entry for the device capability.
        """
        # Imported here rather than at module level.
        from vllm.compilation.passes.fusion.allreduce_rms_fusion import (
            FI_ALLREDUCE_FUSION_MAX_SIZE_MB,
        )
        from vllm.platforms import current_platform

        if current_platform.is_cuda():
            capability = current_platform.get_device_capability().to_int()
            return FI_ALLREDUCE_FUSION_MAX_SIZE_MB.get(capability, {})
        return {}

    def compute_hash(self) -> str:
        """
        Build the hash that identifies this pass configuration.

        No field is excluded at the moment. New fields that affect
        compilation should be part of the hash; future fields that do not
        affect compilation should be excluded.
        """
        factors = get_hash_factors(self, set())
        return hash_factors(factors)

    @field_validator(
        "fuse_norm_quant",
        "fuse_act_quant",
        "fuse_attn_quant",
        "enable_sp",
        "fuse_gemm_comms",
        "fuse_allreduce_rms",
        "fuse_act_padding",
        "fuse_rope_kvcache",
        mode="wrap",
    )
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Let `None` through untouched (delayed initialisation); validate
        every other value with the wrapped handler."""
        if value is not None:
            return handler(value)
        return value

    def __post_init__(self) -> None:
        """Warn about fusions enabled without no-op elimination and switch
        off fusions the current platform does not support."""
        if not self.eliminate_noops:
            # (fusion enabled?, warning text) pairs, reported in this order.
            noop_dependent_fusions = (
                (
                    self.fuse_norm_quant or self.fuse_act_quant,
                    "Fusion enabled but reshape elimination disabled. "
                    "RMSNorm/SiluMul + quant (fp8) fusion might not work",
                ),
                (
                    self.fuse_attn_quant,
                    "Fusion enabled but reshape elimination disabled. "
                    "Attention + quant (fp8) fusion might not work",
                ),
                (
                    self.fuse_allreduce_rms,
                    "Fusion enabled but reshape elimination disabled. "
                    "Allreduce + rms norm + quant (fp8) fusion might not work",
                ),
                (
                    self.fuse_act_padding,
                    "Fusion enabled but reshape elimination disabled. "
                    "RMSNorm + padding fusion might not work",
                ),
            )
            for fusion_enabled, message in noop_dependent_fusions:
                if fusion_enabled:
                    logger.warning_once(message)

        # The platform is only queried when the corresponding flag is set.
        if self.enable_qk_norm_rope_fusion:
            if not current_platform.is_cuda_alike():
                logger.warning_once(
                    "QK Norm + RoPE fusion enabled but the current platform is not "
                    "CUDA or ROCm. The fusion will be disabled."
                )
                self.enable_qk_norm_rope_fusion = False

        if self.fuse_act_padding:
            if not current_platform.is_rocm():
                logger.warning_once(
                    "Padding fusion enabled but the current platform is not ROCm. "
                    "The fusion will be disabled."
                )
                self.fuse_act_padding = False

        if self.fuse_rope_kvcache:
            if not current_platform.is_rocm():
                logger.warning_once(
                    "KV cache fusion currently only enabled on ROCm. "
                    "The fusion will be disabled."
                )
                self.fuse_rope_kvcache = False
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
|
|
|
|
|
|
2025-11-24 07:12:41 -08:00
|
|
|
|
class DynamicShapesType(str, enum.Enum):
    """The ways torch.compile() can handle dynamic shapes.

    For more details, see "Dynamic shapes and vllm guard dropping" in
    torch_compile.md. Members are also `str` instances equal to their value.
    """

    BACKED = "backed"
    """Use backed dynamic shapes. torch.compile() guards on backed dynamic
    shapes and may add guards. Symbols are specialized to 0, 1, or >=2 even
    without encountering branching on those ranges."""

    UNBACKED = "unbacked"
    """Use unbacked dynamic shapes. Guaranteed not to be guarded on and not
    0/1 specialized, but may throw data dependent errors when branches require
    their value without explicit unbacked handling."""

    BACKED_SIZE_OBLIVIOUS = "backed_size_oblivious"
    """Experimental flag that treats backed symbols as unbacked when explicit
    unbacked handling is defined."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@config
class DynamicShapesConfig:
    """Configuration to control/debug torch compile dynamic shapes."""

    type: DynamicShapesType = DynamicShapesType.BACKED
    """Controls the type of dynamic shapes handling to use with torch.compile().

    - BACKED: Default PyTorch behavior with potential guards ignored.
    - UNBACKED: No guards guaranteed (most sound) but may throw
      data dependent errors.
    - BACKED_SIZE_OBLIVIOUS: Experimental safer alternative to
      backed/unbacked.
    """

    evaluate_guards: bool = False
    """
    A debug mode to detect and fail if Dynamo ever specializes a dynamic shape by
    guarding on it. When True, dynamic shape guards are not dropped from dynamo,
    and a failure will be triggered if a recompilation ever happens due to that.
    This mode requires VLLM_USE_BYTECODE_HOOK to be 0.
    Enabling this also allows observing the dynamic shape guards in the tlparse
    artifacts.
    When type is backed, aot_compile must be disabled for this mode to work,
    until https://github.com/pytorch/pytorch/pull/169239 is picked up.
    """

    assume_32_bit_indexing: bool = False
    """
    Whether all tensor sizes can use 32 bit indexing.
    `True` requires PyTorch 2.10+
    """

    def compute_hash(self) -> str:
        """
        Provide a hash for DynamicShapesConfig.

        Every field contributes to the hash; the set of ignored factors is
        empty.
        """
        # `set()` rather than `{}`: `{}` is an empty dict, while the second
        # argument is a collection of ignored factor names (PassConfig passes
        # `set()` as well). Both helpers are imported at module level.
        return hash_factors(get_hash_factors(self, set()))
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
@config
|
|
|
|
|
|
class CompilationConfig:
|
2025-11-27 04:55:58 -05:00
|
|
|
|
"""Configuration for compilation.
|
|
|
|
|
|
|
|
|
|
|
|
You must pass CompilationConfig to VLLMConfig constructor.
|
|
|
|
|
|
VLLMConfig's post_init does further initialization. If used outside of the
|
|
|
|
|
|
VLLMConfig, some fields will be left in an improper state.
|
|
|
|
|
|
|
|
|
|
|
|
It has three parts:
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
|
|
|
|
|
- Top-level Compilation control:
|
2025-10-14 22:51:16 -04:00
|
|
|
|
- [`mode`][vllm.config.CompilationConfig.mode]
|
2025-08-09 00:34:25 +01:00
|
|
|
|
- [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
|
|
|
|
|
|
- [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
|
|
|
|
|
|
- [`backend`][vllm.config.CompilationConfig.backend]
|
|
|
|
|
|
- [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
|
|
|
|
|
|
- [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
|
2025-11-06 16:16:03 -08:00
|
|
|
|
- [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder]
|
2025-08-09 00:34:25 +01:00
|
|
|
|
- CudaGraph capture:
|
2025-08-15 22:01:39 +08:00
|
|
|
|
- [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
|
2025-08-09 00:34:25 +01:00
|
|
|
|
- [`cudagraph_capture_sizes`]
|
|
|
|
|
|
[vllm.config.CompilationConfig.cudagraph_capture_sizes]
|
2025-10-24 20:11:05 +08:00
|
|
|
|
- [`max_cudagraph_capture_size`]
|
|
|
|
|
|
[vllm.config.CompilationConfig.max_cudagraph_capture_size]
|
2025-08-09 00:34:25 +01:00
|
|
|
|
- [`cudagraph_num_of_warmups`]
|
|
|
|
|
|
[vllm.config.CompilationConfig.cudagraph_num_of_warmups]
|
|
|
|
|
|
- [`cudagraph_copy_inputs`]
|
|
|
|
|
|
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
|
|
|
|
|
|
- Inductor compilation:
|
|
|
|
|
|
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
|
2025-12-05 19:17:32 +01:00
|
|
|
|
- [`compile_ranges_split_points`]
|
|
|
|
|
|
[vllm.config.CompilationConfig.compile_ranges_split_points]
|
2025-08-09 00:34:25 +01:00
|
|
|
|
- [`inductor_compile_config`]
|
|
|
|
|
|
[vllm.config.CompilationConfig.inductor_compile_config]
|
|
|
|
|
|
- [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
|
|
|
|
|
|
- custom inductor passes
|
|
|
|
|
|
|
|
|
|
|
|
Why we have different sizes for cudagraph and inductor:
|
|
|
|
|
|
- cudagraph: a cudagraph captured for a specific size can only be used
|
|
|
|
|
|
for the same size. We need to capture all the sizes we want to use.
|
|
|
|
|
|
- inductor: a graph compiled by inductor for a general shape can be used
|
|
|
|
|
|
for different sizes. Inductor can also compile for specific sizes,
|
|
|
|
|
|
where it can have more information to optimize the graph with fully
|
|
|
|
|
|
static shapes. However, we find the general shape compilation is
|
|
|
|
|
|
sufficient for most cases. It might be beneficial to compile for
|
|
|
|
|
|
certain small batchsizes, where inductor is good at optimizing.
|
|
|
|
|
|
"""
|
2025-10-05 15:06:22 +01:00
|
|
|
|
|
2025-10-14 22:51:16 -04:00
|
|
|
|
# Top-level Compilation control
|
2025-11-27 04:55:58 -05:00
|
|
|
|
mode: CompilationMode = Field(default=None)
|
2025-10-14 22:51:16 -04:00
|
|
|
|
"""The compilation approach used for torch.compile-based compilation of the
|
|
|
|
|
|
model.
|
|
|
|
|
|
|
|
|
|
|
|
- None: If None, we will select the default compilation mode.
|
|
|
|
|
|
For V1 engine this is 3.
|
|
|
|
|
|
- 0: NONE: No torch.compile compilation is applied, model runs in fully
|
|
|
|
|
|
eager pytorch mode. The model runs as-is.
|
|
|
|
|
|
- 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline.
|
|
|
|
|
|
- 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding
|
|
|
|
|
|
recompilation by removing guards.
|
|
|
|
|
|
Requires no dynamic-shape-dependent control-flow.
|
|
|
|
|
|
- 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching,
|
|
|
|
|
|
piecewise compilation, shape specialization, and custom passes."""
|
2025-09-28 00:09:00 +08:00
|
|
|
|
debug_dump_path: Path | None = None
|
2025-08-09 00:34:25 +01:00
|
|
|
|
"""The path to dump the debug information."""
|
|
|
|
|
|
cache_dir: str = ""
|
|
|
|
|
|
"""The directory to store the compiled graph, to accelerate Inductor
|
|
|
|
|
|
compilation. By default, it will use model-related information to generate
|
|
|
|
|
|
a cache directory."""
|
2025-11-03 08:13:51 -08:00
|
|
|
|
compile_cache_save_format: Literal["binary", "unpacked"] = field(
|
|
|
|
|
|
default_factory=lambda: envs.VLLM_COMPILE_CACHE_SAVE_FORMAT
|
|
|
|
|
|
)
|
|
|
|
|
|
"""Format for saving torch compile cache:\n
|
|
|
|
|
|
- "binary": saves as binary file (multiprocess safe)\n
|
|
|
|
|
|
- "unpacked": saves as directory structure for inspection/debugging
|
|
|
|
|
|
(NOT multiprocess safe)\n
|
|
|
|
|
|
Defaults to `VLLM_COMPILE_CACHE_SAVE_FORMAT` if not specified.
|
|
|
|
|
|
"""
|
2025-10-09 20:43:55 +08:00
|
|
|
|
backend: str = ""
|
2025-08-09 00:34:25 +01:00
|
|
|
|
"""The backend for compilation. It needs to be a string:
|
|
|
|
|
|
|
2025-10-13 18:47:16 -04:00
|
|
|
|
- "" (empty string): use the default backend ("inductor" on CUDA-alike
|
|
|
|
|
|
platforms).
|
2025-08-09 00:34:25 +01:00
|
|
|
|
- "eager"/"openxla"/...: use the specified backend registered in PyTorch.
|
|
|
|
|
|
- "full.module.name": a qualified name which can be used to import the
|
|
|
|
|
|
|
|
|
|
|
|
backend function.
|
|
|
|
|
|
We use string to avoid serialization issues when using compilation in a
|
2025-10-14 22:51:16 -04:00
|
|
|
|
distributed setting. When the compilation mode is 1 or 2, the backend is
|
2025-08-09 00:34:25 +01:00
|
|
|
|
used for the compilation directly (it sees the whole graph). When the
|
2025-12-08 07:46:15 -08:00
|
|
|
|
compilation mode is 3, the backend supports both whole graph and piecewise
|
|
|
|
|
|
compilation, available backends include eager, inductor, and custom backends,
|
2025-11-25 15:25:15 +08:00
|
|
|
|
the latter of which can be defined via `get_compile_backend`. Furthermore,
|
2025-10-13 18:47:16 -04:00
|
|
|
|
compilation is only piecewise if splitting ops is set accordingly and
|
2025-10-14 19:55:02 -04:00
|
|
|
|
use_inductor_graph_partition is off. Note that the default options for
|
2025-10-13 18:47:16 -04:00
|
|
|
|
splitting ops are sufficient for piecewise compilation.
|
|
|
|
|
|
"""
|
2025-08-09 00:34:25 +01:00
|
|
|
|
custom_ops: list[str] = field(default_factory=list)
|
|
|
|
|
|
"""Fine-grained control over which custom ops to enable/disable. Use 'all'
|
|
|
|
|
|
to enable all, 'none' to disable all. Also specify a list of custom op
|
|
|
|
|
|
names to enable (prefixed with a '+'), or disable (prefixed with a '-').
|
|
|
|
|
|
Examples:
|
|
|
|
|
|
|
|
|
|
|
|
- 'all,-op1' to enable all except op1
|
|
|
|
|
|
- 'none,+op1,+op2' to enable only op1 and op2
|
|
|
|
|
|
|
|
|
|
|
|
By default, all custom ops are enabled when running without Inductor and
|
2026-01-10 00:21:11 +08:00
|
|
|
|
disabled when running with Inductor: mode>CompilationMode.NONE and
|
|
|
|
|
|
backend="inductor".
|
2025-08-09 00:34:25 +01:00
|
|
|
|
Inductor generates (fused) Triton kernels for disabled custom ops."""
|
2025-08-15 22:01:39 +08:00
|
|
|
|
splitting_ops: list[str] | None = None
|
2025-10-10 12:35:28 -04:00
|
|
|
|
"""A list of ops to exclude from cudagraphs, used in piecewise compilation.
|
|
|
|
|
|
|
|
|
|
|
|
The behavior depends on use_inductor_graph_partition:
|
|
|
|
|
|
|
|
|
|
|
|
- When use_inductor_graph_partition=False (default):
|
|
|
|
|
|
These ops are used for Dynamo FX-level graph splitting. The graph is
|
|
|
|
|
|
split at these ops before Inductor compilation, creating separate
|
|
|
|
|
|
subgraphs for cudagraph capture.
|
|
|
|
|
|
|
|
|
|
|
|
- When use_inductor_graph_partition=True:
|
|
|
|
|
|
These ops are used to register Inductor partition rules. The graph
|
|
|
|
|
|
partitioning happens at Inductor codegen time after all passes and
|
|
|
|
|
|
fusions are finished, allowing compilation and custom passes to operate
|
|
|
|
|
|
on the full graph while still excluding these ops from cudagraphs.
|
|
|
|
|
|
|
|
|
|
|
|
If None, defaults to attention ops for piecewise cudagraphs.
|
|
|
|
|
|
If empty list [], no ops are excluded (suitable for full cudagraphs)."""
|
2025-11-13 08:38:08 -08:00
|
|
|
|
compile_mm_encoder: bool = False
|
2025-11-06 20:23:17 -08:00
|
|
|
|
"""Whether or not to compile the multimodal encoder.
|
2026-01-09 19:01:38 -08:00
|
|
|
|
Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models
|
|
|
|
|
|
on selected platforms. Disabled by default until more models
|
|
|
|
|
|
are supported/tested to work."""
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
|
|
|
|
|
# Inductor capture
|
|
|
|
|
|
compile_sizes: list[int | str] | None = None
|
|
|
|
|
|
"""Sizes to compile for inductor. In addition
|
|
|
|
|
|
to integers, it also supports "cudagraph_capture_sizes" to
|
|
|
|
|
|
specify the sizes for cudagraph capture."""
|
2025-11-24 07:12:41 -08:00
|
|
|
|
|
2025-12-05 19:17:32 +01:00
|
|
|
|
compile_ranges_split_points: list[int] | None = None
|
|
|
|
|
|
"""Split points that represent compile ranges for inductor.
|
2025-12-17 23:22:23 -05:00
|
|
|
|
The compile ranges are
|
|
|
|
|
|
[1, split_points[0]],
|
|
|
|
|
|
[split_points[0] + 1, split_points[1]], ...,
|
2025-12-05 19:17:32 +01:00
|
|
|
|
[split_points[-1] + 1, max_num_batched_tokens].
|
|
|
|
|
|
Compile sizes are also used single element ranges,
|
|
|
|
|
|
the range is represented as [compile_sizes[i], compile_sizes[i]].
|
2025-12-17 23:22:23 -05:00
|
|
|
|
|
|
|
|
|
|
If a range overlaps with the compile size, graph for compile size
|
2025-12-05 19:17:32 +01:00
|
|
|
|
will be prioritized, i.e. if we have a range [1, 8] and a compile size 4,
|
|
|
|
|
|
graph for compile size 4 will be compiled and used instead of the graph
|
|
|
|
|
|
for range [1, 8].
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
inductor_compile_config: dict = field(default_factory=dict)
|
|
|
|
|
|
"""Additional configurations for inductor.
|
|
|
|
|
|
- None: use default configurations."""
|
2025-11-24 07:12:41 -08:00
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
inductor_passes: dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
"""Additional passes for inductor. It is a dictionary
|
|
|
|
|
|
from pass name to pass function qualified name. We use function
|
|
|
|
|
|
name because the config uses JSON format. If we pass the config
|
|
|
|
|
|
from Python, functions can also be passed directly via Python object
|
|
|
|
|
|
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
|
|
|
|
|
|
|
|
|
|
|
|
# CudaGraph compilation
|
2025-11-27 04:55:58 -05:00
|
|
|
|
cudagraph_mode: CUDAGraphMode = Field(default=None)
|
2025-08-15 22:01:39 +08:00
|
|
|
|
"""
|
2025-08-26 11:27:20 +01:00
|
|
|
|
The mode of the cudagraph:
|
|
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
- NONE, no cudagraph capture.
|
2025-09-23 15:29:26 -04:00
|
|
|
|
- PIECEWISE.
|
2025-08-15 22:01:39 +08:00
|
|
|
|
- FULL.
|
|
|
|
|
|
- FULL_DECODE_ONLY.
|
2025-09-23 15:29:26 -04:00
|
|
|
|
- FULL_AND_PIECEWISE. (v1 default)
|
2025-08-15 22:01:39 +08:00
|
|
|
|
|
|
|
|
|
|
PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
|
2025-09-03 11:44:50 +08:00
|
|
|
|
incompatible ops (i.e. some attention ops) outside the cudagraph
|
2025-08-15 22:01:39 +08:00
|
|
|
|
for general flexibility.
|
|
|
|
|
|
|
|
|
|
|
|
FULL mode: Capture full cudagraph for all batches. Can be good for small
|
|
|
|
|
|
models or workloads with small prompts; not supported by many backends.
|
|
|
|
|
|
Generally for performance FULL_AND_PIECEWISE is better.
|
2025-11-11 16:46:18 -08:00
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
FULL_DECODE_ONLY mode: Capture full cudagraph for decode batches only.
|
|
|
|
|
|
Mixed prefill-decode batches are run without cudagraphs. Can be good for
|
|
|
|
|
|
decode instances in a P/D setup where prefill is not as important so we
|
|
|
|
|
|
can save some memory.
|
2025-11-11 16:46:18 -08:00
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
|
|
|
|
|
|
piecewise cudagraph for prefill and mixed prefill-decode batches.
|
2025-09-23 15:29:26 -04:00
|
|
|
|
This is the most performant mode for most models and is the default.
|
2025-08-15 22:01:39 +08:00
|
|
|
|
|
|
|
|
|
|
Currently, the cudagraph mode is only used for the v1 engine.
|
2025-11-11 16:46:18 -08:00
|
|
|
|
Note that the cudagraph logic is generally orthogonal to the
|
|
|
|
|
|
compilation logic. While piecewise cudagraphs require piecewise
|
2025-10-14 22:51:16 -04:00
|
|
|
|
compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full
|
2025-08-15 22:01:39 +08:00
|
|
|
|
cudagraphs are supported with and without compilation.
|
2025-11-11 16:46:18 -08:00
|
|
|
|
|
|
|
|
|
|
Warning: This flag is new and subject to change; in addition,
|
2025-08-15 22:01:39 +08:00
|
|
|
|
more modes may be added.
|
|
|
|
|
|
"""
|
2025-08-09 00:34:25 +01:00
|
|
|
|
cudagraph_num_of_warmups: int = 0
|
|
|
|
|
|
"""Number of warmup runs for cudagraph.
|
|
|
|
|
|
It means the first several runs will be treated as warmup runs.
|
|
|
|
|
|
Only after that, the execution will be recorded, and the recorded
|
|
|
|
|
|
cudagraph will be used for subsequent runs."""
|
|
|
|
|
|
cudagraph_capture_sizes: list[int] | None = None
|
|
|
|
|
|
"""Sizes to capture cudagraph.
|
|
|
|
|
|
- None (default): capture sizes are inferred from vllm config.
|
|
|
|
|
|
- list[int]: capture sizes are specified as given."""
|
|
|
|
|
|
cudagraph_copy_inputs: bool = False
|
|
|
|
|
|
"""Whether to copy input tensors for
|
|
|
|
|
|
cudagraph. If the caller can guarantee that the same input buffers
|
|
|
|
|
|
are always used, it can set this to False. Otherwise, it should
|
|
|
|
|
|
set this to True, and the compiler will copy the input to an
|
2025-11-11 16:46:18 -08:00
|
|
|
|
internally managed buffer. Default is False.
|
2025-08-15 22:01:39 +08:00
|
|
|
|
Note that this flag is only effective when cudagraph_mode is PIECEWISE.
|
|
|
|
|
|
"""
|
2025-10-20 05:21:09 +01:00
|
|
|
|
cudagraph_specialize_lora: bool = True
|
|
|
|
|
|
"""Whether to create separate cuda graphs for cases with and without active
|
|
|
|
|
|
LoRA adapters. When set to False, the LoRA-enabled cuda graph will be used
|
|
|
|
|
|
for all cases, incurring the overhead of running LoRA ops even when no
|
|
|
|
|
|
adapters are active. Setting this to True will remove this overhead at the
|
|
|
|
|
|
cost of increased startup time and slightly higher memory usage.
|
|
|
|
|
|
When `enable_lora` is False, this option has no effect.
|
|
|
|
|
|
"""
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
2025-11-27 04:55:58 -05:00
|
|
|
|
use_inductor_graph_partition: bool = Field(default=None)
|
2025-09-19 18:02:15 -07:00
|
|
|
|
"""Use inductor graph partition to split the graph at cudagraph_unsafe ops.
|
|
|
|
|
|
This partition happens at inductor codegen time after all passes and fusions
|
|
|
|
|
|
are finished. It generates a single `call` function which wraps
|
|
|
|
|
|
cudagraph-safe ops into partition functions and leaves cudagraph-unsafe ops
|
|
|
|
|
|
outside the partition functions. For a graph with N cudagraph-unsafe ops
|
|
|
|
|
|
(e.g., Attention), there would be N+1 partitions. To mark an op as
|
|
|
|
|
|
cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when
|
2025-11-11 16:46:18 -08:00
|
|
|
|
registering the custom op.
|
2025-09-19 18:02:15 -07:00
|
|
|
|
|
|
|
|
|
|
This config supports both full cudagraph and piecewise cudagraph without
|
|
|
|
|
|
compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper
|
|
|
|
|
|
to each partition. For N+1 partitions, there would be N+1
|
|
|
|
|
|
CUDAGraph wrapper instances.
|
|
|
|
|
|
|
|
|
|
|
|
For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the
|
|
|
|
|
|
inductor `call` function in the model runner. The top-level full cudagraph
|
|
|
|
|
|
capture ignores all partitioning.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
pass_config: PassConfig = field(default_factory=PassConfig)
|
|
|
|
|
|
"""Custom inductor passes, see PassConfig for more details"""
|
|
|
|
|
|
|
2026-01-29 09:12:26 +00:00
|
|
|
|
max_cudagraph_capture_size: int = field(default=None)
|
2025-10-24 20:11:05 +08:00
|
|
|
|
"""The maximum cudagraph capture size.
|
2025-11-11 16:46:18 -08:00
|
|
|
|
|
|
|
|
|
|
If cudagraph_capture_sizes is specified, this will be set to the largest
|
2025-10-24 20:11:05 +08:00
|
|
|
|
size in that list (or checked for consistency if specified). If
|
|
|
|
|
|
cudagraph_capture_sizes is not specified, the list of sizes is generated
|
|
|
|
|
|
automatically following the pattern:
|
|
|
|
|
|
|
|
|
|
|
|
[1, 2, 4] + list(range(8, 256, 8)) + list(
|
|
|
|
|
|
range(256, max_cudagraph_capture_size + 1, 16))
|
|
|
|
|
|
|
|
|
|
|
|
If not specified, max_cudagraph_capture_size is set to min(max_num_seqs*2,
|
2025-11-11 16:46:18 -08:00
|
|
|
|
512) by default. This avoids OOM in tight memory scenarios with small
|
2025-10-24 20:11:05 +08:00
|
|
|
|
max_num_seqs, and prevents capture of many large graphs (>512) that would
|
|
|
|
|
|
greatly increase startup time with limited performance benefit.
|
|
|
|
|
|
"""
|
2025-11-24 07:12:41 -08:00
|
|
|
|
|
|
|
|
|
|
dynamic_shapes_config: DynamicShapesConfig = field(
|
|
|
|
|
|
default_factory=DynamicShapesConfig
|
|
|
|
|
|
)
|
|
|
|
|
|
"""Configuration for dynamic shapes options"""
|
|
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
local_cache_dir: str = field(default=None, init=False) # type: ignore
|
|
|
|
|
|
"""local cache dir for each rank"""
|
2025-11-24 07:12:41 -08:00
|
|
|
|
|
2026-02-08 13:42:56 -05:00
|
|
|
|
fast_moe_cold_start: bool | None = None
|
2026-02-02 19:38:49 -08:00
|
|
|
|
"""Optimization for fast MOE cold start.
|
|
|
|
|
|
|
|
|
|
|
|
This is a bit of a hack that assumes that:
|
|
|
|
|
|
1. the only decoder forward pass being run is the current model
|
|
|
|
|
|
2. the decoder forward pass runs all of the MOEs in the order in which they
|
|
|
|
|
|
are initialized
|
|
|
|
|
|
|
|
|
|
|
|
When the above two conditions hold, this option greatly decreases cold start
|
|
|
|
|
|
time for MOE models.
|
|
|
|
|
|
|
2026-02-08 13:42:56 -05:00
|
|
|
|
The options are:
|
|
|
|
|
|
- True: optimization is always on
|
|
|
|
|
|
- False: optimization is always off
|
|
|
|
|
|
- None: optimization is on usually but off for speculative decoding
|
|
|
|
|
|
|
|
|
|
|
|
If conditions 1&2 don't hold then this option will lead to silent
|
|
|
|
|
|
incorrectness.
|
|
|
|
|
|
The only condition in which this doesn't hold is speculative
|
2026-02-02 19:38:49 -08:00
|
|
|
|
decoding, where there is a draft model that may have MOEs in it.
|
|
|
|
|
|
|
|
|
|
|
|
NB: We're working on a longer-term solution that doesn't need these assumptions.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
# keep track of enabled and disabled custom ops
|
|
|
|
|
|
enabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
|
|
|
|
|
|
"""custom ops that are enabled"""
|
|
|
|
|
|
disabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
|
|
|
|
|
|
"""custom ops that are disabled"""
|
|
|
|
|
|
traced_files: set[str] = field(default_factory=set, init=False)
|
|
|
|
|
|
"""files that are traced for compilation"""
|
|
|
|
|
|
compilation_time: float = field(default=0.0, init=False)
|
|
|
|
|
|
"""time taken for compilation"""
|
|
|
|
|
|
|
|
|
|
|
|
static_forward_context: dict[str, Any] = field(default_factory=dict, init=False)
|
|
|
|
|
|
"""Per-model forward context
|
|
|
|
|
|
Map from layer name to layer objects that need to be accessed outside
|
|
|
|
|
|
model code, e.g., Attention, FusedMOE when dp_size>1."""
|
|
|
|
|
|
|
2026-01-27 18:17:54 -05:00
|
|
|
|
static_all_moe_layers: list[str] = field(default_factory=list, init=False)
|
|
|
|
|
|
"""The names of all the MOE layers in the model
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
# Attention ops; used for piecewise cudagraphs
|
2025-10-10 12:35:28 -04:00
|
|
|
|
# Use PyTorch operator format: "namespace::name"
|
2025-08-15 22:01:39 +08:00
|
|
|
|
_attention_ops: ClassVar[list[str]] = [
|
2025-10-10 12:35:28 -04:00
|
|
|
|
"vllm::unified_attention",
|
|
|
|
|
|
"vllm::unified_attention_with_output",
|
|
|
|
|
|
"vllm::unified_mla_attention",
|
|
|
|
|
|
"vllm::unified_mla_attention_with_output",
|
|
|
|
|
|
"vllm::mamba_mixer2",
|
|
|
|
|
|
"vllm::mamba_mixer",
|
|
|
|
|
|
"vllm::short_conv",
|
|
|
|
|
|
"vllm::linear_attention",
|
|
|
|
|
|
"vllm::plamo2_mamba_mixer",
|
2025-11-06 05:01:12 +04:00
|
|
|
|
"vllm::gdn_attention_core",
|
2026-03-05 11:51:06 -08:00
|
|
|
|
"vllm::olmo_hybrid_gdn_full_forward",
|
2025-10-30 21:02:27 +08:00
|
|
|
|
"vllm::kda_attention",
|
2025-10-10 12:35:28 -04:00
|
|
|
|
"vllm::sparse_attn_indexer",
|
2026-01-21 23:16:30 +08:00
|
|
|
|
"vllm::rocm_aiter_sparse_attn_indexer",
|
2025-08-15 22:01:39 +08:00
|
|
|
|
]
|
|
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
def compute_hash(self) -> str:
    """
    Return a hash identifying every config that shapes the computation
    graph between input ids/embeddings and the final hidden states.
    Anything before the input ids/embeddings or after the final hidden
    states is out of scope.

    Declared fields are included by default (opt-out); only the small
    exclude set below is skipped. Nested configs contribute the result of
    their own `compute_hash()` rather than an object identity.
    """
    # Nested configs: left out of the generic pass, re-added via their hash.
    nested_configs = ("pass_config", "dynamic_shapes_config")

    ignored_factors = {
        # Paths/dirs and runtime/metrics that don't affect compiled graph
        "debug_dump_path",
        "cache_dir",
        "local_cache_dir",
        "traced_files",
        "compilation_time",
        "static_forward_context",
        *nested_configs,
    }

    factors = get_hash_factors(self, ignored_factors)
    for nested_name in nested_configs:
        factors[nested_name] = getattr(self, nested_name).compute_hash()
    return hash_factors(factors)
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
|
|
|
|
|
def __repr__(self) -> str:
    """Render the explicitly-set fields of this config as a dict-style string.

    Runtime bookkeeping fields are always left out, and `pass_config`
    attributes that still equal their `PassConfig()` default are dropped too.
    """
    # `True` drops the whole field; a nested dict drops individual sub-keys.
    exclude = {
        "static_forward_context": True,
        "enabled_custom_ops": True,
        "disabled_custom_ops": True,
        "compilation_time": True,
        "traced_files": True,
        "inductor_compile_config": {
            "post_grad_custom_post_pass": True,
        },
    }

    # Hide pass_config attributes that were never changed from the default.
    unchanged_pass_attrs = {
        name: True
        for name, default in vars(PassConfig()).items()
        if getattr(self.pass_config, name) == default
    }
    if unchanged_pass_attrs:
        exclude["pass_config"] = unchanged_pass_attrs

    dumped = TypeAdapter(CompilationConfig).dump_python(
        self, exclude=exclude, exclude_unset=True
    )
    return str(dumped)


__str__ = __repr__
|
|
|
|
|
|
|
2025-11-11 16:46:18 -08:00
|
|
|
|
@field_validator("mode", mode="before")
@classmethod
def validate_mode_before(cls, value: Any) -> Any:
    """
    Allow the `mode` field to be given as a string mode name.

    Strings are matched case-insensitively against the `CompilationMode`
    member names (e.g. NONE, STOCK_TORCH_COMPILE, DYNAMO_TRACE_ONCE,
    VLLM_COMPILE). Any non-string value, such as an integer, is passed
    through unchanged.

    Raises:
        ValueError: If `value` is a string that names no `CompilationMode`
            member.
    """
    if not isinstance(value, str):
        return value

    members = CompilationMode.__members__
    mode_name = value.upper()
    if mode_name in members:
        return CompilationMode[mode_name]

    raise ValueError(
        f"Invalid compilation mode: {value}. "
        f"Valid modes are: {', '.join(members.keys())}"
    )
|
|
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
@field_validator("cudagraph_mode", mode="before")
@classmethod
def validate_cudagraph_mode_before(cls, value: Any) -> Any:
    """Enable parsing of the `cudagraph_mode` enum type from string.

    Strings are matched case-insensitively against the `CUDAGraphMode`
    member names. Non-string values are passed through unchanged.

    Raises:
        ValueError: If `value` is a string that names no `CUDAGraphMode`
            member.
    """
    if not isinstance(value, str):
        return value

    mode_name = value.upper()
    if mode_name not in CUDAGraphMode.__members__:
        # Fail with a descriptive ValueError, like validate_mode_before does,
        # instead of leaking the bare KeyError of the enum lookup.
        raise ValueError(
            f"Invalid cudagraph mode: {value}. "
            f"Valid modes are: {', '.join(CUDAGraphMode.__members__.keys())}"
        )
    return CUDAGraphMode[mode_name]
|
|
|
|
|
|
|
2025-11-12 16:10:28 +00:00
|
|
|
|
@field_validator("pass_config", mode="before")
@classmethod
def validate_pass_config_before(cls, value: Any) -> Any:
    """Build a `PassConfig` when `pass_config` is supplied as a dictionary."""
    # Anything that is not a dict is handed on untouched.
    return PassConfig(**value) if isinstance(value, dict) else value
|
|
|
|
|
|
|
2025-11-03 08:13:51 -08:00
|
|
|
|
@field_validator("compile_cache_save_format")
@classmethod
def validate_compile_cache_save_format(cls, value: str) -> str:
    """Accept only the 'binary' and 'unpacked' save formats.

    Raises:
        ValueError: If `value` is any other string.
    """
    allowed_formats = ("binary", "unpacked")
    if value in allowed_formats:
        return value
    raise ValueError(
        f"compile_cache_save_format must be 'binary' or 'unpacked', got: {value}"
    )
|
|
|
|
|
|
|
2025-11-27 04:55:58 -05:00
|
|
|
|
@field_validator(
    "level",
    "mode",
    "cudagraph_mode",
    "max_cudagraph_capture_size",
    "use_inductor_graph_partition",
    mode="wrap",
)
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Pass `None` through unvalidated; validate every other value normally.

    `None` marks a field whose initialisation is delayed, so the wrapped
    `handler` is only invoked for non-`None` values.
    """
    if value is not None:
        return handler(value)
    return None
|
|
|
|
|
|
|
2025-08-09 00:34:25 +01:00
|
|
|
|
def __post_init__(self) -> None:
    """Normalise and validate the config once its fields are assigned.

    In order, this:
    - checks that `custom_ops` holds at most one of 'none' / 'all',
    - defaults `enable_auto_functionalized_v2` to False in
      `inductor_compile_config` unless the key is already present,
    - copies every entry of `inductor_passes` into `inductor_compile_config`,
      resolving qualified-name strings to objects and wrapping anything
      that is not an `InductorPass` in `CallableInductorPass`,
    - adds '+rotary_embedding' to `custom_ops` when a rope fusion pass is on,
    - turns on combo kernels on torch>=2.9.0.dev for non-CPU platforms
      unless either combo-kernel key is already configured,
    - validates `use_inductor_graph_partition`, the `custom_ops` syntax and
      the backend, and fills in the platform backend when `backend` is "".

    Raises:
        AssertionError: If both/duplicate 'none'/'all' are given, or an
            `inductor_passes` value is neither a string nor callable.
        ValueError: For an unsupported `use_inductor_graph_partition`,
            malformed custom op entry, or invalid backend.
    """
    import importlib

    count_none = self.custom_ops.count("none")
    count_all = self.custom_ops.count("all")
    assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"

    # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
    # 1. A bug in PyTorch, fixed in 2.7:
    #    https://github.com/pytorch/pytorch/issues/147924
    # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't
    #    work with V2. Addressing this will take extra engineering effort
    #    and it is not yet a priority. RFC here:
    #    https://github.com/vllm-project/vllm/issues/14703
    KEY = "enable_auto_functionalized_v2"
    if KEY not in self.inductor_compile_config:
        self.inductor_compile_config[KEY] = False

    for k, v in self.inductor_passes.items():
        if isinstance(v, str):
            # Resolve the pass from its qualified name. import_module returns
            # the module named by the full dotted path, whereas
            # __import__("a.b") returns the top-level package "a", so the
            # lookup would happen in the wrong namespace.
            module_name, _, func_name = v.rpartition(".")
            func = importlib.import_module(module_name).__dict__[func_name]
        else:
            assert callable(v), f"pass {k} should be callable or a qualified name"
            func = v
        self.inductor_compile_config[k] = (
            func if isinstance(func, InductorPass) else CallableInductorPass(func)
        )

    # Both rope fusions currently need the rotary_embedding custom op.
    # TODO(zhuhaoran): support rope native forward match and remove this
    # (enable_qk_norm_rope_fusion).
    # TODO(Rohan138): support rope native forward match and remove this
    # (fuse_rope_kvcache).
    # Linked issue: https://github.com/vllm-project/vllm/issues/28042
    if (
        self.pass_config.enable_qk_norm_rope_fusion
        or self.pass_config.fuse_rope_kvcache
    ) and "+rotary_embedding" not in self.custom_ops:
        self.custom_ops.append("+rotary_embedding")

    if (
        is_torch_equal_or_newer("2.9.0.dev")
        and "combo_kernels" not in self.inductor_compile_config
        and "benchmark_combo_kernel" not in self.inductor_compile_config
        # (fixme @boyuan) combo kernel does not support cpu yet.
        and not current_platform.is_cpu()
    ):
        # use horizontal fusion, which is useful for fusing qk-norm and
        # qk-rope when query and key have different shapes.
        self.inductor_compile_config["combo_kernels"] = True
        self.inductor_compile_config["benchmark_combo_kernel"] = True

    if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
        "2.9.0.dev"
    ):
        raise ValueError(
            "use_inductor_graph_partition is only "
            "supported with torch>=2.9.0.dev. Set "
            "use_inductor_graph_partition=False instead."
        )

    for op in self.custom_ops:
        # startswith (rather than op[0]) so that an empty string is reported
        # as invalid syntax instead of raising IndexError.
        if not op.startswith(("+", "-")) and op not in {"all", "none"}:
            raise ValueError(
                f"Invalid syntax '{op}' for custom op, "
                "must be 'all', 'none', '+op' or '-op' "
                "(where 'op' is the registered op name)"
            )

    # Currently only the eager and inductor backends are supported for
    # piecewise compilation; custom backends are not. Update this check when
    # more backends are supported.
    if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
        "",
        "eager",
        "inductor",
    ]:
        raise ValueError(
            f"Invalid backend for piecewise compilation: {self.backend}"
        )

    if self.backend == "":
        self.backend = current_platform.get_compile_backend()
|
2025-10-13 18:47:16 -04:00
|
|
|
|
|
2025-08-15 22:01:39 +08:00
|
|
|
|
def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
    """
    Initialize the backend for the compilation config from a vllm config.

    Arguments:
        vllm_config: The vllm config to initialize the backend from.

    Returns:
        For STOCK_TORCH_COMPILE / DYNAMO_TRACE_ONCE: `self.backend` itself
        when it is a registered torch backend name, otherwise the object
        that its qualified name resolves to. For VLLM_COMPILE: a
        `VllmBackend` built from `vllm_config`.

    Raises:
        ValueError: If `mode` is unset or is `CompilationMode.NONE`.
    """
    mode = self.mode
    if mode is None:
        raise ValueError(
            "No compilation mode is set. This method should only be "
            "called via vllm config where the level is set if none is "
            "provided."
        )
    if mode == CompilationMode.NONE:
        raise ValueError("No compilation mode is set.")

    from torch._dynamo.backends.registry import list_backends

    torch_backends = list_backends(exclude_tags=tuple())
    stock_modes = (
        CompilationMode.STOCK_TORCH_COMPILE,
        CompilationMode.DYNAMO_TRACE_ONCE,
    )
    if mode in stock_modes:
        # Unregistered names are treated as qualified names to resolve.
        if self.backend not in torch_backends:
            return resolve_obj_by_qualname(self.backend)
        return self.backend

    assert mode == CompilationMode.VLLM_COMPILE
    if self.backend not in ["eager", "inductor"]:
        logger.info("Using OOT custom backend for compilation.")

    from vllm.compilation.backends import VllmBackend

    # TODO[@lucaskabela]: See if we can forward prefix
    # https://github.com/vllm-project/vllm/issues/27045
    return VllmBackend(vllm_config)
|
|
|
|
|
|
|
2025-10-24 20:11:05 +08:00
|
|
|
|
def post_init_cudagraph_sizes(self) -> None:
    """Complete the initialization after cudagraph related configs are set.

    This includes:
    - initialize compile_sizes: the literal entry "cudagraph_capture_sizes"
      is replaced by the contents of `cudagraph_capture_sizes`, duplicates
      are removed (first occurrence wins, order preserved), and `None`
      becomes an empty list.
    - sort `cudagraph_capture_sizes` ascending in place and check that its
      largest entry equals `max_cudagraph_capture_size`.

    Raises:
        AssertionError: If `compile_sizes` holds a string other than
            "cudagraph_capture_sizes" or a non-int entry, or if the largest
            capture size differs from `max_cudagraph_capture_size`.
    """
    computed_compile_sizes: list[int] = []
    if self.compile_sizes is not None:
        for x in self.compile_sizes:
            if isinstance(x, str):
                assert x == "cudagraph_capture_sizes", (
                    "Unrecognized size type in compile_sizes, "
                    f"expect 'cudagraph_capture_sizes', got {x}"
                )
                computed_compile_sizes.extend(self.cudagraph_capture_sizes)
            else:
                assert isinstance(x, int)
                computed_compile_sizes.append(x)

    # De-duplicate only after the expansion above: an explicit size may also
    # be a capture size. dict.fromkeys keeps first-seen order, so the result
    # is deterministic (unlike iterating a set of mixed int/str entries).
    self.compile_sizes = list(dict.fromkeys(computed_compile_sizes))  # type: ignore

    # make sure the sizes are in ascending order
    self.cudagraph_capture_sizes.sort()
    if self.cudagraph_capture_sizes:
        assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
|
2025-08-09 00:34:25 +01:00
|
|
|
|
|
2025-12-06 23:44:50 -05:00
|
|
|
|
def set_splitting_ops_for_v1(
    self, all2all_backend: str, data_parallel_size: int = 1
):
    """Settle `splitting_ops` (and, where needed, `cudagraph_mode`) for v1.

    Outside VLLM_COMPILE mode an unset `splitting_ops` becomes an empty list
    and nothing else happens. In VLLM_COMPILE mode:
    - attention+quant fusion without inductor graph partition defers to
      `set_splitting_ops_for_attn_fusion()`;
    - an unset `splitting_ops` defaults to a copy of `_attention_ops`, plus
      the kv-cache update ops when inductor graph partition is off;
    - an explicitly empty `splitting_ops` downgrades piecewise cudagraph
      modes (PIECEWISE -> NONE, FULL_AND_PIECEWISE -> FULL).
    Finally, cudagraphs are disabled for the DeepEP high-throughput
    all2all backend when `data_parallel_size > 1`.
    """
    # Kept for compatibility with OOT hardware plugin platforms (for example
    # vllm-ascend), which currently only support sequence parallelism in
    # eager mode.
    if self.mode != CompilationMode.VLLM_COMPILE:
        if self.splitting_ops is None:
            self.splitting_ops = []
        return

    # NOTE: everything below applies only when mode is
    # CompilationMode.VLLM_COMPILE
    assert self.mode == CompilationMode.VLLM_COMPILE, (
        "set_splitting_ops_for_v1 should only be called when "
        "mode is CompilationMode.VLLM_COMPILE"
    )

    if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition:
        self.set_splitting_ops_for_attn_fusion()
    elif self.splitting_ops is None:
        # NOTE: When using full cudagraph, instead of setting an empty
        # list and capturing the full cudagraph inside the flattened fx
        # graph, we keep the piecewise fx graph structure but capture
        # the full cudagraph outside the fx graph. This reduces some
        # cpu overhead when the runtime batch_size is not cudagraph
        # captured. See https://github.com/vllm-project/vllm/pull/20059
        # for details. Copy the list so the class-level one is never
        # mutated through this reference.
        self.splitting_ops = list(self._attention_ops)

        # unified_kv_cache_update has a string param that prevents Inductor
        # from reusing piecewise graphs. Remove it from the compiled graph.
        # This has the side-effect of excluding cache from cudagraphs but
        # that doesn't seem to affect performance.
        # https://github.com/vllm-project/vllm/issues/33267
        if not self.use_inductor_graph_partition:
            self.splitting_ops.append("vllm::unified_kv_cache_update")
            self.splitting_ops.append("vllm::unified_mla_kv_cache_update")
    elif len(self.splitting_ops) == 0:
        requested_mode = self.cudagraph_mode
        wants_piecewise_only = requested_mode == CUDAGraphMode.PIECEWISE
        wants_full_and_piecewise = (
            requested_mode == CUDAGraphMode.FULL_AND_PIECEWISE
        )

        if wants_piecewise_only or wants_full_and_piecewise:
            logger.warning_once(
                "Using piecewise cudagraph with empty splitting_ops"
            )

        if wants_piecewise_only:
            logger.warning_once(
                "Piecewise compilation with empty splitting_ops does not "
                "contain piecewise cudagraph. Setting cudagraph_"
                "mode to NONE. Hint: If you are using attention "
                "backends that support cudagraph, consider manually "
                "setting cudagraph_mode to FULL or FULL_DECODE_ONLY "
                "to enable full cudagraphs."
            )
            self.cudagraph_mode = CUDAGraphMode.NONE
        elif wants_full_and_piecewise:
            logger.warning_once(
                "Piecewise compilation with empty splitting_ops does "
                "not contain piecewise cudagraph. Setting "
                "cudagraph_mode to FULL."
            )
            self.cudagraph_mode = CUDAGraphMode.FULL
        self.splitting_ops = []

    # Disable CUDA graphs for DeepEP high-throughput since it's not
    # CUDA-graph compatible.
    deepep_blocks_cudagraphs = (
        all2all_backend == "deepep_high_throughput"
        and data_parallel_size > 1
        and self.cudagraph_mode != CUDAGraphMode.NONE
    )
    if deepep_blocks_cudagraphs:
        # TODO: Piecewise Cuda graph might be enabled
        # if torch compile cache key issue fixed
        # See https://github.com/vllm-project/vllm/pull/25093
        logger.info(
            "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput kernels "
            "are optimized for prefill and are incompatible with CUDA Graphs. "
            "In order to use CUDA Graphs for decode-optimized workloads, "
            "use --all2all-backend with another option, such as "
            "deepep_low_latency or allgather_reducescatter."
        )
        self.cudagraph_mode = CUDAGraphMode.NONE
|
2025-09-27 03:58:19 +08:00
|
|
|
|
|
|
|
|
|
|
def set_splitting_ops_for_attn_fusion(self):
    """Prepare `splitting_ops` for attention+quant fusion.

    Requires `pass_config.fuse_attn_quant`. An unset `splitting_ops` becomes
    an empty list and, if the cudagraph mode has piecewise cudagraphs, the
    mode is switched to FULL with a warning. Afterwards `splitting_ops`
    must not contain the attention ops.
    """
    assert self.pass_config.fuse_attn_quant

    splitting_ops_was_unset = self.splitting_ops is None
    if splitting_ops_was_unset:
        self.splitting_ops = []

    # The mode is only adjusted when splitting_ops was defaulted just above.
    if splitting_ops_was_unset and self.cudagraph_mode.has_piecewise_cudagraphs():
        logger.warning_once(
            "fuse_attn_quant is incompatible with piecewise "
            "cudagraph when use_inductor_graph_partition is off. "
            "In this case, splitting_ops will be set to empty "
            "list, and cudagraph_mode will be set to FULL. "
            "Please ensure you are using attention backends that "
            "support cudagraph or set cudagraph_mode to NONE "
            "explicitly if encountering any problems."
        )
        self.cudagraph_mode = CUDAGraphMode.FULL

    assert not self.splitting_ops_contain_attention(), (
        "attention ops should not be in splitting_ops when fuse_attn_quant is True"
    )
|
2025-08-15 22:01:39 +08:00
|
|
|
|
|
|
|
|
|
|
def splitting_ops_contain_attention(self) -> bool:
    """Return True when every op in `_attention_ops` is in `splitting_ops`.

    An unset (`None`) `splitting_ops` always yields False.
    """
    if self.splitting_ops is None:
        return False
    for attention_op in self._attention_ops:
        if attention_op not in self.splitting_ops:
            return False
    return True
|
2025-09-19 18:02:15 -07:00
|
|
|
|
|
|
|
|
|
|
def is_attention_compiled_piecewise(self) -> bool:
    """Report whether attention ends up compiled piecewise.

    Always False unless `splitting_ops` contains the attention ops. Beyond
    that, the answer depends on where the split happens:
    - without inductor graph partition (Dynamo-level FX split), the mode
      must be VLLM_COMPILE;
    - with inductor graph partition, the backend must be "inductor" and
      the mode anything but NONE.
    """
    if not self.splitting_ops_contain_attention():
        return False

    if self.use_inductor_graph_partition:
        # Inductor partition case
        return self.backend == "inductor" and self.mode != CompilationMode.NONE

    # Dynamo-level FX split case
    return self.mode == CompilationMode.VLLM_COMPILE
|
2025-09-22 15:30:05 -04:00
|
|
|
|
|
|
|
|
|
|
def custom_op_log_check(self):
    """
    Log the enabled/disabled custom ops and warn about every entry of the
    custom_ops field that names an op not used by the model.

    It is called at the end of set_current_vllm_config,
    after the custom ops have been instantiated.
    """
    enabled = self.enabled_custom_ops
    disabled = self.disabled_custom_ops

    if not enabled and not disabled:
        logger.debug("No custom ops found in model.")
        return

    logger.debug("enabled custom ops: %s", enabled)
    logger.debug("disabled custom ops: %s", disabled)

    ops_in_model = enabled | disabled
    for spec in self.custom_ops:
        if spec in {"all", "none"}:
            continue

        sign, op_name = spec[0], spec[1:]
        assert sign in {"+", "-"}, (
            "Invalid custom op syntax (should be checked during init)"
        )

        # Entries that match an op used by the model need no warning.
        if op_name in ops_in_model:
            continue

        from vllm.model_executor.custom_op import op_registry

        # Does op exist at all or is it just not present in this model?
        # Note: Only imported op classes appear in the registry.
        if op_name in op_registry:
            missing_str = "not present in model"
        else:
            missing_str = "doesn't exist (or wasn't imported/registered)"

        enable_str = "enabling" if sign == "+" else "disabling"
        logger.warning_once(
            "Op '%s' %s, %s with '%s' has no effect",
            op_name,
            missing_str,
            enable_str,
            spec,
        )
|
2025-11-17 09:41:22 -05:00
|
|
|
|
|
2025-11-27 04:55:58 -05:00
|
|
|
|
def is_custom_op_enabled(self, op: str) -> bool:
    """Tell whether ``op`` is enabled according to ``custom_ops``.

    With "all" present, an op is enabled unless "-<op>" is listed.
    Otherwise "none" must be present (asserted), and an op is enabled only
    when "+<op>" is listed.
    """
    configured = self.custom_ops
    if "all" not in configured:
        assert "none" in configured
        return f"+{op}" in configured
    return f"-{op}" not in configured
|
|
|
|
|
|
|
2025-11-17 09:41:22 -05:00
|
|
|
|
def adjust_cudagraph_sizes_for_spec_decode(
    self, uniform_decode_query_len: int, tensor_parallel_size: int
):
    """Round cudagraph capture sizes up to a common multiple.

    The multiple is ``uniform_decode_query_len``. When
    ``tensor_parallel_size > 1`` and ``pass_config.enable_sp`` is set, it
    is the larger of ``uniform_decode_query_len`` and
    ``tensor_parallel_size``, and that value must be divisible by both.

    Rounded sizes above ``max_cudagraph_capture_size`` are dropped and
    duplicates are merged. ``cudagraph_capture_sizes`` and
    ``max_cudagraph_capture_size`` are then updated in place. Nothing
    happens when there are no capture sizes or the multiple is <= 1.

    Raises:
        ValueError: if no single multiple satisfies both constraints, or
            if no capture size survives the rounding.
    """
    multiple_of = uniform_decode_query_len
    if tensor_parallel_size > 1 and self.pass_config.enable_sp:
        multiple_of = max(uniform_decode_query_len, tensor_parallel_size)
        divisible_by_both = (
            multiple_of % uniform_decode_query_len == 0
            and multiple_of % tensor_parallel_size == 0
        )
        if not divisible_by_both:
            raise ValueError(
                f"Can't determine cudagraph shapes that are both a "
                f"multiple of {uniform_decode_query_len} "
                f"(num_speculative_tokens + 1) required by spec-decode "
                f"and {tensor_parallel_size} (tensor_parallel_size) "
                f"required by sequence parallelism please adjust "
                f"num_speculative_tokens or disable sequence parallelism"
            )

    if not self.cudagraph_capture_sizes or multiple_of <= 1:
        return

    size_limit = self.max_cudagraph_capture_size
    assert size_limit is not None

    # Round each size once; the set merges sizes that round to the same value.
    candidates = {
        round_up(size, multiple_of) for size in self.cudagraph_capture_sizes
    }
    rounded_sizes = sorted(c for c in candidates if c <= size_limit)

    if not rounded_sizes:
        if multiple_of > size_limit:
            raise ValueError(
                f"No valid cudagraph sizes after rounding to multiple of {multiple_of} "
                f"(num_speculative_tokens + 1 or tp if sequence parallelism is enabled)"
                f" please adjust num_speculative_tokens ({uniform_decode_query_len - 1}"
                f") or max_cudagraph_capture_size ({self.max_cudagraph_capture_size})"
                f" or cudagraph_capture_sizes ({self.cudagraph_capture_sizes})"
            )
        # if one valid but would be round_down use that
        rounded_sizes = [multiple_of]

    self.max_cudagraph_capture_size = rounded_sizes[-1]
    self.cudagraph_capture_sizes = rounded_sizes
|
|
|
|
|
|
|
2026-03-04 18:56:22 +08:00
|
|
|
|
def adjust_cudagraph_sizes_for_mamba_cache(
    self, num_mamba_cache_blocks: int
) -> None:
    """Cap cudagraph capture sizes to available Mamba cache blocks.

    For hybrid Mamba/attention models, the Mamba conv_state and
    ssm_state tensors have their first dimension equal to num_blocks
    (from KVCacheConfig). During CUDA graph capture the decode batch
    size equals num_tokens, so capture sizes exceeding num_blocks
    would cause out-of-bounds access in Mamba kernels.

    Capture sizes larger than ``num_mamba_cache_blocks`` are dropped and
    ``max_cudagraph_capture_size`` is set to the last remaining size. If
    none remain, both fields are reset (empty list / 0). Nothing is
    changed when there are no capture sizes, when the block count is not
    positive, or when it already covers the current maximum.

    See: https://github.com/vllm-project/vllm/issues/34094
    """
    current_sizes = self.cudagraph_capture_sizes
    if not current_sizes or num_mamba_cache_blocks <= 0:
        return

    current_max = self.max_cudagraph_capture_size
    assert current_max is not None
    if num_mamba_cache_blocks >= current_max:
        return

    capped_sizes = [
        size for size in current_sizes if size <= num_mamba_cache_blocks
    ]

    if not capped_sizes:
        logger.warning(
            "No valid cudagraph capture sizes remain after capping "
            "to Mamba cache blocks (%d). The smallest capture size "
            "was %d. Disabling cudagraph capture. Consider reducing "
            "max_num_seqs or increasing available GPU memory.",
            num_mamba_cache_blocks,
            current_sizes[0],
        )
        self.cudagraph_capture_sizes = []
        self.max_cudagraph_capture_size = 0
        return

    new_max = capped_sizes[-1]
    logger.warning(
        "Capping cudagraph capture sizes from max %d to %d to fit "
        "Mamba cache blocks (%d blocks available). This limits the "
        "maximum batch size that can use CUDA graphs. To increase "
        "this limit, reduce max_num_seqs or increase available GPU "
        "memory.",
        current_max,
        new_max,
        num_mamba_cache_blocks,
    )
    self.max_cudagraph_capture_size = new_max
    self.cudagraph_capture_sizes = capped_sizes
|
|
|
|
|
|
|
2025-12-05 19:17:32 +01:00
|
|
|
|
def get_compile_ranges(self) -> list[Range]:
|
|
|
|
|
|
"""Get the compile ranges for the compilation config."""
|
|
|
|
|
|
if self.compile_ranges_split_points is None:
|
|
|
|
|
|
return []
|
|
|
|
|
|
split_points = sorted(set(self.compile_ranges_split_points))
|
|
|
|
|
|
return [
|
|
|
|
|
|
Range(start=s + 1, end=e)
|
|
|
|
|
|
for s, e in zip([0] + split_points[:-1], split_points)
|
|
|
|
|
|
]
|