Remove deprecated fields from CompilationConfig (#27593)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-11-12 16:10:28 +00:00
committed by GitHub
parent 728a9eb70e
commit a742134cc5
13 changed files with 122 additions and 164 deletions

View File

@@ -206,7 +206,6 @@ class CompilationConfig:
- [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
- [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder]
- CudaGraph capture:
- [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
- [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
- [`cudagraph_capture_sizes`]
[vllm.config.CompilationConfig.cudagraph_capture_sizes]
@@ -216,7 +215,6 @@ class CompilationConfig:
[vllm.config.CompilationConfig.cudagraph_num_of_warmups]
- [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
- [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
- Inductor compilation:
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
@@ -396,18 +394,6 @@ class CompilationConfig:
Warning: This flag is new and subject to change; in addition,
more modes may be added.
"""
use_cudagraph: bool = True
"""Whether to use cudagraph inside compilation:
- False: cudagraph inside compilation is not used.\n
- True: cudagraph inside compilation is used. It requires
that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
Warning: This flag is deprecated and will be removed in the next major or
minor release, i.e. v0.11.0 or v1.0.0. Please use
cudagraph_mode=FULL_AND_PIECEWISE instead.
"""
cudagraph_num_of_warmups: int = 0
"""Number of warmup runs for cudagraph.
It means the first several runs will be treated as warmup runs.
@@ -425,15 +411,6 @@ class CompilationConfig:
internally managed buffer. Default is False.
Note that this flag is only effective when cudagraph_mode is PIECEWISE.
"""
full_cuda_graph: bool | None = False
"""whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs. Thus this
flag cannot be used together with splitting_ops. This may provide
performance benefits for smaller models.
Warning: This flag is deprecated and will be removed in the next major or
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=
FULL_AND_PIECEWISE instead.
"""
cudagraph_specialize_lora: bool = True
"""Whether to create separate cuda graphs for cases with and without active
LoRA adapters. When set to False, the LoRA-enabled cuda graph will be used
@@ -603,13 +580,19 @@ class CompilationConfig:
@field_validator("cudagraph_mode", mode="before")
@classmethod
def validate_cudagraph_mode_before(cls, value: Any) -> Any:
    """Enable parsing of the `cudagraph_mode` enum type from string.

    A string input is upper-cased and looked up by member name in
    `CUDAGraphMode`, so the lookup is case-insensitive. Any non-string
    input is returned unchanged.

    Args:
        value: The raw value supplied for `cudagraph_mode`.

    Returns:
        The `CUDAGraphMode` member named by `value` when `value` is a
        string; otherwise `value` itself.

    Note:
        An unknown name presumably surfaces as the `KeyError` raised by
        the enum's by-name lookup -- confirm against `CUDAGraphMode`.
    """
    if isinstance(value, str):
        return CUDAGraphMode[value.upper()]
    return value
@field_validator("pass_config", mode="before")
@classmethod
def validate_pass_config_before(cls, value: Any) -> Any:
    """Build a `PassConfig` from a dictionary given for `pass_config`.

    Args:
        value: The raw value supplied for `pass_config`.

    Returns:
        `PassConfig(**value)` when `value` is a dict; otherwise `value`
        itself, untouched.
    """
    # Guard clause: anything that is not a dict is passed straight through.
    if not isinstance(value, dict):
        return value
    return PassConfig(**value)
@field_validator("compile_cache_save_format")
@classmethod
def validate_compile_cache_save_format(cls, value: str) -> str:
@@ -666,9 +649,6 @@ class CompilationConfig:
func if isinstance(func, InductorPass) else CallableInductorPass(func)
)
if isinstance(self.pass_config, dict):
self.pass_config = PassConfig(**self.pass_config)
if self.pass_config.enable_qk_norm_rope_fusion:
# TODO(zhuhaoran): support rope native forward match and remove this.
# Linked issue: https://github.com/vllm-project/vllm/issues/28042
@@ -684,36 +664,6 @@ class CompilationConfig:
self.inductor_compile_config["combo_kernels"] = True
self.inductor_compile_config["benchmark_combo_kernel"] = True
# migrate the deprecated flags
if not self.use_cudagraph:
logger.warning(
"use_cudagraph is deprecated, use cudagraph_mode=NONE instead."
)
if (
self.cudagraph_mode is not None
and self.cudagraph_mode != CUDAGraphMode.NONE
):
raise ValueError(
"use_cudagraph and cudagraph_mode are mutually"
" exclusive, prefer cudagraph_mode since "
"use_cudagraph is deprecated."
)
self.cudagraph_mode = CUDAGraphMode.NONE
if self.full_cuda_graph:
logger.warning(
"full_cuda_graph is deprecated, use cudagraph_mode=FULL instead."
)
if (
self.cudagraph_mode is not None
and not self.cudagraph_mode.has_full_cudagraphs()
):
raise ValueError(
"full_cuda_graph and cudagraph_mode are "
"mutually exclusive, prefer cudagraph_mode "
"since full_cuda_graph is deprecated."
)
self.cudagraph_mode = CUDAGraphMode.FULL
if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
"2.9.0.dev"
):
@@ -891,20 +841,19 @@ class CompilationConfig:
def set_splitting_ops_for_attn_fusion(self):
assert self.pass_config.enable_attn_fusion
# For dynamo-partition (non-inductor) attention fusion,
# set splitting_ops to empty to avoid splitting at attention ops
self.splitting_ops = []
if self.cudagraph_mode.has_piecewise_cudagraphs():
logger.warning_once(
"enable_attn_fusion is incompatible with piecewise "
"cudagraph when use_inductor_graph_partition is off. "
"In this case, splitting_ops will be set to empty "
"list, and cudagraph_mode will be set to FULL. "
"Please ensure you are using attention backends that "
"support cudagraph or set cudagraph_mode to NONE "
"explicitly if encountering any problems."
)
self.cudagraph_mode = CUDAGraphMode.FULL
if self.splitting_ops is None:
self.splitting_ops = []
if self.cudagraph_mode.has_piecewise_cudagraphs():
logger.warning_once(
"enable_attn_fusion is incompatible with piecewise "
"cudagraph when use_inductor_graph_partition is off. "
"In this case, splitting_ops will be set to empty "
"list, and cudagraph_mode will be set to FULL. "
"Please ensure you are using attention backends that "
"support cudagraph or set cudagraph_mode to NONE "
"explicitly if encountering any problems."
)
self.cudagraph_mode = CUDAGraphMode.FULL
assert not self.splitting_ops_contain_attention(), (
"attention ops should not be in splitting_ops "