Remove deprecated fields from CompilationConfig (#27593)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -206,7 +206,6 @@ class CompilationConfig:
|
||||
- [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
|
||||
- [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder]
|
||||
- CudaGraph capture:
|
||||
- [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
|
||||
- [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
|
||||
- [`cudagraph_capture_sizes`]
|
||||
[vllm.config.CompilationConfig.cudagraph_capture_sizes]
|
||||
@@ -216,7 +215,6 @@ class CompilationConfig:
|
||||
[vllm.config.CompilationConfig.cudagraph_num_of_warmups]
|
||||
- [`cudagraph_copy_inputs`]
|
||||
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
|
||||
- [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
|
||||
- Inductor compilation:
|
||||
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
|
||||
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
|
||||
@@ -396,18 +394,6 @@ class CompilationConfig:
|
||||
Warning: This flag is new and subject to change in addition
|
||||
more modes may be added.
|
||||
"""
|
||||
use_cudagraph: bool = True
|
||||
"""Whether to use cudagraph inside compilation:
|
||||
|
||||
- False: cudagraph inside compilation is not used.\n
|
||||
- True: cudagraph inside compilation is used. It requires
|
||||
that all input buffers have fixed addresses, and all
|
||||
splitting ops write their outputs to input buffers.
|
||||
|
||||
Warning: This flag is deprecated and will be removed in the next major or
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=FULL_AND
|
||||
_PIECEWISE instead.
|
||||
"""
|
||||
cudagraph_num_of_warmups: int = 0
|
||||
"""Number of warmup runs for cudagraph.
|
||||
It means the first several runs will be treated as warmup runs.
|
||||
@@ -425,15 +411,6 @@ class CompilationConfig:
|
||||
internally managed buffer. Default is False.
|
||||
Note that this flag is only effective when cudagraph_mode is PIECEWISE.
|
||||
"""
|
||||
full_cuda_graph: bool | None = False
|
||||
"""whether to use a full cuda graph for the entire forward pass rather than
|
||||
splitting certain operations such as attention into subgraphs. Thus this
|
||||
flag cannot be used together with splitting_ops. This may provide
|
||||
performance benefits for smaller models.
|
||||
Warning: This flag is deprecated and will be removed in the next major or
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=
|
||||
FULL_AND_PIECEWISE instead.
|
||||
"""
|
||||
cudagraph_specialize_lora: bool = True
|
||||
"""Whether to create separate cuda graphs for cases with and without active
|
||||
LoRA adapters. When set to False, the LoRA-enabled cuda graph will be used
|
||||
@@ -603,13 +580,19 @@ class CompilationConfig:
|
||||
@field_validator("cudagraph_mode", mode="before")
|
||||
@classmethod
|
||||
def validate_cudagraph_mode_before(cls, value: Any) -> Any:
|
||||
"""
|
||||
enable parse the `cudagraph_mode` enum type from string
|
||||
"""
|
||||
"""Enable parsing of the `cudagraph_mode` enum type from string."""
|
||||
if isinstance(value, str):
|
||||
return CUDAGraphMode[value.upper()]
|
||||
return value
|
||||
|
||||
@field_validator("pass_config", mode="before")
|
||||
@classmethod
|
||||
def validate_pass_config_before(cls, value: Any) -> Any:
|
||||
"""Enable parsing of the `pass_config` field from a dictionary."""
|
||||
if isinstance(value, dict):
|
||||
return PassConfig(**value)
|
||||
return value
|
||||
|
||||
@field_validator("compile_cache_save_format")
|
||||
@classmethod
|
||||
def validate_compile_cache_save_format(cls, value: str) -> str:
|
||||
@@ -666,9 +649,6 @@ class CompilationConfig:
|
||||
func if isinstance(func, InductorPass) else CallableInductorPass(func)
|
||||
)
|
||||
|
||||
if isinstance(self.pass_config, dict):
|
||||
self.pass_config = PassConfig(**self.pass_config)
|
||||
|
||||
if self.pass_config.enable_qk_norm_rope_fusion:
|
||||
# TODO(zhuhaoran): support rope native forward match and remove this.
|
||||
# Linked issue: https://github.com/vllm-project/vllm/issues/28042
|
||||
@@ -684,36 +664,6 @@ class CompilationConfig:
|
||||
self.inductor_compile_config["combo_kernels"] = True
|
||||
self.inductor_compile_config["benchmark_combo_kernel"] = True
|
||||
|
||||
# migrate the deprecated flags
|
||||
if not self.use_cudagraph:
|
||||
logger.warning(
|
||||
"use_cudagraph is deprecated, use cudagraph_mode=NONE instead."
|
||||
)
|
||||
if (
|
||||
self.cudagraph_mode is not None
|
||||
and self.cudagraph_mode != CUDAGraphMode.NONE
|
||||
):
|
||||
raise ValueError(
|
||||
"use_cudagraph and cudagraph_mode are mutually"
|
||||
" exclusive, prefer cudagraph_mode since "
|
||||
"use_cudagraph is deprecated."
|
||||
)
|
||||
self.cudagraph_mode = CUDAGraphMode.NONE
|
||||
if self.full_cuda_graph:
|
||||
logger.warning(
|
||||
"full_cuda_graph is deprecated, use cudagraph_mode=FULL instead."
|
||||
)
|
||||
if (
|
||||
self.cudagraph_mode is not None
|
||||
and not self.cudagraph_mode.has_full_cudagraphs()
|
||||
):
|
||||
raise ValueError(
|
||||
"full_cuda_graph and cudagraph_mode are "
|
||||
"mutually exclusive, prefer cudagraph_mode "
|
||||
"since full_cuda_graph is deprecated."
|
||||
)
|
||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||
|
||||
if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
|
||||
"2.9.0.dev"
|
||||
):
|
||||
@@ -891,20 +841,19 @@ class CompilationConfig:
|
||||
|
||||
def set_splitting_ops_for_attn_fusion(self):
|
||||
assert self.pass_config.enable_attn_fusion
|
||||
# For dynamo-partition (non-inductor) attention fusion,
|
||||
# set splitting_ops to empty to avoid splitting at attention ops
|
||||
self.splitting_ops = []
|
||||
if self.cudagraph_mode.has_piecewise_cudagraphs():
|
||||
logger.warning_once(
|
||||
"enable_attn_fusion is incompatible with piecewise "
|
||||
"cudagraph when use_inductor_graph_partition is off. "
|
||||
"In this case, splitting_ops will be set to empty "
|
||||
"list, and cudagraph_mode will be set to FULL. "
|
||||
"Please ensure you are using attention backends that "
|
||||
"support cudagraph or set cudagraph_mode to NONE "
|
||||
"explicitly if encountering any problems."
|
||||
)
|
||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||
if self.splitting_ops is None:
|
||||
self.splitting_ops = []
|
||||
if self.cudagraph_mode.has_piecewise_cudagraphs():
|
||||
logger.warning_once(
|
||||
"enable_attn_fusion is incompatible with piecewise "
|
||||
"cudagraph when use_inductor_graph_partition is off. "
|
||||
"In this case, splitting_ops will be set to empty "
|
||||
"list, and cudagraph_mode will be set to FULL. "
|
||||
"Please ensure you are using attention backends that "
|
||||
"support cudagraph or set cudagraph_mode to NONE "
|
||||
"explicitly if encountering any problems."
|
||||
)
|
||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||
|
||||
assert not self.splitting_ops_contain_attention(), (
|
||||
"attention ops should not be in splitting_ops "
|
||||
|
||||
Reference in New Issue
Block a user