Remove deprecated fields from CompilationConfig (#27593)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-11-12 16:10:28 +00:00
parent 728a9eb70e
commit a742134cc5
13 changed files with 122 additions and 164 deletions
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -206,7 +206,6 @@ class CompilationConfig:
        - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
        - [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder]
    - CudaGraph capture:
-        - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
        - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
        - [`cudagraph_capture_sizes`]
        [vllm.config.CompilationConfig.cudagraph_capture_sizes]
@@ -216,7 +215,6 @@ class CompilationConfig:
        [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
        - [`cudagraph_copy_inputs`]
        [vllm.config.CompilationConfig.cudagraph_copy_inputs]
-        - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
    - Inductor compilation:
        - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
        - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
@@ -396,18 +394,6 @@ class CompilationConfig:
    Warning: This flag is new and subject to change in addition
    more modes may be added.
    """
-    use_cudagraph: bool = True
-    """Whether to use cudagraph inside compilation:
-
-    - False: cudagraph inside compilation is not used.\n
-    - True: cudagraph inside compilation is used. It requires
-        that all input buffers have fixed addresses, and all
-        splitting ops write their outputs to input buffers.
-
-    Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=FULL_AND
-    _PIECEWISE instead.
-    """
    cudagraph_num_of_warmups: int = 0
    """Number of warmup runs for cudagraph.
    It means the first several runs will be treated as warmup runs.
@@ -425,15 +411,6 @@ class CompilationConfig:
    internally managed buffer. Default is False.
    Note that this flag is only effective when cudagraph_mode is PIECEWISE.
    """
-    full_cuda_graph: bool | None = False
-    """whether to use a full cuda graph for the entire forward pass rather than
-    splitting certain operations such as attention into subgraphs. Thus this
-    flag cannot be used together with splitting_ops. This may provide
-    performance benefits for smaller models.
-    Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=
-    FULL_AND_PIECEWISE instead.
-    """
    cudagraph_specialize_lora: bool = True
    """Whether to create separate cuda graphs for cases with and without active
    LoRA adapters. When set to False, the LoRA-enabled cuda graph will be used
@@ -603,13 +580,19 @@ class CompilationConfig:
    @field_validator("cudagraph_mode", mode="before")
    @classmethod
    def validate_cudagraph_mode_before(cls, value: Any) -> Any:
-        """
-        enable parse the `cudagraph_mode` enum type from string
-        """
+        """Enable parsing of the `cudagraph_mode` enum type from string."""
        if isinstance(value, str):
            return CUDAGraphMode[value.upper()]
        return value

+    @field_validator("pass_config", mode="before")
+    @classmethod
+    def validate_pass_config_before(cls, value: Any) -> Any:
+        """Enable parsing of the `pass_config` field from a dictionary."""
+        if isinstance(value, dict):
+            return PassConfig(**value)
+        return value
+
    @field_validator("compile_cache_save_format")
    @classmethod
    def validate_compile_cache_save_format(cls, value: str) -> str:
@@ -666,9 +649,6 @@ class CompilationConfig:
                func if isinstance(func, InductorPass) else CallableInductorPass(func)
            )

-        if isinstance(self.pass_config, dict):
-            self.pass_config = PassConfig(**self.pass_config)
-
        if self.pass_config.enable_qk_norm_rope_fusion:
            # TODO(zhuhaoran): support rope native forward match and remove this.
            # Linked issue: https://github.com/vllm-project/vllm/issues/28042
@@ -684,36 +664,6 @@ class CompilationConfig:
            self.inductor_compile_config["combo_kernels"] = True
            self.inductor_compile_config["benchmark_combo_kernel"] = True

-        # migrate the deprecated flags
-        if not self.use_cudagraph:
-            logger.warning(
-                "use_cudagraph is deprecated, use cudagraph_mode=NONE instead."
-            )
-            if (
-                self.cudagraph_mode is not None
-                and self.cudagraph_mode != CUDAGraphMode.NONE
-            ):
-                raise ValueError(
-                    "use_cudagraph and cudagraph_mode are mutually"
-                    " exclusive, prefer cudagraph_mode since "
-                    "use_cudagraph is deprecated."
-                )
-            self.cudagraph_mode = CUDAGraphMode.NONE
-        if self.full_cuda_graph:
-            logger.warning(
-                "full_cuda_graph is deprecated, use cudagraph_mode=FULL instead."
-            )
-            if (
-                self.cudagraph_mode is not None
-                and not self.cudagraph_mode.has_full_cudagraphs()
-            ):
-                raise ValueError(
-                    "full_cuda_graph and cudagraph_mode are "
-                    "mutually exclusive, prefer cudagraph_mode "
-                    "since full_cuda_graph is deprecated."
-                )
-            self.cudagraph_mode = CUDAGraphMode.FULL
-
        if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
            "2.9.0.dev"
        ):
@@ -891,20 +841,19 @@ class CompilationConfig:

    def set_splitting_ops_for_attn_fusion(self):
        assert self.pass_config.enable_attn_fusion
-        # For dynamo-partition (non-inductor) attention fusion,
-        # set splitting_ops to empty to avoid splitting at attention ops
-        self.splitting_ops = []
-        if self.cudagraph_mode.has_piecewise_cudagraphs():
-            logger.warning_once(
-                "enable_attn_fusion is incompatible with piecewise "
-                "cudagraph when use_inductor_graph_partition is off. "
-                "In this case, splitting_ops will be set to empty "
-                "list, and cudagraph_mode will be set to FULL. "
-                "Please ensure you are using attention backends that "
-                "support cudagraph or set cudagraph_mode to NONE "
-                "explicitly if encountering any problems."
-            )
-            self.cudagraph_mode = CUDAGraphMode.FULL
+        if self.splitting_ops is None:
+            self.splitting_ops = []
+            if self.cudagraph_mode.has_piecewise_cudagraphs():
+                logger.warning_once(
+                    "enable_attn_fusion is incompatible with piecewise "
+                    "cudagraph when use_inductor_graph_partition is off. "
+                    "In this case, splitting_ops will be set to empty "
+                    "list, and cudagraph_mode will be set to FULL. "
+                    "Please ensure you are using attention backends that "
+                    "support cudagraph or set cudagraph_mode to NONE "
+                    "explicitly if encountering any problems."
+                )
+                self.cudagraph_mode = CUDAGraphMode.FULL

        assert not self.splitting_ops_contain_attention(), (
            "attention ops should not be in splitting_ops "