[MISC] cudagraph_capture_sizes related improvements (#26016)

Signed-off-by: fhl <2410591650@qq.com> Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-24 20:11:05 +08:00
parent 435be10db9
commit 284cc92275
14 changed files with 303 additions and 110 deletions
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -154,6 +154,8 @@ class CompilationConfig:
        - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
        - [`cudagraph_capture_sizes`]
        [vllm.config.CompilationConfig.cudagraph_capture_sizes]
+        - [`max_cudagraph_capture_size`]
+        [vllm.config.CompilationConfig.max_cudagraph_capture_size]
        - [`cudagraph_num_of_warmups`]
        [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
        - [`cudagraph_copy_inputs`]
@@ -327,18 +329,16 @@ class CompilationConfig:
    more modes may be added.
    """
    use_cudagraph: bool = True
-    """Whether to use cudagraph inside compilation.
-    - False: cudagraph inside compilation is not used.
+    """Whether to use cudagraph inside compilation:
+
+    - False: cudagraph inside compilation is not used.\n
    - True: cudagraph inside compilation is used. It requires
        that all input buffers have fixed addresses, and all
        splitting ops write their outputs to input buffers.
-    In the vLLM V1 Engine, this flag only applies for
-    CompilationMode.VLLM_COMPILE (aka -O3).
-    Note that this is orthogonal to the cudagraph capture logic
-    outside of compilation.
+
    Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=PIECEWISE
-    instead.
+    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=FULL_AND
+    _PIECEWISE instead.
    """
    cudagraph_num_of_warmups: int = 0
    """Number of warmup runs for cudagraph.
@@ -398,8 +398,22 @@ class CompilationConfig:
    pass_config: PassConfig = field(default_factory=PassConfig)
    """Custom inductor passes, see PassConfig for more details"""

-    max_capture_size: int = field(default=None, init=False)  # type: ignore
-    """not configurable, computed after init"""
+    max_cudagraph_capture_size: int | None = field(default=None)
+    """The maximum cudagraph capture size.
+    
+    If cudagraph_capture_sizes is specified, this will be set to the largest 
+    size in that list (or checked for consistency if specified). If
+    cudagraph_capture_sizes is not specified, the list of sizes is generated
+    automatically following the pattern:
+
+        [1, 2, 4] + list(range(8, 256, 8)) + list(
+        range(256, max_cudagraph_capture_size + 1, 16))
+
+    If not specified, max_cudagraph_capture_size is set to min(max_num_seqs*2,
+    512) by default. This voids OOM in tight memory scenarios with small 
+    max_num_seqs, and prevents capture of many large graphs (>512) that would
+    greatly increase startup time with limited performance benefit.
+    """
    local_cache_dir: str = field(default=None, init=False)  # type: ignore
    """local cache dir for each rank"""
    bs_to_padded_graph_size: list[int] = field(
@@ -408,7 +422,7 @@ class CompilationConfig:
    )
    """optimization:
    Intuitively, bs_to_padded_graph_size should be dict[int, int].
-    since we know all keys are in a range [0, max_capture_size],
+    since we know all keys are in a range [0, max_cudagraph_capture_size],
    we can optimize it to list[int] for better lookup performance."""

    # keep track of enabled and disabled custom ops
@@ -672,25 +686,12 @@ class CompilationConfig:

        return VllmBackend(vllm_config)

-    def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None:
-        """To complete the initialization of config,
-        we need to know the cudagraph sizes."""
-
-        if self.cudagraph_capture_sizes is None:
-            self.cudagraph_capture_sizes = cudagraph_capture_sizes
-        else:
-            # de-duplicate the sizes provided by the config
-            dedup_sizes = list(set(self.cudagraph_capture_sizes))
-            if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
-                logger.info(
-                    (
-                        "cudagraph sizes specified by model runner"
-                        " %s is overridden by config %s"
-                    ),
-                    cudagraph_capture_sizes,
-                    dedup_sizes,
-                )
-            self.cudagraph_capture_sizes = dedup_sizes
+    def post_init_cudagraph_sizes(self) -> None:
+        """To complete the initialization after cudagraph related
+        configs are set. This includes:
+        - initialize compile_sizes
+        - pre-compute the mapping bs_to_padded_graph_size
+        """

        computed_compile_sizes = []
        if self.compile_sizes is not None:
@@ -708,23 +709,24 @@ class CompilationConfig:
                    computed_compile_sizes.append(x)
        self.compile_sizes = computed_compile_sizes  # type: ignore

-        # sort to make sure cudagraph capture sizes are in descending order
-        self.cudagraph_capture_sizes.sort(reverse=True)
-        self.max_capture_size = (
-            self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
-        )
+        # make sure the sizes are in ascending order
+        self.cudagraph_capture_sizes.sort()
+        if self.cudagraph_capture_sizes:
+            assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size

        # pre-compute the mapping from batch size to padded graph size
-        self.bs_to_padded_graph_size = [0 for i in range(self.max_capture_size + 1)]
+        self.bs_to_padded_graph_size = [
+            0 for i in range(self.max_cudagraph_capture_size + 1)
+        ]
        for end, start in zip(
-            self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]
+            self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1],
+            [0] + self.cudagraph_capture_sizes,
        ):
            for bs in range(start, end):
                if bs == start:
                    self.bs_to_padded_graph_size[bs] = start
                else:
                    self.bs_to_padded_graph_size[bs] = end
-        self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size

    def set_splitting_ops_for_v1(self):
        # NOTE: this function needs to be called only when mode is