[MISC] cudagraph_capture_sizes related improvements (#26016)

Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
fhl2000
2025-10-24 20:11:05 +08:00
committed by GitHub
parent 435be10db9
commit 284cc92275
14 changed files with 303 additions and 110 deletions

View File

@@ -154,6 +154,8 @@ class CompilationConfig:
- [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
- [`cudagraph_capture_sizes`]
[vllm.config.CompilationConfig.cudagraph_capture_sizes]
- [`max_cudagraph_capture_size`]
[vllm.config.CompilationConfig.max_cudagraph_capture_size]
- [`cudagraph_num_of_warmups`]
[vllm.config.CompilationConfig.cudagraph_num_of_warmups]
- [`cudagraph_copy_inputs`]
@@ -327,18 +329,16 @@ class CompilationConfig:
more modes may be added.
"""
use_cudagraph: bool = True
"""Whether to use cudagraph inside compilation.
- False: cudagraph inside compilation is not used.
"""Whether to use cudagraph inside compilation:
- False: cudagraph inside compilation is not used.\n
- True: cudagraph inside compilation is used. It requires
that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
In the vLLM V1 Engine, this flag only applies for
CompilationMode.VLLM_COMPILE (aka -O3).
Note that this is orthogonal to the cudagraph capture logic
outside of compilation.
Warning: This flag is deprecated and will be removed in the next major or
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=PIECEWISE
instead.
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=FULL_AND
_PIECEWISE instead.
"""
cudagraph_num_of_warmups: int = 0
"""Number of warmup runs for cudagraph.
@@ -398,8 +398,22 @@ class CompilationConfig:
pass_config: PassConfig = field(default_factory=PassConfig)
"""Custom inductor passes, see PassConfig for more details"""
max_capture_size: int = field(default=None, init=False) # type: ignore
"""not configurable, computed after init"""
max_cudagraph_capture_size: int | None = field(default=None)
"""The maximum cudagraph capture size.
If cudagraph_capture_sizes is specified, this will be set to the largest
size in that list (or checked for consistency if specified). If
cudagraph_capture_sizes is not specified, the list of sizes is generated
automatically following the pattern:
[1, 2, 4] + list(range(8, 256, 8)) + list(
range(256, max_cudagraph_capture_size + 1, 16))
If not specified, max_cudagraph_capture_size is set to min(max_num_seqs*2,
512) by default. This voids OOM in tight memory scenarios with small
max_num_seqs, and prevents capture of many large graphs (>512) that would
greatly increase startup time with limited performance benefit.
"""
local_cache_dir: str = field(default=None, init=False) # type: ignore
"""local cache dir for each rank"""
bs_to_padded_graph_size: list[int] = field(
@@ -408,7 +422,7 @@ class CompilationConfig:
)
"""optimization:
Intuitively, bs_to_padded_graph_size should be dict[int, int].
since we know all keys are in a range [0, max_capture_size],
since we know all keys are in a range [0, max_cudagraph_capture_size],
we can optimize it to list[int] for better lookup performance."""
# keep track of enabled and disabled custom ops
@@ -672,25 +686,12 @@ class CompilationConfig:
return VllmBackend(vllm_config)
def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None:
"""To complete the initialization of config,
we need to know the cudagraph sizes."""
if self.cudagraph_capture_sizes is None:
self.cudagraph_capture_sizes = cudagraph_capture_sizes
else:
# de-duplicate the sizes provided by the config
dedup_sizes = list(set(self.cudagraph_capture_sizes))
if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
logger.info(
(
"cudagraph sizes specified by model runner"
" %s is overridden by config %s"
),
cudagraph_capture_sizes,
dedup_sizes,
)
self.cudagraph_capture_sizes = dedup_sizes
def post_init_cudagraph_sizes(self) -> None:
"""To complete the initialization after cudagraph related
configs are set. This includes:
- initialize compile_sizes
- pre-compute the mapping bs_to_padded_graph_size
"""
computed_compile_sizes = []
if self.compile_sizes is not None:
@@ -708,23 +709,24 @@ class CompilationConfig:
computed_compile_sizes.append(x)
self.compile_sizes = computed_compile_sizes # type: ignore
# sort to make sure cudagraph capture sizes are in descending order
self.cudagraph_capture_sizes.sort(reverse=True)
self.max_capture_size = (
self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
)
# make sure the sizes are in ascending order
self.cudagraph_capture_sizes.sort()
if self.cudagraph_capture_sizes:
assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
# pre-compute the mapping from batch size to padded graph size
self.bs_to_padded_graph_size = [0 for i in range(self.max_capture_size + 1)]
self.bs_to_padded_graph_size = [
0 for i in range(self.max_cudagraph_capture_size + 1)
]
for end, start in zip(
self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]
self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1],
[0] + self.cudagraph_capture_sizes,
):
for bs in range(start, end):
if bs == start:
self.bs_to_padded_graph_size[bs] = start
else:
self.bs_to_padded_graph_size[bs] = end
self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size
def set_splitting_ops_for_v1(self):
# NOTE: this function needs to be called only when mode is