[MISC] cudagraph_capture_sizes related improvements (#26016)
Signed-off-by: fhl <2410591650@qq.com> Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -154,6 +154,8 @@ class CompilationConfig:
|
||||
- [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
|
||||
- [`cudagraph_capture_sizes`]
|
||||
[vllm.config.CompilationConfig.cudagraph_capture_sizes]
|
||||
- [`max_cudagraph_capture_size`]
|
||||
[vllm.config.CompilationConfig.max_cudagraph_capture_size]
|
||||
- [`cudagraph_num_of_warmups`]
|
||||
[vllm.config.CompilationConfig.cudagraph_num_of_warmups]
|
||||
- [`cudagraph_copy_inputs`]
|
||||
@@ -327,18 +329,16 @@ class CompilationConfig:
|
||||
more modes may be added.
|
||||
"""
|
||||
use_cudagraph: bool = True
|
||||
"""Whether to use cudagraph inside compilation.
|
||||
- False: cudagraph inside compilation is not used.
|
||||
"""Whether to use cudagraph inside compilation:
|
||||
|
||||
- False: cudagraph inside compilation is not used.\n
|
||||
- True: cudagraph inside compilation is used. It requires
|
||||
that all input buffers have fixed addresses, and all
|
||||
splitting ops write their outputs to input buffers.
|
||||
In the vLLM V1 Engine, this flag only applies for
|
||||
CompilationMode.VLLM_COMPILE (aka -O3).
|
||||
Note that this is orthogonal to the cudagraph capture logic
|
||||
outside of compilation.
|
||||
|
||||
Warning: This flag is deprecated and will be removed in the next major or
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=PIECEWISE
|
||||
instead.
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use
|
||||
cudagraph_mode=FULL_AND_PIECEWISE instead.
|
||||
"""
|
||||
cudagraph_num_of_warmups: int = 0
|
||||
"""Number of warmup runs for cudagraph.
|
||||
@@ -398,8 +398,22 @@ class CompilationConfig:
|
||||
pass_config: PassConfig = field(default_factory=PassConfig)
|
||||
"""Custom inductor passes, see PassConfig for more details"""
|
||||
|
||||
max_capture_size: int = field(default=None, init=False) # type: ignore
|
||||
"""not configurable, computed after init"""
|
||||
max_cudagraph_capture_size: int | None = field(default=None)
|
||||
"""The maximum cudagraph capture size.
|
||||
|
||||
If cudagraph_capture_sizes is specified, this will be set to the largest
|
||||
size in that list (or checked for consistency if specified). If
|
||||
cudagraph_capture_sizes is not specified, the list of sizes is generated
|
||||
automatically following the pattern:
|
||||
|
||||
[1, 2, 4] + list(range(8, 256, 8)) + list(
|
||||
range(256, max_cudagraph_capture_size + 1, 16))
|
||||
|
||||
If not specified, max_cudagraph_capture_size is set to min(max_num_seqs*2,
|
||||
512) by default. This avoids OOM in tight memory scenarios with small
|
||||
max_num_seqs, and prevents capture of many large graphs (>512) that would
|
||||
greatly increase startup time with limited performance benefit.
|
||||
"""
|
||||
local_cache_dir: str = field(default=None, init=False) # type: ignore
|
||||
"""local cache dir for each rank"""
|
||||
bs_to_padded_graph_size: list[int] = field(
|
||||
@@ -408,7 +422,7 @@ class CompilationConfig:
|
||||
)
|
||||
"""optimization:
|
||||
Intuitively, bs_to_padded_graph_size should be dict[int, int], but
|
||||
since we know all keys are in a range [0, max_capture_size],
|
||||
since we know all keys are in a range [0, max_cudagraph_capture_size],
|
||||
we can optimize it to list[int] for better lookup performance."""
|
||||
|
||||
# keep track of enabled and disabled custom ops
|
||||
@@ -672,25 +686,12 @@ class CompilationConfig:
|
||||
|
||||
return VllmBackend(vllm_config)
|
||||
|
||||
def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None:
|
||||
"""To complete the initialization of config,
|
||||
we need to know the cudagraph sizes."""
|
||||
|
||||
if self.cudagraph_capture_sizes is None:
|
||||
self.cudagraph_capture_sizes = cudagraph_capture_sizes
|
||||
else:
|
||||
# de-duplicate the sizes provided by the config
|
||||
dedup_sizes = list(set(self.cudagraph_capture_sizes))
|
||||
if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
|
||||
logger.info(
|
||||
(
|
||||
"cudagraph sizes specified by model runner"
|
||||
" %s is overridden by config %s"
|
||||
),
|
||||
cudagraph_capture_sizes,
|
||||
dedup_sizes,
|
||||
)
|
||||
self.cudagraph_capture_sizes = dedup_sizes
|
||||
def post_init_cudagraph_sizes(self) -> None:
|
||||
"""To complete the initialization after cudagraph related
|
||||
configs are set. This includes:
|
||||
- initialize compile_sizes
|
||||
- pre-compute the mapping bs_to_padded_graph_size
|
||||
"""
|
||||
|
||||
computed_compile_sizes = []
|
||||
if self.compile_sizes is not None:
|
||||
@@ -708,23 +709,24 @@ class CompilationConfig:
|
||||
computed_compile_sizes.append(x)
|
||||
self.compile_sizes = computed_compile_sizes # type: ignore
|
||||
|
||||
# sort to make sure cudagraph capture sizes are in descending order
|
||||
self.cudagraph_capture_sizes.sort(reverse=True)
|
||||
self.max_capture_size = (
|
||||
self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
|
||||
)
|
||||
# make sure the sizes are in ascending order
|
||||
self.cudagraph_capture_sizes.sort()
|
||||
if self.cudagraph_capture_sizes:
|
||||
assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
|
||||
|
||||
# pre-compute the mapping from batch size to padded graph size
|
||||
self.bs_to_padded_graph_size = [0 for i in range(self.max_capture_size + 1)]
|
||||
self.bs_to_padded_graph_size = [
|
||||
0 for i in range(self.max_cudagraph_capture_size + 1)
|
||||
]
|
||||
for end, start in zip(
|
||||
self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]
|
||||
self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1],
|
||||
[0] + self.cudagraph_capture_sizes,
|
||||
):
|
||||
for bs in range(start, end):
|
||||
if bs == start:
|
||||
self.bs_to_padded_graph_size[bs] = start
|
||||
else:
|
||||
self.bs_to_padded_graph_size[bs] = end
|
||||
self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size
|
||||
|
||||
def set_splitting_ops_for_v1(self):
|
||||
# NOTE: this function needs to be called only when mode is
|
||||
|
||||
Reference in New Issue
Block a user