[Compile] Conditional compilation. Introduce compile_ranges (#24252)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Signed-off-by: Luka Govedič <luka.govedic@gmail.com>
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
@@ -15,6 +15,7 @@ import torch.nn as nn
 import vllm.envs as envs
 from vllm.config import CUDAGraphMode, VllmConfig
+from vllm.config.compilation import CompilationMode
 from vllm.distributed import (
     ensure_model_parallel_initialized,
     init_distributed_environment,
@@ -407,15 +408,31 @@ class Worker(WorkerBase):
         self.model_runner.initialize_kv_cache(kv_cache_config)
 
     def compile_or_warm_up_model(self) -> None:
-        # warm up sizes that are not in cudagraph capture sizes,
-        # but users still want to compile for better performance,
-        # e.g. for the max-num-batched token size in chunked prefill.
-        compile_sizes = self.vllm_config.compilation_config.compile_sizes
-        warmup_sizes = compile_sizes.copy() if compile_sizes is not None else []
-        if not self.model_config.enforce_eager:
-            capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes
-            if capture_sizes is not None:
-                warmup_sizes = [x for x in warmup_sizes if x not in capture_sizes]
+        warmup_sizes = []
+
+        if self.vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+            # warm up sizes that are not in cudagraph capture sizes,
+            # but users still want to compile for better performance,
+            # e.g. for the max-num-batched token size in chunked prefill.
+            compile_sizes = self.vllm_config.compilation_config.compile_sizes
+            warmup_sizes = compile_sizes.copy() if compile_sizes is not None else []
+            cg_capture_sizes: list[int] = []
+
+            if self.vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
+                cg_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes
+                cg_capture_sizes = [] if cg_sizes is None else cg_sizes
+                warmup_sizes = [x for x in warmup_sizes if x not in cg_capture_sizes]
+
+            compile_ranges = self.vllm_config.compilation_config.get_compile_ranges()
+            # For each compile_range, if none of the batch sizes
+            # in warmup_sizes or cudagraph_capture_sizes are in the range,
+            # add the end of the range to ensure compilation/warmup.
+            all_sizes = set(cg_capture_sizes)
+            all_sizes.update([x for x in warmup_sizes if isinstance(x, int)])
+            for compile_range in compile_ranges:
+                if not any(x in compile_range for x in all_sizes):
+                    warmup_sizes.append(compile_range.end)
+
         # We skip EPLB here since we don't want to record dummy metrics
         for size in sorted(warmup_sizes, reverse=True):
             logger.info("Compile and warming up model for size %d", size)
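
To illustrate the range-coverage rule at the end of the second hunk, here is a minimal standalone sketch of the same logic. Range and pick_warmup_sizes are hypothetical stand-ins, not vLLM API: the real object returned by get_compile_ranges() and its membership semantics (e.g. whether bounds are inclusive) live in vLLM's compilation config and may differ.

# Minimal sketch of the range-coverage rule above; names and inclusive
# bounds are assumptions, not vLLM's actual classes.
from dataclasses import dataclass

@dataclass(frozen=True)
class Range:
    start: int
    end: int

    def __contains__(self, x: int) -> bool:
        # Assumed inclusive bounds; the real semantics live in vLLM.
        return self.start <= x <= self.end

def pick_warmup_sizes(
    warmup_sizes: list[int],
    cg_capture_sizes: list[int],
    compile_ranges: list[Range],
) -> list[int]:
    """Append each range's end when no known batch size already falls in it."""
    sizes = list(warmup_sizes)
    covered = set(cg_capture_sizes) | {x for x in sizes if isinstance(x, int)}
    for r in compile_ranges:
        if not any(x in r for x in covered):
            sizes.append(r.end)
    return sizes

# Capture size 256 already covers the range (1, 512), but nothing falls in
# (513, 8192), so its end, 8192, is added for compilation/warmup.
assert pick_warmup_sizes([64], [256], [Range(1, 512), Range(513, 8192)]) == [64, 8192]

The design point the diff encodes: every compile range should be exercised by at least one warmup batch size, so any range not already hit by a compile size or cudagraph capture size gets warmed up at its upper end.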