[V0 deprecation] Clean up V0 fallback in compilation config (#25675)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -384,19 +384,7 @@ class VllmConfig:
|
||||
else:
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
|
||||
if self.cache_config.cpu_offload_gb > 0 and \
|
||||
self.compilation_config.level != CompilationLevel.NO_COMPILATION \
|
||||
and not envs.VLLM_USE_V1:
|
||||
logger.warning(
|
||||
"CPU offload is not supported with `torch.compile` in v0 yet."
|
||||
" Disabling `torch.compile`.")
|
||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
|
||||
if self.cache_config.kv_sharing_fast_prefill:
|
||||
if not envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"Fast prefill optimization for KV sharing is not supported "
|
||||
"in V0 currently.")
|
||||
|
||||
if self.speculative_config is not None and \
|
||||
self.speculative_config.use_eagle():
|
||||
@@ -410,14 +398,6 @@ class VllmConfig:
|
||||
"--kv-sharing-fast-prefill requires changes on model side for "
|
||||
"correctness and to realize prefill savings. ")
|
||||
|
||||
if ((not envs.VLLM_USE_V1) and self.lora_config is not None
|
||||
and self.compilation_config.level
|
||||
!= CompilationLevel.NO_COMPILATION):
|
||||
logger.warning(
|
||||
"LoRA for V0 is not supported with `torch.compile` yet. "
|
||||
"Disabling `torch.compile`.")
|
||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
|
||||
disable_chunked_prefill_reasons: list[str] = []
|
||||
|
||||
if self.model_config:
|
||||
@@ -604,57 +584,27 @@ class VllmConfig:
|
||||
"""
|
||||
|
||||
# calculate the default `batch_size_capture_list`
|
||||
if not envs.VLLM_USE_V1:
|
||||
batch_size_capture_list = []
|
||||
if self.scheduler_config is not None and \
|
||||
self.model_config is not None and \
|
||||
not self.model_config.enforce_eager:
|
||||
|
||||
possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
|
||||
if self.parallel_config.tensor_parallel_size > 1 and \
|
||||
self.compilation_config.pass_config.enable_sequence_parallelism:
|
||||
possible_sizes = self.update_sizes_for_sequence_parallelism(
|
||||
possible_sizes)
|
||||
|
||||
# find the minimum size that is larger than max_num_seqs,
|
||||
# which then becomes the max_batchsize_to_capture
|
||||
larger_sizes = [
|
||||
x for x in possible_sizes
|
||||
if x >= self.scheduler_config.max_num_seqs
|
||||
]
|
||||
if larger_sizes:
|
||||
max_batchsize_to_capture = larger_sizes[0]
|
||||
else:
|
||||
max_batchsize_to_capture = possible_sizes[-1]
|
||||
|
||||
# filter out the sizes that are
|
||||
# larger than max_batchsize_to_capture
|
||||
batch_size_capture_list = [
|
||||
size for size in possible_sizes
|
||||
if size <= max_batchsize_to_capture
|
||||
]
|
||||
else:
|
||||
batch_size_capture_list = []
|
||||
if self.model_config is not None and \
|
||||
not self.model_config.enforce_eager:
|
||||
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
|
||||
if len(cuda_graph_sizes) == 1:
|
||||
batch_size_capture_list = [1, 2, 4] + [
|
||||
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
|
||||
]
|
||||
elif len(cuda_graph_sizes) > 1:
|
||||
batch_size_capture_list = sorted(cuda_graph_sizes)
|
||||
else:
|
||||
raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
|
||||
if self.parallel_config.tensor_parallel_size > 1 and \
|
||||
self.compilation_config.pass_config.enable_sequence_parallelism:
|
||||
batch_size_capture_list = \
|
||||
self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
|
||||
max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
||||
batch_size_capture_list = [
|
||||
size for size in batch_size_capture_list
|
||||
if size <= max_num_tokens
|
||||
batch_size_capture_list = []
|
||||
if self.model_config is not None and \
|
||||
not self.model_config.enforce_eager:
|
||||
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
|
||||
if len(cuda_graph_sizes) == 1:
|
||||
batch_size_capture_list = [1, 2, 4] + [
|
||||
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
|
||||
]
|
||||
elif len(cuda_graph_sizes) > 1:
|
||||
batch_size_capture_list = sorted(cuda_graph_sizes)
|
||||
else:
|
||||
raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
|
||||
if self.parallel_config.tensor_parallel_size > 1 and \
|
||||
self.compilation_config.pass_config.enable_sequence_parallelism:
|
||||
batch_size_capture_list = \
|
||||
self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
|
||||
max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
||||
batch_size_capture_list = [
|
||||
size for size in batch_size_capture_list
|
||||
if size <= max_num_tokens
|
||||
]
|
||||
|
||||
self.compilation_config.init_with_cudagraph_sizes(
|
||||
batch_size_capture_list)
|
||||
|
||||
Reference in New Issue
Block a user