[Core] Deprecating block manager v1 and make block manager v2 default (#8704)
Removing the block manager v1. This is the initial piece of prefix-caching-centric design. In order to achieve prefix-caching-centric design, we need to simplify the code path so that we only use v2 block manager (which has much higher performance on prefix caching).
This commit is contained in:
@@ -373,12 +373,13 @@ class EngineArgs:
|
||||
action='store_true',
|
||||
help='Disables sliding window, '
|
||||
'capping to sliding window size')
|
||||
parser.add_argument(
|
||||
'--use-v2-block-manager',
|
||||
default=EngineArgs.use_v2_block_manager,
|
||||
action='store_true',
|
||||
help='Use BlockSpaceMangerV2. By default this is set to True. '
|
||||
'Set to False to use BlockSpaceManagerV1')
|
||||
parser.add_argument('--use-v2-block-manager',
|
||||
action='store_true',
|
||||
help='[DEPRECATED] block manager v1 has been '
|
||||
'removed and SelfAttnBlockSpaceManager (i.e. '
|
||||
'block manager v2) is now the default. '
|
||||
'Setting this flag to True or False'
|
||||
' has no effect on vLLM behavior.')
|
||||
parser.add_argument(
|
||||
'--num-lookahead-slots',
|
||||
type=int,
|
||||
@@ -969,12 +970,6 @@ class EngineArgs:
|
||||
"in low performance due to small KV cache space. Consider "
|
||||
"setting --max-model-len to a smaller value.", max_model_len)
|
||||
|
||||
if self.num_scheduler_steps > 1 and not self.use_v2_block_manager:
|
||||
self.use_v2_block_manager = True
|
||||
logger.warning(
|
||||
"Enabled BlockSpaceManagerV2 because it is "
|
||||
"required for multi-step (--num-scheduler-steps > 1)")
|
||||
|
||||
speculative_config = SpeculativeConfig.maybe_create_spec_config(
|
||||
target_model_config=model_config,
|
||||
target_parallel_config=parallel_config,
|
||||
@@ -990,7 +985,6 @@ class EngineArgs:
|
||||
speculative_disable_by_batch_size,
|
||||
speculative_max_model_len=self.speculative_max_model_len,
|
||||
enable_chunked_prefill=self.enable_chunked_prefill,
|
||||
use_v2_block_manager=self.use_v2_block_manager,
|
||||
disable_log_stats=self.disable_log_stats,
|
||||
ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
|
||||
ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
|
||||
@@ -1021,11 +1015,20 @@ class EngineArgs:
|
||||
if speculative_config is None \
|
||||
else speculative_config.num_lookahead_slots
|
||||
|
||||
if not self.use_v2_block_manager:
|
||||
logger.warning(
|
||||
"[DEPRECATED] Block manager v1 has been removed, "
|
||||
"and setting --use-v2-block-manager to True or False has "
|
||||
"no effect on vLLM behavior. Please remove "
|
||||
"--use-v2-block-manager in your engine argument. "
|
||||
"If your use case is not supported by "
|
||||
"SelfAttnBlockSpaceManager (i.e. block manager v2),"
|
||||
" please file an issue with detailed information.")
|
||||
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_batched_tokens=self.max_num_batched_tokens,
|
||||
max_num_seqs=self.max_num_seqs,
|
||||
max_model_len=model_config.max_model_len,
|
||||
use_v2_block_manager=self.use_v2_block_manager,
|
||||
num_lookahead_slots=num_lookahead_slots,
|
||||
delay_factor=self.scheduler_delay_factor,
|
||||
enable_chunked_prefill=self.enable_chunked_prefill,
|
||||
@@ -1081,13 +1084,6 @@ class EngineArgs:
|
||||
or "all" in detailed_trace_modules,
|
||||
)
|
||||
|
||||
if (model_config.get_sliding_window() is not None
|
||||
and scheduler_config.chunked_prefill_enabled
|
||||
and not scheduler_config.use_v2_block_manager):
|
||||
raise ValueError(
|
||||
"Chunked prefill is not supported with sliding window. "
|
||||
"Set --disable-sliding-window to disable sliding window.")
|
||||
|
||||
return EngineConfig(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
|
||||
Reference in New Issue
Block a user