[Core] Deprecate block manager v1 and make block manager v2 default (#8704)
Remove block manager v1. This is the initial piece of the prefix-caching-centric design. To achieve that design, we need to simplify the code path so that only the v2 block manager is used (it has much higher performance on prefix caching).
This commit is contained in:
@@ -949,7 +949,6 @@ class SchedulerConfig:
|
||||
iteration.
|
||||
max_model_len: Maximum length of a sequence (including prompt
|
||||
and generated text).
|
||||
use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not.
|
||||
num_lookahead_slots: The number of slots to allocate per sequence per
|
||||
step, beyond the known token ids. This is used in speculative
|
||||
decoding to store KV activations of tokens which may or may not be
|
||||
@@ -976,7 +975,6 @@ class SchedulerConfig:
|
||||
max_num_batched_tokens: Optional[int],
|
||||
max_num_seqs: int,
|
||||
max_model_len: int,
|
||||
use_v2_block_manager: bool = True,
|
||||
num_lookahead_slots: int = 0,
|
||||
delay_factor: float = 0.0,
|
||||
enable_chunked_prefill: bool = False,
|
||||
@@ -1026,7 +1024,6 @@ class SchedulerConfig:
|
||||
|
||||
self.max_num_seqs = max_num_seqs
|
||||
self.max_model_len = max_model_len
|
||||
self.use_v2_block_manager = use_v2_block_manager
|
||||
self.num_lookahead_slots = num_lookahead_slots
|
||||
self.delay_factor = delay_factor
|
||||
self.chunked_prefill_enabled = enable_chunked_prefill
|
||||
@@ -1067,18 +1064,6 @@ class SchedulerConfig:
|
||||
f"({self.num_scheduler_steps}) must be greater than or "
|
||||
"equal to 1.")
|
||||
|
||||
if (not self.use_v2_block_manager \
|
||||
and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1):
|
||||
raise ValueError(
|
||||
"The use of BlockSpaceManagerV1 is deprecated and will "
|
||||
"be removed in a future release. Please switch to "
|
||||
"BlockSpaceManagerV2 by setting --use-v2-block-manager to "
|
||||
"True. If you wish to suppress this error temporarily, "
|
||||
"you can set the environment variable "
|
||||
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. If your use "
|
||||
"case is not supported in BlockSpaceManagerV2, please "
|
||||
"file an issue with detailed information.")
|
||||
|
||||
@property
|
||||
def is_multi_step(self) -> bool:
|
||||
return self.num_scheduler_steps > 1
|
||||
@@ -1137,7 +1122,6 @@ class SpeculativeConfig:
|
||||
speculative_disable_mqa_scorer: Optional[bool],
|
||||
speculative_max_model_len: Optional[int],
|
||||
enable_chunked_prefill: bool,
|
||||
use_v2_block_manager: bool,
|
||||
disable_log_stats: bool,
|
||||
speculative_disable_by_batch_size: Optional[int],
|
||||
ngram_prompt_lookup_max: Optional[int],
|
||||
@@ -1178,9 +1162,6 @@ class SpeculativeConfig:
|
||||
enable_chunked_prefill (bool): Whether vLLM is configured to use
|
||||
chunked prefill or not. Used for raising an error since its not
|
||||
yet compatible with spec decode.
|
||||
use_v2_block_manager (bool): Whether vLLM is configured to use the
|
||||
v2 block manager or not. Used for raising an error since the v2
|
||||
block manager is required with spec decode.
|
||||
speculative_disable_by_batch_size (Optional[int]): Disable
|
||||
speculative decoding for new incoming requests when the number
|
||||
of enqueue requests is larger than this value, if provided.
|
||||
@@ -1231,11 +1212,6 @@ class SpeculativeConfig:
|
||||
"Speculative decoding and chunked prefill are "
|
||||
f"currently mutually exclusive ({enable_chunked_prefill=}).")
|
||||
|
||||
if not use_v2_block_manager:
|
||||
raise ValueError(
|
||||
"Speculative decoding requires usage of the V2 "
|
||||
"block manager. Enable it with --use-v2-block-manager.")
|
||||
|
||||
# TODO: The user should be able to specify revision/max model len
|
||||
# for the draft model. It is not currently supported.
|
||||
draft_revision = None
|
||||
|
||||
Reference in New Issue
Block a user