[v1] Hybrid Memory Allocator (#17996)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
@@ -2104,6 +2104,12 @@ class SchedulerConfig:
|
||||
default scheduler. Can be a class directly or the path to a class of form
|
||||
"mod.custom_class"."""
|
||||
|
||||
disable_hybrid_kv_cache_manager: bool = False
|
||||
"""If set to True, KV cache manager will allocate the same size of KV cache
|
||||
for all attention layers even if there are multiple types of attention layers
|
||||
like full attention and sliding window attention.
|
||||
"""
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
WARNING: Whenever a new field is added to this config,
|
||||
@@ -4465,6 +4471,21 @@ class VllmConfig:
|
||||
if not self.instance_id:
|
||||
self.instance_id = random_uuid()[:5]
|
||||
|
||||
if (envs.VLLM_USE_V1
|
||||
and not self.scheduler_config.disable_hybrid_kv_cache_manager):
|
||||
# logger should only print a warning message for hybrid models. As we
|
||||
# can't know whether the model is hybrid or not yet, we don't log the
|
||||
# warning message here and will log it later.
|
||||
if not (current_platform.is_cuda() or current_platform.is_rocm()):
|
||||
# Hybrid KV cache manager is not supported on non-GPU platforms.
|
||||
self.disable_hybrid_kv_cache_manager = True
|
||||
if self.kv_transfer_config is not None:
|
||||
# Hybrid KV cache manager is not compatible with KV transfer.
|
||||
self.disable_hybrid_kv_cache_manager = True
|
||||
if self.kv_events_config is not None:
|
||||
# Hybrid KV cache manager is not compatible with KV events.
|
||||
self.disable_hybrid_kv_cache_manager = True
|
||||
|
||||
def update_sizes_for_sequence_parallelism(self,
|
||||
possible_sizes: list) -> list:
|
||||
# remove the sizes that not multiple of tp_size when
|
||||
|
||||
Reference in New Issue
Block a user