[v1] Hybrid Memory Allocator (#17996)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
Chen Zhang
2025-06-06 11:47:09 +08:00
committed by GitHub
parent 3465b87ef8
commit f8a1a2d108
21 changed files with 1605 additions and 440 deletions


@@ -2104,6 +2104,12 @@ class SchedulerConfig:
default scheduler. Can be a class directly or the path to a class of form
"mod.custom_class"."""
disable_hybrid_kv_cache_manager: bool = False
"""If set to True, KV cache manager will allocate the same size of KV cache
for all attention layers even if there are multiple type of attention layers
like full attention and sliding window attention.
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
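For context, a minimal sketch of how this new flag could be set; constructing SchedulerConfig directly as shown here is illustrative and not part of this diff:

from vllm.config import SchedulerConfig

# Illustrative only: force uniform KV cache allocation across all
# attention layers (full and sliding-window alike) by opting out of
# the hybrid KV cache manager.
scheduler_config = SchedulerConfig(
    disable_hybrid_kv_cache_manager=True,
)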
@@ -4465,6 +4471,21 @@ class VllmConfig:
if not self.instance_id:
self.instance_id = random_uuid()[:5]
if (envs.VLLM_USE_V1
and not self.scheduler_config.disable_hybrid_kv_cache_manager):
# The logger should only warn for hybrid models. Since we can't yet
# tell whether the model is hybrid at this point, we skip the warning
# here and log it later.
if not (current_platform.is_cuda() or current_platform.is_rocm()):
# Hybrid KV cache manager is not supported on non-GPU platforms.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_transfer_config is not None:
# Hybrid KV cache manager is not compatible with KV transfer.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
def update_sizes_for_sequence_parallelism(self,
possible_sizes: list) -> list:
# remove the sizes that are not multiples of tp_size when
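To summarize the fallback rules added in `__post_init__` above, here is a condensed, hedged sketch; `_hybrid_kv_cache_supported` is a hypothetical helper written for illustration, not part of this diff:

import vllm.envs as envs
from vllm.platforms import current_platform

def _hybrid_kv_cache_supported(vllm_config) -> bool:
    """Hypothetical helper mirroring the checks above (illustrative)."""
    return (
        # The hybrid KV cache manager only applies to the V1 engine.
        envs.VLLM_USE_V1
        # Only CUDA and ROCm GPU platforms are supported.
        and (current_platform.is_cuda() or current_platform.is_rocm())
        # KV transfer and KV events are each incompatible with the
        # hybrid KV cache manager, so either one forces the fallback.
        and vllm_config.kv_transfer_config is None
        and vllm_config.kv_events_config is None
    )

Note that the actual diff mutates `scheduler_config.disable_hybrid_kv_cache_manager` in place when any check fails; this predicate only gathers the same conditions in one place.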