[Core][Hybrid allocator + kv connector 1/n] Enable hybrid allocator + KV cache connector (#25712)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
This commit is contained in:
Kuntai Du
2025-10-24 23:34:18 -07:00
committed by GitHub
parent 56ed7609a9
commit b853540388
15 changed files with 113 additions and 18 deletions

View File

@@ -331,6 +331,15 @@ class Worker(WorkerBase):
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config."""
# Init kv cache connector here, because it requires
# `kv_cache_config`.
# NOTE(Kuntai): This needs to be done before `initialize_kv_cache`,
# because `initialize_kv_cache` will inject kv cache groups not
# related to the kv cache connector (e.g. kv cache sharing layers).
connector_vllm_config = copy.copy(self.vllm_config)
connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)
ensure_kv_transfer_initialized(connector_vllm_config)
if self.vllm_config.model_config.enable_sleep_mode:
from vllm.device_allocator.cumem import CuMemAllocator
@@ -783,5 +792,3 @@ def init_worker_distributed_environment(
parallel_config.pipeline_parallel_size,
parallel_config.decode_context_parallel_size,
)
ensure_kv_transfer_initialized(vllm_config)