[Core][Hybrid allocator + kv connector 1/n] Enable hybrid allocator + KV cache connector (#25712)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu> Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-10-24 23:34:18 -07:00
parent 56ed7609a9
commit b853540388
15 changed files with 113 additions and 18 deletions
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -331,6 +331,15 @@ class Worker(WorkerBase):
    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate GPU KV cache with the specified kv_cache_config."""

+        # Init kv cache connector here, because it requires
+        # `kv_cache_config`.
+        # NOTE(Kuntai): This needs to be done before `initialize_kv_cache`,
+        # because `initialize_kv_cache` will inject kv cache groups not
+        # related to kv cache connector (e.g. kv cache sharing layers).
+        connector_vllm_config = copy.copy(self.vllm_config)
+        connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)
+        ensure_kv_transfer_initialized(connector_vllm_config)
+
        if self.vllm_config.model_config.enable_sleep_mode:
            from vllm.device_allocator.cumem import CuMemAllocator

@@ -783,5 +792,3 @@ def init_worker_distributed_environment(
        parallel_config.pipeline_parallel_size,
        parallel_config.decode_context_parallel_size,
    )
-
-    ensure_kv_transfer_initialized(vllm_config)