[V1] Move more control of kv cache initialization from model_executor to EngineCore (#11960)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
This commit is contained in:
Chen Zhang
2025-01-17 15:39:35 +08:00
committed by GitHub
parent 8027a72461
commit 69d765f5a5
12 changed files with 514 additions and 103 deletions

View File

@@ -23,6 +23,7 @@ from vllm.logger import init_logger
from vllm.utils import (get_distributed_init_method, get_mp_context,
get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
from vllm.worker.worker_base import WorkerWrapperBase
@@ -90,29 +91,33 @@ class MultiprocExecutor(Executor):
for w in self.workers:
w.worker_response_mq.wait_until_ready()
def initialize(self, num_gpu_blocks: int) -> None:
def initialize(self, kv_cache_config: KVCacheConfig) -> None:
"""
Initialize the KV caches and begin the model execution loop of the
underlying workers.
"""
logger.info("# GPU blocks: %d", num_gpu_blocks)
self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
self.collective_rpc("initialize_cache", args=(kv_cache_config, ))
self.collective_rpc("compile_or_warm_up_model")
def determine_num_available_blocks(self) -> Tuple[int, int]:
def determine_available_memory(self) -> int:
"""
Determine the number of available KV blocks by invoking the
Determine the available memory (in bytes) for KV cache by invoking the
underlying worker.
"""
num_blocks = self.collective_rpc("determine_num_available_blocks")
memory_sizes = self.collective_rpc("determine_available_memory")
# Since we use a shared centralized controller, we take the minimum
# number of blocks across all workers to make sure all the memory
# memory size across all workers to make sure all the memory
# operators can be applied to all workers.
num_gpu_blocks = min(b[0] for b in num_blocks)
num_cpu_blocks = min(b[1] for b in num_blocks)
return min(memory_sizes)
return num_gpu_blocks, num_cpu_blocks
def get_kv_cache_spec(self) -> KVCacheSpec:
"""
Get all KV cache specs needed by the model by invoking the underlying worker.
"""
kv_cache_specs = self.collective_rpc("get_kv_cache_spec")
assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
return kv_cache_specs[0]
def collective_rpc(self,
method: str,