[V1] Move more control of kv cache initialization from model_executor to EngineCore (#11960)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
This commit is contained in:
Chen Zhang
2025-01-17 15:39:35 +08:00
committed by GitHub
parent 8027a72461
commit 69d765f5a5
12 changed files with 514 additions and 103 deletions

View File

@@ -23,6 +23,7 @@ from vllm.logger import init_logger
from vllm.utils import (get_distributed_init_method, get_mp_context,
get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
from vllm.worker.worker_base import WorkerWrapperBase
@@ -90,29 +91,33 @@ class MultiprocExecutor(Executor):
for w in self.workers:
w.worker_response_mq.wait_until_ready()
def initialize(self, num_gpu_blocks: int) -> None:
def initialize(self, kv_cache_config: KVCacheConfig) -> None:
"""
Initialize the KV caches and begin the model execution loop of the
underlying workers.
"""
logger.info("# GPU blocks: %d", num_gpu_blocks)
self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
self.collective_rpc("initialize_cache", args=(kv_cache_config, ))
self.collective_rpc("compile_or_warm_up_model")
def determine_num_available_blocks(self) -> Tuple[int, int]:
def determine_available_memory(self) -> int:
"""
Determine the number of available KV blocks by invoking the
Determine the available memory (in bytes) for KV cache by invoking the
underlying worker.
"""
num_blocks = self.collective_rpc("determine_num_available_blocks")
memory_sizes = self.collective_rpc("determine_available_memory")
# Since we use a shared centralized controller, we take the minimum
# number of blocks across all workers to make sure all the memory
# memory size across all workers to make sure all the memory
# operators can be applied to all workers.
num_gpu_blocks = min(b[0] for b in num_blocks)
num_cpu_blocks = min(b[1] for b in num_blocks)
return min(memory_sizes)
return num_gpu_blocks, num_cpu_blocks
def get_kv_cache_spec(self) -> KVCacheSpec:
"""
Get all KV cache specs needed by the model by invoking the underlying worker.
"""
kv_cache_specs = self.collective_rpc("get_kv_cache_spec")
assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
return kv_cache_specs[0]
def collective_rpc(self,
method: str,