[V1] Move more control of kv cache initialization from model_executor to EngineCore (#11960)
Signed-off-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
This commit is contained in:
@@ -23,6 +23,7 @@ from vllm.logger import init_logger
|
||||
from vllm.utils import (get_distributed_init_method, get_mp_context,
|
||||
get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.worker.worker_base import WorkerWrapperBase
|
||||
|
||||
@@ -90,29 +91,33 @@ class MultiprocExecutor(Executor):
|
||||
for w in self.workers:
|
||||
w.worker_response_mq.wait_until_ready()
|
||||
|
||||
def initialize(self, num_gpu_blocks: int) -> None:
|
||||
def initialize(self, kv_cache_config: KVCacheConfig) -> None:
|
||||
"""
|
||||
Initialize the KV caches and begin the model execution loop of the
|
||||
underlying workers.
|
||||
"""
|
||||
logger.info("# GPU blocks: %d", num_gpu_blocks)
|
||||
self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
|
||||
self.collective_rpc("initialize_cache", args=(kv_cache_config, ))
|
||||
self.collective_rpc("compile_or_warm_up_model")
|
||||
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
def determine_available_memory(self) -> int:
|
||||
"""
|
||||
Determine the number of available KV blocks by invoking the
|
||||
Determine the available memory (in bytes) for KV cache by invoking the
|
||||
underlying worker.
|
||||
"""
|
||||
num_blocks = self.collective_rpc("determine_num_available_blocks")
|
||||
memory_sizes = self.collective_rpc("determine_available_memory")
|
||||
|
||||
# Since we use a shared centralized controller, we take the minimum
|
||||
# number of blocks across all workers to make sure all the memory
|
||||
# memory size across all workers to make sure all the memory
|
||||
# operations can be applied to all workers.
|
||||
num_gpu_blocks = min(b[0] for b in num_blocks)
|
||||
num_cpu_blocks = min(b[1] for b in num_blocks)
|
||||
return min(memory_sizes)
|
||||
|
||||
return num_gpu_blocks, num_cpu_blocks
|
||||
def get_kv_cache_spec(self) -> KVCacheSpec:
|
||||
"""
|
||||
Get the KV cache spec needed by the model by invoking the underlying workers.
|
||||
"""
|
||||
kv_cache_specs = self.collective_rpc("get_kv_cache_spec")
|
||||
assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
|
||||
return kv_cache_specs[0]
|
||||
|
||||
def collective_rpc(self,
|
||||
method: str,
|
||||
|
||||
Reference in New Issue
Block a user