[V1] Move more control of kv cache initialization from model_executor to EngineCore (#11960)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
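
The diff below reshapes the V1 GPU `Worker`'s KV cache interface: instead of computing block counts and validating them locally, the worker now only reports how much memory is free and what KV cache layout the model needs, then accepts a ready-made `KVCacheConfig` from the engine. A minimal sketch of the resulting worker-facing surface (signatures taken from the hunks below; the `...` bodies are placeholders, not the real implementation):

# Worker-side KV cache API after this change. Signatures come from the diff
# below; bodies are elided, so this is an outline, not the actual code.
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec


class Worker:

    def determine_available_memory(self) -> int:
        """Profile the model's peak memory usage and return the free
        KV cache memory, in bytes."""
        ...

    def get_kv_cache_spec(self) -> KVCacheSpec:
        """Report the KV cache layout required by the loaded model."""
        ...

    def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate the GPU KV cache from an engine-provided config."""
        ...
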
@@ -1,7 +1,7 @@
 """A GPU worker class."""
 import gc
 import os
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import TYPE_CHECKING, Optional
 
 import torch
 import torch.distributed
@@ -16,6 +16,7 @@ from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size
 from vllm.v1.core.scheduler import SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
@@ -112,20 +113,18 @@ class Worker:
         self.model_runner.load_model()
 
     @torch.inference_mode()
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Profiles the peak memory usage of the model to determine how many
-        KV blocks may be allocated without OOMs.
+    def determine_available_memory(self) -> int:
+        """Profiles the peak memory usage of the model to determine how much
+        memory can be used for KV cache without OOMs.
 
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
-        that can be allocated with the remaining free memory.
+        Then, it calculate the free memory that can be used for KV cache in
+        bytes.
 
         .. tip::
             You may limit the usage of GPU memory
             by adjusting the `gpu_memory_utilization` parameter.
         """
-        # Profile the memory usage of the model and get the maximum number of
-        # cache blocks that can be allocated with the remaining free memory.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
 
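The new docstring above states the contract: profile once, then report the free KV cache memory in bytes. The subtraction itself appears as context in the next hunk (total_gpu_memory * gpu_memory_utilization - peak_memory). A small worked example with made-up numbers, just to show the units involved:

# Illustrative numbers only; nothing here is measured from a real GPU.
total_gpu_memory = 80 * 1024**3        # an 80 GiB device
gpu_memory_utilization = 0.90          # the knob mentioned in the tip above
peak_memory = 30 * 1024**3             # peak usage observed while profiling

available_kv_cache_memory = (
    total_gpu_memory * gpu_memory_utilization - peak_memory)
print(int(available_kv_cache_memory))  # 45097156608 bytes, i.e. 42 GiB for KV cache
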
@@ -161,33 +160,14 @@ class Worker:
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
-        # Calculate the number of blocks that can be allocated with the
-        # profiled peak memory.
-        cache_block_size = _get_cache_block_size(self.cache_config,
-                                                 self.model_config,
-                                                 self.parallel_config)
-        num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
-        num_gpu_blocks = max(num_gpu_blocks, 0)
-        return num_gpu_blocks, 0
+        return int(available_kv_cache_memory)
 
-    def initialize_cache(self, num_gpu_blocks: int) -> None:
-        """Allocate GPU and CPU KV cache with the specified number of blocks."""
-        if num_gpu_blocks <= 0:
-            raise ValueError("No available memory for the cache blocks. "
-                             "Try increasing `gpu_memory_utilization` when "
-                             "initializing the engine.")
+    def get_kv_cache_spec(self) -> KVCacheSpec:
+        return self.model_runner.get_kv_cache_spec()
 
-        max_seq_len = self.cache_config.block_size * num_gpu_blocks
-        max_model_len = self.model_config.max_model_len
-        if max_model_len > max_seq_len:
-            raise ValueError(
-                f"The model's max seq len ({max_model_len}) "
-                "is larger than the maximum number of tokens that can be "
-                f"stored in KV cache ({max_seq_len}). Try increasing "
-                "`gpu_memory_utilization` or decreasing `max_model_len` when "
-                "initializing the engine.")
-
-        self.model_runner.initialize_kv_cache(num_gpu_blocks)
+    def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
+        """Allocate GPU KV cache with the specified kv_cache_config."""
+        self.model_runner.initialize_kv_cache(kv_cache_config)
 
     def compile_or_warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:
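With the worker trimmed down to these calls, the sizing logic the old code ran in place (cache_block_size, num_gpu_blocks, the max_model_len check) has to happen on the engine side, which is the point of this commit. The EngineCore half is not part of this file, so the following is only a rough sketch under that assumption; build_kv_cache_config is a hypothetical stand-in, not a function from the diff:

# Rough sketch of how an engine could drive the new worker interface.
# Only the four worker methods are taken from the diff; everything else,
# including build_kv_cache_config, is a hypothetical placeholder.

def build_kv_cache_config(kv_cache_spec, available_memory: int):
    """Placeholder for the engine-side sizing step (not shown in this diff)."""
    raise NotImplementedError


def setup_kv_cache(worker) -> None:
    available_memory = worker.determine_available_memory()  # bytes
    kv_cache_spec = worker.get_kv_cache_spec()
    kv_cache_config = build_kv_cache_config(kv_cache_spec, available_memory)
    worker.initialize_cache(kv_cache_config)
    worker.compile_or_warm_up_model()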