Add memory analyzer & automatically configure KV cache size (#6)

This commit is contained in:
Woosuk Kwon
2023-03-11 23:23:14 -08:00
committed by GitHub
parent 1a7eb7da61
commit e9d3f2ff77
7 changed files with 216 additions and 34 deletions

View File

@@ -9,8 +9,6 @@ from cacheflow.sequence import SequenceGroupInputs
from cacheflow.sequence import SequenceOutputs
from cacheflow.sequence import SequenceStatus
_MAX_NUM_BATCHED_TOKENS = 2048
class Scheduler:
@@ -21,12 +19,14 @@ class Scheduler:
block_size: int,
num_gpu_blocks: int,
num_cpu_blocks: int,
max_num_batched_tokens: int,
) -> None:
self.frontend = frontend
self.controllers = controllers
self.block_size = block_size
self.num_gpu_blocks = num_gpu_blocks
self.num_cpu_blocks = num_cpu_blocks
self.max_num_batched_tokens = max_num_batched_tokens
# Create the block space manager.
self.block_manager = BlockSpaceManager(
@@ -164,7 +164,7 @@ class Scheduler:
num_prompt_tokens = seq_group.seqs[0].get_len()
if self.block_manager.can_allocate(seq_group):
if (num_batched_tokens + num_prompt_tokens
<= _MAX_NUM_BATCHED_TOKENS):
<= self.max_num_batched_tokens):
self._allocate(seq_group)
num_batched_tokens += num_prompt_tokens
continue