[V1] Do not allocate beyond the max_model_len (#10730)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon
2024-11-28 00:13:15 -08:00
committed by GitHub
parent d9b4b3f069
commit a79b122400
3 changed files with 44 additions and 18 deletions

View File

@@ -23,7 +23,8 @@ def test_prefill():
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=True,
num_preallocate_tokens=16,
)
@@ -121,7 +122,8 @@ def test_decode():
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=True,
num_preallocate_tokens=16,
)
@@ -172,7 +174,8 @@ def test_evict():
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=True,
num_preallocate_tokens=16,
)
@@ -220,7 +223,8 @@ def test_hash_block_correct_reuse():
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=1,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=True,
num_preallocate_tokens=0,
)
@@ -256,7 +260,8 @@ def test_computed_blocks_not_evicted():
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=2,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=True,
num_preallocate_tokens=0,
)
@@ -303,7 +308,8 @@ def test_basic_prefix_caching_disabled():
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=4,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=False,
num_preallocate_tokens=0,
)
@@ -342,7 +348,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=10,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=True,
num_preallocate_tokens=num_preallocate_tokens,
)
@@ -370,7 +377,8 @@ def test_cache_blocks():
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=5,
- sliding_window=False,
+ max_model_len=8192,
+ sliding_window=None,
enable_caching=True,
num_preallocate_tokens=0,
)