[Core][v1] Unify allocating slots in prefill and decode in KV cache manager (#12608)
As proposed in RFC https://github.com/vllm-project/vllm/issues/12254, this PR combines allocate_slots and append_slots into a single allocate_slots method. There should be no functional change, except that the decode path now also raises an exception when num_tokens is zero (as the prefill path already does); the unit tests are updated accordingly. @comaniac @rickyyx @WoosukKwon @youkaichao @heheda12345 @simon-mo --------- Signed-off-by: Shawn Du <shawnd200@outlook.com>
This commit is contained in:
@@ -164,7 +164,7 @@ def test_decode():
|
||||
req0.num_computed_tokens = 55
|
||||
for _ in range(4):
|
||||
req0.append_output_token_ids(8)
|
||||
new_blocks = manager.append_slots(req0, 4)
|
||||
new_blocks = manager.allocate_slots(req0, 4)
|
||||
assert new_blocks is not None and len(new_blocks) == 0
|
||||
assert manager.req_to_blocks[req0.request_id][-2].block_hash is None
|
||||
|
||||
@@ -175,7 +175,7 @@ def test_decode():
|
||||
# the preallocated block.
|
||||
for _ in range(5 + 10):
|
||||
req0.append_output_token_ids(7)
|
||||
new_blocks = manager.append_slots(req0, 15)
|
||||
new_blocks = manager.allocate_slots(req0, 15)
|
||||
assert new_blocks is not None and len(new_blocks) == 0
|
||||
assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None
|
||||
|
||||
@@ -185,7 +185,7 @@ def test_decode():
|
||||
# the preallocated block.
|
||||
for _ in range(6 + 11):
|
||||
req0.append_output_token_ids(12)
|
||||
new_blocks = manager.append_slots(req0, 17)
|
||||
new_blocks = manager.allocate_slots(req0, 17)
|
||||
# Plus one preallocated block.
|
||||
assert new_blocks is not None and len(new_blocks) == 2
|
||||
|
||||
@@ -395,12 +395,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
|
||||
req.num_computed_tokens = block_size
|
||||
assert len(blocks) == 1 + num_preallocated_blocks
|
||||
|
||||
# Assume all computed.
|
||||
manager.append_slots(req, block_size * (len(blocks) - 1))
|
||||
req.num_computed_tokens = block_size * len(blocks)
|
||||
# Assume all computed, only when num_preallocate_tokens > 0, we need to
|
||||
# consume the previously preallocated blocks.
|
||||
if num_preallocated_blocks > 0:
|
||||
manager.allocate_slots(req, block_size * (len(blocks) - 1))
|
||||
req.num_computed_tokens = block_size * len(blocks)
|
||||
|
||||
# Append 1 block.
|
||||
blocks = manager.append_slots(req, block_size)
|
||||
blocks = manager.allocate_slots(req, block_size)
|
||||
assert len(blocks) == 1 + num_preallocated_blocks
|
||||
|
||||
|
||||
@@ -503,7 +505,7 @@ def test_mm_prefix_caching():
|
||||
# Append slots without allocating a new block.
|
||||
for _ in range(5):
|
||||
req0.append_output_token_ids(8)
|
||||
new_blocks = manager.append_slots(req0, 5)
|
||||
new_blocks = manager.allocate_slots(req0, 5)
|
||||
assert new_blocks is not None and len(new_blocks) == 0
|
||||
|
||||
# The just completed block should have hashes with extra keys.
|
||||
@@ -603,7 +605,7 @@ def test_reset_prefix_cache():
|
||||
unique_token_ids = [3] * 7
|
||||
all_token_ids = full_block_token_ids + unique_token_ids
|
||||
req0 = make_request("0", all_token_ids)
|
||||
blocks = manager.allocate_slots(req0, 55, [])
|
||||
blocks = manager.allocate_slots(req0, 55)
|
||||
assert [b.block_id for b in blocks] == [0, 1, 2, 3]
|
||||
|
||||
unique_token_ids = [4] * 7
|
||||
@@ -639,7 +641,7 @@ def test_uncache_blocks():
|
||||
)
|
||||
|
||||
req0 = make_request("0", list(range(30)))
|
||||
blocks = manager.allocate_slots(req0, 30, [])
|
||||
blocks = manager.allocate_slots(req0, 30)
|
||||
assert [b.block_id for b in blocks] == [0, 1]
|
||||
assert len(manager.cached_block_hash_to_block) == 1
|
||||
|
||||
@@ -648,7 +650,7 @@ def test_uncache_blocks():
|
||||
# Simulate speculative tokens.
|
||||
for _ in range(5):
|
||||
req0.append_output_token_ids(8)
|
||||
manager.append_slots(req0, 5)
|
||||
manager.allocate_slots(req0, 5)
|
||||
assert len(manager.cached_block_hash_to_block) == 2
|
||||
|
||||
# After sampling, assuming only 1 token is accepted.
|
||||
|
||||
Reference in New Issue
Block a user