[Core][v1] Unify allocating slots in prefill and decode in KV cache manager (#12608)

As mentioned in the RFC https://github.com/vllm-project/vllm/issues/12254, this PR combines `allocate_slots` and `append_slots` into a single `allocate_slots` method. There should be no functional change, except that decode now also raises an exception when `num_tokens` is zero (as prefill already does); the unit tests are updated accordingly. @comaniac @rickyyx @WoosukKwon @youkaichao @heheda12345 @simon-mo --------- Signed-off-by: Shawn Du <shawnd200@outlook.com>
2025-02-02 16:40:58 +08:00
parent abfcdcdf27
commit f8ece6e17f
3 changed files with 78 additions and 116 deletions
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -164,7 +164,7 @@ def test_decode():
    req0.num_computed_tokens = 55
    for _ in range(4):
        req0.append_output_token_ids(8)
-    new_blocks = manager.append_slots(req0, 4)
+    new_blocks = manager.allocate_slots(req0, 4)
    assert new_blocks is not None and len(new_blocks) == 0
    assert manager.req_to_blocks[req0.request_id][-2].block_hash is None

@@ -175,7 +175,7 @@ def test_decode():
    # the preallocated block.
    for _ in range(5 + 10):
        req0.append_output_token_ids(7)
-    new_blocks = manager.append_slots(req0, 15)
+    new_blocks = manager.allocate_slots(req0, 15)
    assert new_blocks is not None and len(new_blocks) == 0
    assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None

@@ -185,7 +185,7 @@ def test_decode():
    # the preallocated block.
    for _ in range(6 + 11):
        req0.append_output_token_ids(12)
-    new_blocks = manager.append_slots(req0, 17)
+    new_blocks = manager.allocate_slots(req0, 17)
    # Plus one preallocated block.
    assert new_blocks is not None and len(new_blocks) == 2

@@ -395,12 +395,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
    req.num_computed_tokens = block_size
    assert len(blocks) == 1 + num_preallocated_blocks

-    # Assume all computed.
-    manager.append_slots(req, block_size * (len(blocks) - 1))
-    req.num_computed_tokens = block_size * len(blocks)
+    # Assume all computed. Only when num_preallocate_tokens > 0 do we need to
+    # consume the previously preallocated blocks.
+    if num_preallocated_blocks > 0:
+        manager.allocate_slots(req, block_size * (len(blocks) - 1))
+        req.num_computed_tokens = block_size * len(blocks)

    # Append 1 block.
-    blocks = manager.append_slots(req, block_size)
+    blocks = manager.allocate_slots(req, block_size)
    assert len(blocks) == 1 + num_preallocated_blocks


@@ -503,7 +505,7 @@ def test_mm_prefix_caching():
    # Append slots without allocating a new block.
    for _ in range(5):
        req0.append_output_token_ids(8)
-    new_blocks = manager.append_slots(req0, 5)
+    new_blocks = manager.allocate_slots(req0, 5)
    assert new_blocks is not None and len(new_blocks) == 0

    # The just completed block should have hashes with extra keys.
@@ -603,7 +605,7 @@ def test_reset_prefix_cache():
    unique_token_ids = [3] * 7
    all_token_ids = full_block_token_ids + unique_token_ids
    req0 = make_request("0", all_token_ids)
-    blocks = manager.allocate_slots(req0, 55, [])
+    blocks = manager.allocate_slots(req0, 55)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3]

    unique_token_ids = [4] * 7
@@ -639,7 +641,7 @@ def test_uncache_blocks():
    )

    req0 = make_request("0", list(range(30)))
-    blocks = manager.allocate_slots(req0, 30, [])
+    blocks = manager.allocate_slots(req0, 30)
    assert [b.block_id for b in blocks] == [0, 1]
    assert len(manager.cached_block_hash_to_block) == 1

@@ -648,7 +650,7 @@ def test_uncache_blocks():
    # Simulate speculative tokens.
    for _ in range(5):
        req0.append_output_token_ids(8)
-    manager.append_slots(req0, 5)
+    manager.allocate_slots(req0, 5)
    assert len(manager.cached_block_hash_to_block) == 2

    # After sampling, assuming only 1 token is accepted.