[V1] Remove pre-allocation for KV cache (#16941)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-22 00:52:18 -07:00
parent 2689d5c027
commit c4ab9f3e71
5 changed files with 61 additions and 141 deletions
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -804,20 +804,17 @@ def _assert_right_kv_cache_manager(
    """Check whether KVCacheManager is correct after allocate."""

    # Make sure the request stats are right.
-    EXPECTED_ACTUAL_BLOCKS = num_tokens // block_size
-    EXPECTED_TOTAL_BLOCKS = (EXPECTED_ACTUAL_BLOCKS +
-                             scheduler.kv_cache_manager.num_preallocate_blocks)
+    EXPECTED_TOTAL_BLOCKS = num_tokens // block_size
    for req_id in req_ids:
        blocks = scheduler.kv_cache_manager.req_to_blocks[req_id]
        hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id]
        assert (scheduler.kv_cache_manager.num_cached_block[req_id] ==
-                EXPECTED_ACTUAL_BLOCKS)
+                EXPECTED_TOTAL_BLOCKS)
        assert len(blocks) == EXPECTED_TOTAL_BLOCKS
-        assert len(hashes) == EXPECTED_ACTUAL_BLOCKS
+        assert len(hashes) == EXPECTED_TOTAL_BLOCKS

    # Make sure we actually touched all the blocks.
-    BLOCKS_PER_REQ = (num_tokens / block_size +
-                      scheduler.kv_cache_manager.num_preallocate_blocks)
+    BLOCKS_PER_REQ = num_tokens / block_size
    assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() ==
            num_total_blocks - num_requests * BLOCKS_PER_REQ)

@@ -1052,7 +1049,6 @@ def test_kv_connector_handles_preemption():
        block_size=BLOCK_SIZE,
        num_blocks=NUM_BLOCKS,
    )
-    scheduler.kv_cache_manager.num_preallocate_blocks = 0

    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
    scheduler.connector.get_num_new_matched_tokens = Mock(name="method")