[PREFIX CACHING FOLLOW UP] A bunch of fixes to block allocator performance when automatic prefix caching is disabled (#3357)

Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
This commit is contained in:
ElizaWszola
2024-03-20 08:11:11 +01:00
committed by GitHub
parent 20478c4d3a
commit 9474e89ba4
4 changed files with 171 additions and 127 deletions

View File

@@ -4,7 +4,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest
from vllm.core.block_manager import BlockAllocator
from vllm.core.block_manager import CachedBlockAllocator
from vllm.utils import Device
@@ -15,10 +15,7 @@ def test_block_allocator(
num_blocks: int,
):
block_hash = 1
block_allocator = BlockAllocator(Device.CPU,
block_size,
num_blocks,
enable_caching=True)
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
# Allocate two PhysicalTokenBlocks with the same hash and check
# that they are the same PhysicalTokenBlock
@@ -45,10 +42,7 @@ def test_block_allocator(
@pytest.mark.parametrize("num_blocks", [16])
def test_eviction(num_blocks: int, ):
block_size = 16
block_allocator = BlockAllocator(Device.CPU,
block_size,
num_blocks,
enable_caching=True)
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
blocks = []
for i in range(num_blocks):