[PREFIX CACHING FOLLOW UP] A bunch of fixes to block allocator performance when automatic prefix caching is disabled (#3357)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
This commit is contained in:
@@ -4,7 +4,7 @@ from typing import List
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.block import PhysicalTokenBlock
|
||||
from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager,
|
||||
from vllm.core.block_manager import (UncachedBlockAllocator, BlockSpaceManager,
|
||||
AllocStatus)
|
||||
from vllm.utils import Device
|
||||
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
|
||||
@@ -15,7 +15,8 @@ from .utils import create_dummy_prompt
|
||||
def test_block_allocator_allocate():
|
||||
block_size = 4
|
||||
num_cpu_blocks = 4
|
||||
cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)
|
||||
cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
|
||||
num_cpu_blocks)
|
||||
|
||||
# Allocate all available cpu blocks.
|
||||
num_free = num_cpu_blocks
|
||||
@@ -24,7 +25,7 @@ def test_block_allocator_allocate():
|
||||
block = cpu_allocator.allocate()
|
||||
num_free -= 1
|
||||
|
||||
assert block.block_hash not in cpu_allocator.evictor
|
||||
assert block not in cpu_allocator.free_blocks
|
||||
assert cpu_allocator.get_num_free_blocks() == num_free
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
@@ -34,14 +35,15 @@ def test_block_allocator_allocate():
|
||||
def test_block_allocator_free():
|
||||
block_size = 4
|
||||
num_cpu_blocks = 4
|
||||
cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)
|
||||
cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
|
||||
num_cpu_blocks)
|
||||
|
||||
# Allocate all available cpu blocks.
|
||||
blocks: List[PhysicalTokenBlock] = []
|
||||
for _ in range(num_cpu_blocks):
|
||||
block = cpu_allocator.allocate()
|
||||
blocks.append(block)
|
||||
assert block.block_hash not in cpu_allocator.evictor
|
||||
assert block not in cpu_allocator.free_blocks
|
||||
|
||||
# Free all allocated cpu blocks.
|
||||
num_free = 0
|
||||
@@ -49,7 +51,7 @@ def test_block_allocator_free():
|
||||
for block in blocks:
|
||||
cpu_allocator.free(block)
|
||||
num_free += 1
|
||||
assert block.block_hash in cpu_allocator.evictor
|
||||
assert block in cpu_allocator.free_blocks
|
||||
assert cpu_allocator.get_num_free_blocks() == num_free
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
|
||||
@@ -4,7 +4,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from vllm.core.block_manager import BlockAllocator
|
||||
from vllm.core.block_manager import CachedBlockAllocator
|
||||
from vllm.utils import Device
|
||||
|
||||
|
||||
@@ -15,10 +15,7 @@ def test_block_allocator(
|
||||
num_blocks: int,
|
||||
):
|
||||
block_hash = 1
|
||||
block_allocator = BlockAllocator(Device.CPU,
|
||||
block_size,
|
||||
num_blocks,
|
||||
enable_caching=True)
|
||||
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
|
||||
|
||||
# Allocate two PhysicalTokenBlocks with the same hash and check
|
||||
# that they are the same PhysicalTokenBlock
|
||||
@@ -45,10 +42,7 @@ def test_block_allocator(
|
||||
@pytest.mark.parametrize("num_blocks", [16])
|
||||
def test_eviction(num_blocks: int, ):
|
||||
block_size = 16
|
||||
block_allocator = BlockAllocator(Device.CPU,
|
||||
block_size,
|
||||
num_blocks,
|
||||
enable_caching=True)
|
||||
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
|
||||
blocks = []
|
||||
|
||||
for i in range(num_blocks):
|
||||
|
||||
Reference in New Issue
Block a user