[Core] support LoRA and prompt adapter in content-based hashing for Block Manager v2 prefix caching (#8240)
This commit is contained in:
@@ -5,7 +5,7 @@ from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.core.utils import create_dummy_sequence
|
||||
from tests.core.utils import create_dummy_lora_sequence, create_dummy_sequence
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
|
||||
@@ -801,6 +801,7 @@ class TestPrefixCachingBlockAllocator:
|
||||
block_size: int,
|
||||
token_ids: List[int],
|
||||
allocator: PrefixCachingBlockAllocator,
|
||||
extra_hash: Optional[int] = None,
|
||||
) -> List[PrefixCachingBlock]:
|
||||
"""Helper method which creates a chain of blocks.
|
||||
"""
|
||||
@@ -816,7 +817,9 @@ class TestPrefixCachingBlockAllocator:
|
||||
block_size:(block_number + 1) *
|
||||
block_size]
|
||||
prev_block = allocator.allocate_immutable_block(
|
||||
prev_block=prev_block, token_ids=block_token_ids)
|
||||
prev_block=prev_block,
|
||||
token_ids=block_token_ids,
|
||||
extra_hash=extra_hash)
|
||||
blocks.append(prev_block)
|
||||
|
||||
return blocks
|
||||
@@ -931,3 +934,61 @@ class TestComputedBlocksTracker:
|
||||
allocator.mark_blocks_as_computed([])
|
||||
|
||||
assert tracker.get_num_cached_tokens(seq) == len(tokens)
|
||||
|
||||
@staticmethod
|
||||
def test_correct_extra_hash():
|
||||
"""
|
||||
Test that the block hash is correctly computed based on the extra hash,
|
||||
ensuring it matches the allocator's block hash, specifically for the
|
||||
LoRA case, and that the correct number of cached tokens is retrieved.
|
||||
"""
|
||||
block_size = 4
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type="prefix_caching",
|
||||
num_gpu_blocks=16,
|
||||
num_cpu_blocks=16,
|
||||
block_size=block_size,
|
||||
)
|
||||
gpu_allocator = allocator._allocators[Device.GPU]
|
||||
|
||||
tracker = ComputedBlocksTracker(
|
||||
allocator=allocator,
|
||||
block_size=block_size,
|
||||
enable_caching=True,
|
||||
)
|
||||
|
||||
tokens = list(range(block_size * 4))
|
||||
|
||||
# Create a dummy LoRA sequence with a specific LoRA ID.
|
||||
lora_seq = create_dummy_lora_sequence(request_id=0,
|
||||
token_ids=tokens,
|
||||
block_size=block_size,
|
||||
lora_int_id=1)
|
||||
|
||||
_ = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=tokens,
|
||||
allocator=gpu_allocator,
|
||||
extra_hash=lora_seq.extra_hash(),
|
||||
)
|
||||
|
||||
allocator.mark_blocks_as_computed([])
|
||||
|
||||
# Create different dummy sequences that have the same token IDs
|
||||
# but different LoRA IDs.
|
||||
seq = create_dummy_sequence(request_id=1,
|
||||
token_ids=tokens,
|
||||
block_size=block_size)
|
||||
|
||||
different_lora_seq = create_dummy_lora_sequence(request_id=2,
|
||||
token_ids=tokens,
|
||||
block_size=block_size,
|
||||
lora_int_id=2)
|
||||
|
||||
# Due to the different LoRA IDs, corresponding blocks are not cached.
|
||||
assert tracker.get_num_cached_tokens(seq) == 0
|
||||
assert tracker.get_num_cached_tokens(different_lora_seq) == 0
|
||||
|
||||
# The number of cached tokens matches the length of the tokens
|
||||
# for the cached LoRA sequence.
|
||||
assert tracker.get_num_cached_tokens(lora_seq) == len(tokens)
|
||||
|
||||
Reference in New Issue
Block a user