[V1][Metrics] add support for kv event publishing (#16750)
Signed-off-by: alec-flowers <aflowers@nvidia.com> Signed-off-by: Mark McLoughlin <markmc@redhat.com> Co-authored-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
@@ -6,6 +6,7 @@ from typing import Optional
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import sha256
|
||||
@@ -48,9 +49,10 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
|
||||
num_blocks=num_blocks,
|
||||
tensors={},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(['layer'],
|
||||
FullAttentionSpec(block_size, 1, 1, torch.float32,
|
||||
False))
|
||||
KVCacheGroupSpec(
|
||||
["layer"],
|
||||
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
|
||||
)
|
||||
],
|
||||
)
|
||||
|
||||
@@ -783,6 +785,60 @@ def test_prefix_cache_stats_disabled():
|
||||
assert manager.prefix_cache_stats is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
def test_kv_cache_events(blocks_to_cache: int):
    """Check the KV cache events produced by allocate, re-allocate and reset.

    The manager is created with prefix caching and KV cache events enabled,
    and with one more block than ``blocks_to_cache``. The test then walks
    through three phases, draining the events with ``take_events()`` after
    each one:

    1. Allocate ``blocks_to_cache`` full blocks for a first request.
    2. Free that request and allocate the same tokens for a second request.
    3. Free the second request and reset the prefix cache.
    """
    block_size = 16
    total_blocks = blocks_to_cache + 1
    token_count = block_size * blocks_to_cache

    manager = KVCacheManager(
        make_kv_cache_config(block_size, total_blocks),
        max_model_len=8192,
        enable_caching=True,
        enable_kv_cache_events=True,
    )
    block_pool = manager.block_pool

    # Phase 1: allocate blocks.
    # The last event is expected to be a single "block stored" event that
    # carries blocks_to_cache block hashes, and take_events() should leave
    # the pool's kv_event_queue empty.
    first_request = make_request("0", list(range(token_count)))
    manager.allocate_slots(first_request, token_count)
    events = manager.take_events()

    stored_event = events[-1]
    stored_hash_count = len(stored_event.block_hashes)
    cached_count = len(block_pool.cached_block_hash_to_block)
    assert stored_hash_count == blocks_to_cache
    assert blocks_to_cache == cached_count
    expected_token_count = stored_event.block_size * len(
        stored_event.block_hashes)
    assert len(stored_event.token_ids) == expected_token_count
    assert len(block_pool.kv_event_queue) == 0

    first_round_hashes = stored_event.block_hashes

    # Phase 2: remove the blocks and send another request.
    # Expect blocks_to_cache "block removed" events followed by one new
    # "block stored" event.
    manager.free(first_request)
    second_request = make_request("1", list(range(token_count)))
    manager.allocate_slots(second_request, token_count)
    events = manager.take_events()

    # Every event before the last one must refer to a hash stored in phase 1.
    assert all(event.block_hashes[0] in first_round_hashes
               for event in events[:-1])
    assert len(events) == blocks_to_cache + 1
    assert isinstance(events[-2], BlockRemoved)
    restored_hash_count = len(events[-1].block_hashes)
    cached_count = len(block_pool.cached_block_hash_to_block)
    assert restored_hash_count == blocks_to_cache
    assert blocks_to_cache == cached_count

    # Phase 3: all blocks cleared.
    # Resetting the prefix cache should end with an "all blocks cleared"
    # event and leave no cached block hashes behind.
    manager.free(second_request)
    manager.reset_prefix_cache()
    events = manager.take_events()

    assert isinstance(events[-1], AllBlocksCleared)
    assert len(manager.block_pool.cached_block_hash_to_block) == 0
|
||||
|
||||
|
||||
def test_eagle_enabled_removes_last_block():
|
||||
"""Verify Eagle does NOT remove blocks when request
|
||||
length is divisible by block size."""
|
||||
|
||||
Reference in New Issue
Block a user