diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 01295e848..ea4ec8a62 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -194,7 +194,7 @@ def test_prefill_plp(): all_token_ids = common_token_ids + unique_token_ids req0 = make_request("0", all_token_ids, prompt_logprobs=5) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(manager.req_to_block_hashes[req0.request_id]) == 3 + assert len(manager.req_to_block_hashes[req0.request_id]) == 0 assert not computed_blocks.blocks assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, computed_blocks) @@ -256,7 +256,7 @@ def test_prefill_plp(): common_token_ids + unique_token_ids, prompt_logprobs=5) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert len(manager.req_to_block_hashes[req2.request_id]) == 3 + assert len(manager.req_to_block_hashes[req2.request_id]) == 0 assert not computed_blocks.blocks assert num_computed_tokens == 0 blocks = manager.allocate_slots(req2, 55, computed_blocks) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 9e172b6bd..8ef8143d1 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -126,8 +126,11 @@ class KVCacheManager: - A list of blocks that are computed for the request. - The number of computed tokens. """ - if not self.enable_caching: - # Prefix caching is disabled. + + # Skip prefix caching when it is disabled or + # when the request requires prompt logprobs.
+ if (not self.enable_caching + or request.sampling_params.prompt_logprobs is not None): return KVCacheBlocks.create_empty(), 0 # The block hashes for the request may already be computed @@ -141,9 +144,6 @@ class KVCacheManager: if self.log_stats: assert self.prefix_cache_stats is not None self.prefix_cache_stats.requests += 1 - # When the request requires prompt logprobs, we skip prefix caching. - if request.sampling_params.prompt_logprobs is not None: - return KVCacheBlocks.create_empty(), 0 if len(block_hashes) * self.block_size == request.num_tokens: # When prompt length is divisible by the block size and all