From f1531d9f2a60c347ea1b410bfb04850c92922de0 Mon Sep 17 00:00:00 2001 From: Stan Wozniak <77159600+s3woz@users.noreply.github.com> Date: Thu, 25 Dec 2025 21:54:06 +0100 Subject: [PATCH] [Hybrid] Mamba2 prefix cache blocks freeing for running requests (#28047) Signed-off-by: Stanislaw Wozniak Signed-off-by: Chen Zhang Co-authored-by: Chen Zhang --- vllm/v1/core/single_type_kv_cache_manager.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 4aeb17a15..e8a0a39b1 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -737,6 +737,14 @@ class MambaManager(SingleTypeKVCacheManager): ) return super().allocate_new_blocks(request_id, num_tokens) + def get_num_skipped_tokens(self, num_computed_tokens: int) -> int: + """ + Get the number of tokens whose mamba state is not needed anymore. Mamba only + needs to keep the state of the last computed token, so we return + num_computed_tokens - 1. + """ + return num_computed_tokens - 1 + class CrossAttentionManager(SingleTypeKVCacheManager): """Manager for cross-attention KV cache in encoder-decoder models."""