[Hybrid] Mamba2 prefix cache blocks freeing for running requests (#28047)
Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
@@ -737,6 +737,14 @@ class MambaManager(SingleTypeKVCacheManager):
|
||||
)
|
||||
return super().allocate_new_blocks(request_id, num_tokens)
|
||||
|
||||
def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
|
||||
"""
|
||||
Get the number of tokens whose mamba state is no longer needed. Mamba only
|
||||
needs to keep the state of the last computed token, so we return
|
||||
num_computed_tokens - 1.
|
||||
"""
|
||||
return num_computed_tokens - 1
|
||||
|
||||
|
||||
class CrossAttentionManager(SingleTypeKVCacheManager):
|
||||
"""Manager for cross-attention KV cache in encoder-decoder models."""
|
||||
|
||||
Reference in New Issue
Block a user