[V1] [Hybrid] Mamba2 Automatic Prefix Caching (#25752)

Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com> Signed-off-by: Thomas Ortner <boh@zurich.ibm.com> Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Thomas Ortner <boh@zurich.ibm.com> Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-04 06:34:22 +02:00
parent 9705fba7b7
commit ea507c3a93
18 changed files with 917 additions and 147 deletions
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -233,10 +233,8 @@ class MambaSpec(KVCacheSpec):
        return page_size

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
-        # We allocate 1 block for each request now, so max_memory_usage_bytes is
-        # the same as page_size_bytes.
-        # Need to update this when supporting prefix caching.
-        return self.page_size_bytes
+        max_model_len = vllm_config.model_config.max_model_len
+        return cdiv(max_model_len, self.block_size) * self.page_size_bytes


@dataclass(frozen=True)