Enable V1 for Hybrid SSM/Attention Models (#20016)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Author: Thomas Parnell
Date: 2025-07-04 19:46:53 +02:00
Committed by: GitHub
Commit: 2f35a022e6 (parent ffe00ef77a)
14 changed files with 399 additions and 134 deletions


@@ -159,6 +159,7 @@ class SlidingWindowSpec(AttentionSpec):
 class MambaSpec(KVCacheSpec):
     shapes: tuple[tuple[int, ...], ...]
     dtype: torch.dtype
+    page_size_padded: Optional[int] = None
 
     def __post_init__(self):
         self.num_elements = sum(prod(shape) for shape in self.shapes)
@@ -169,7 +170,11 @@ class MambaSpec(KVCacheSpec):
 
     @property
     def page_size_bytes(self) -> int:
-        return self.num_elements * get_dtype_size(self.dtype)
+        page_size = self.num_elements * get_dtype_size(self.dtype)
+        if self.page_size_padded is not None:
+            assert self.page_size_padded >= page_size
+            return self.page_size_padded
+        return page_size
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         # We allocate 1 block for each request now, so max_memory_usage_bytes is
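The new page_size_padded field lets a Mamba layer's KV-cache page report a larger size than its state strictly needs, so that SSM and attention layers in a hybrid model can share one uniform page size. Below is a minimal standalone sketch of that idea, not vLLM's actual code: the class name, the dtype_size helper, and all tensor shapes and byte counts are made-up stand-ins, and the real computation of the padded value happens elsewhere in the engine.

# Minimal standalone sketch (NOT vLLM's actual code): illustrate why a padded
# page size helps hybrid SSM/attention models. The cache manager wants every
# layer type to use the same page size, so the (usually smaller) Mamba state
# page is padded up to the attention page size.
from dataclasses import dataclass
from math import prod
from typing import Optional


def dtype_size(dtype: str) -> int:
    # Hypothetical stand-in for vLLM's get_dtype_size().
    return {"float16": 2, "bfloat16": 2, "float32": 4}[dtype]


@dataclass
class MambaSpecSketch:
    shapes: tuple          # per-layer state tensor shapes (conv state, SSM state)
    dtype: str
    page_size_padded: Optional[int] = None

    @property
    def page_size_bytes(self) -> int:
        # Same logic as the diff above: pad only upwards, never below the real size.
        page_size = sum(prod(s) for s in self.shapes) * dtype_size(self.dtype)
        if self.page_size_padded is not None:
            assert self.page_size_padded >= page_size
            return self.page_size_padded
        return page_size


# Made-up example numbers: one attention page (K and V for 16 tokens,
# 8 heads, head size 128) vs. one Mamba layer's conv + SSM state.
attn_page_bytes = 2 * 16 * 8 * 128 * dtype_size("bfloat16")
mamba = MambaSpecSketch(shapes=((8, 4, 128), (8, 128, 16)), dtype="bfloat16")
print(mamba.page_size_bytes)    # 40960: unpadded Mamba page
mamba.page_size_padded = max(mamba.page_size_bytes, attn_page_bytes)
print(mamba.page_size_bytes)    # 65536: padded to match the attention page

The assert mirrors the diff: padding may only grow a page, never shrink it below what the Mamba state actually requires.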