Enable V1 for Hybrid SSM/Attention Models (#20016)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Author: Thomas Parnell
Date: 2025-07-04 19:46:53 +02:00
Committed by: GitHub
Commit: 2f35a022e6 (parent ffe00ef77a)
14 changed files with 399 additions and 134 deletions


@@ -159,6 +159,7 @@ class SlidingWindowSpec(AttentionSpec):
 class MambaSpec(KVCacheSpec):
     shapes: tuple[tuple[int, ...], ...]
     dtype: torch.dtype
+    page_size_padded: Optional[int] = None
 
     def __post_init__(self):
         self.num_elements = sum(prod(shape) for shape in self.shapes)
@@ -169,7 +170,11 @@ class MambaSpec(KVCacheSpec):
 
     @property
     def page_size_bytes(self) -> int:
-        return self.num_elements * get_dtype_size(self.dtype)
+        page_size = self.num_elements * get_dtype_size(self.dtype)
+        if self.page_size_padded is not None:
+            assert self.page_size_padded >= page_size
+            return self.page_size_padded
+        return page_size
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         # We allocate 1 block for each request now, so max_memory_usage_bytes is
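The new page_size_padded field lets a Mamba layer's KV-cache page report a larger size than its state strictly needs, so that SSM and attention layers in a hybrid model can share one uniform page size. Below is a minimal standalone sketch of that idea, not vLLM's actual code: the class name, the dtype_size helper, and all tensor shapes and byte counts are made-up stand-ins, and the real computation of the padded value happens elsewhere in the engine.

# Minimal standalone sketch (NOT vLLM's actual code): illustrate why a padded
# page size helps hybrid SSM/attention models. The cache manager wants every
# layer type to use the same page size, so the (usually smaller) Mamba state
# page is padded up to the attention page size.
from dataclasses import dataclass
from math import prod
from typing import Optional


def dtype_size(dtype: str) -> int:
    # Hypothetical stand-in for vLLM's get_dtype_size().
    return {"float16": 2, "bfloat16": 2, "float32": 4}[dtype]


@dataclass
class MambaSpecSketch:
    shapes: tuple          # per-layer state tensor shapes (conv state, SSM state)
    dtype: str
    page_size_padded: Optional[int] = None

    @property
    def page_size_bytes(self) -> int:
        # Same logic as the diff above: pad only upwards, never below the real size.
        page_size = sum(prod(s) for s in self.shapes) * dtype_size(self.dtype)
        if self.page_size_padded is not None:
            assert self.page_size_padded >= page_size
            return self.page_size_padded
        return page_size


# Made-up example numbers: one attention page (K and V for 16 tokens,
# 8 heads, head size 128) vs. one Mamba layer's conv + SSM state.
attn_page_bytes = 2 * 16 * 8 * 128 * dtype_size("bfloat16")
mamba = MambaSpecSketch(shapes=((8, 4, 128), (8, 128, 16)), dtype="bfloat16")
print(mamba.page_size_bytes)    # 40960: unpadded Mamba page
mamba.page_size_padded = max(mamba.page_size_bytes, attn_page_bytes)
print(mamba.page_size_bytes)    # 65536: padded to match the attention page

The assert mirrors the diff: padding may only grow a page, never shrink it below what the Mamba state actually requires.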