[BUGFIX][Mamba][Qwen3.5] Zero freed SSM cache blocks on GPU (#35219)

Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
This commit is contained in:
Vadim Gimpelson
2026-03-10 14:32:20 +04:00
committed by GitHub
parent 507ddbe992
commit 4ff8c3c8f9
10 changed files with 287 additions and 8 deletions

View File

@@ -86,6 +86,26 @@ class AttentionBackend(ABC):
) -> tuple[int, ...]:
raise NotImplementedError
@classmethod
def get_kv_cache_block_dim(
    cls,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
    cache_dtype_str: str = "auto",
) -> int:
    """Return the index of the block (num_blocks) dimension in the KV cache shape.

    Different attention backends lay out KV cache dims in different
    orders, so the block dimension is discovered empirically: probe
    ``get_kv_cache_shape`` with an improbable sentinel value for
    ``num_blocks`` and locate that sentinel in the returned shape.

    Args:
        block_size: Number of tokens per cache block.
        num_kv_heads: Number of KV attention heads.
        head_size: Per-head hidden dimension.
        cache_dtype_str: Cache dtype selector forwarded to
            ``get_kv_cache_shape`` (default ``"auto"``).

    Returns:
        The 0-based index of the num_blocks dimension in the shape
        tuple produced by ``get_kv_cache_shape``.

    Raises:
        ValueError: If the sentinel appears zero times (the backend
            does not expose num_blocks as a standalone dim) or more
            than once (ambiguous — another dim collided with the
            sentinel), so the block dim cannot be identified uniquely.
    """
    # Chosen to be wildly unlike any realistic block_size / head count /
    # head_size, so a collision with another dim is effectively impossible.
    sentinel = 1234567
    shape = cls.get_kv_cache_shape(
        sentinel,
        block_size,
        num_kv_heads,
        head_size,
        cache_dtype_str=cache_dtype_str,
    )
    # Require exactly one occurrence: plain shape.index() would silently
    # return the first match even if the sentinel appeared twice, and
    # raises an uninformative error when it is absent.
    matches = [dim for dim, size in enumerate(shape) if size == sentinel]
    if len(matches) != 1:
        raise ValueError(
            f"Cannot uniquely identify the block dimension in KV cache "
            f"shape {shape}: sentinel {sentinel} found {len(matches)} "
            f"time(s); expected exactly 1."
        )
    return matches[0]
@staticmethod
def get_kv_cache_stride_order(
include_num_layers_dimension: bool = False,