[Bugfix] Disable cross-layer KV cache for MLA attention backends (#37090)
Signed-off-by: haosdent <haosdent@gmail.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
@@ -63,6 +63,9 @@ class DeepseekV32IndexerBackend(AttentionBackend):

```python
        include_num_layers_dimension: bool = False,
    ) -> tuple[int, ...]:
        if include_num_layers_dimension:
            # DeepseekV32Indexer kernels do not support cross-layer
            # KV cache layout. Identity permutation keeps num_layers
            # first, signaling incompatibility.
            return (0, 1, 2, 3)
        return (0, 1, 2)
```
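For context, here is a minimal sketch of how a caller might interpret the stride-order permutation under the convention this hunk's comment describes: keeping the num_layers dimension (index 0) in the leading position signals that the backend cannot share a cross-layer KV cache buffer. The function name `supports_cross_layer_kv_cache` and the surrounding usage are hypothetical illustrations, not vLLM API.

```python
# Hypothetical sketch -- none of these names come from vLLM; they only
# illustrate the convention described in the comment above.

def supports_cross_layer_kv_cache(stride_order: tuple[int, ...]) -> bool:
    """Interpret a KV cache stride-order permutation.

    Under the assumed convention, a backend signals cross-layer support
    by moving the num_layers dimension (index 0) out of the leading
    stride position; the identity permutation means "not supported".
    """
    return stride_order[0] != 0

# With the identity permutation (0, 1, 2, 3) returned by the backend
# above, num_layers stays the outermost (slowest-varying) dimension,
# so the cache allocator must keep per-layer buffers separate:
assert not supports_cross_layer_kv_cache((0, 1, 2, 3))
```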