[Bugfix] Disable cross-layer KV cache for MLA attention backends (#37090)
Signed-off-by: haosdent <haosdent@gmail.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
@@ -1142,10 +1142,12 @@ class MLACommonBackend(AttentionBackend):
 
     def get_kv_cache_stride_order(
         include_num_layers_dimension: bool = False,
     ) -> tuple[int, ...]:
         # `stride_order` indicates the permutation that gets
         # us from `get_kv_cache_shape` to the actual memory layout we want.
-        # (num_blocks, num_layers, block_size, head_size)
-        return (1, 0, 2, 3) if include_num_layers_dimension else (0, 1, 2)
+        if include_num_layers_dimension:
+            # MLA kernels require contiguous per-layer KV cache views.
+            # Identity permutation keeps num_layers first in physical
+            # layout, signaling cross-layer allocation is unsupported.
+            return (0, 1, 2, 3)
+        return (0, 1, 2)
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
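
For context, here is a minimal sketch of what the stride order controls. The helper below is illustrative only, not vLLM's actual allocation code, and it assumes the logical shape from get_kv_cache_shape is (num_layers, num_blocks, block_size, head_size) when the num_layers dimension is included, which is what the removed layout comment together with the old (1, 0, 2, 3) permutation implies. The sizes are made up for the example.

import torch


def allocate_with_stride_order(
    logical_shape: tuple[int, ...],
    stride_order: tuple[int, ...],
    dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    # Allocate memory in the permuted (physical) layout, then permute the
    # view back so callers index it with the logical shape.
    physical_shape = tuple(logical_shape[d] for d in stride_order)
    physical = torch.empty(physical_shape, dtype=dtype)
    inverse = [0] * len(stride_order)
    for physical_dim, logical_dim in enumerate(stride_order):
        inverse[logical_dim] = physical_dim
    return physical.permute(inverse)


# Hypothetical sizes for illustration only.
num_layers, num_blocks, block_size, head_size = 4, 8, 16, 64
shape = (num_layers, num_blocks, block_size, head_size)

# New MLA behavior: the identity order keeps layers outermost, so every
# per-layer view is one contiguous slab, as MLA kernels require.
per_layer = allocate_with_stride_order(shape, (0, 1, 2, 3))
assert per_layer[0].is_contiguous()

# Old behavior: blocks outermost allows a single cross-layer allocation,
# but a per-layer view is then strided, which MLA kernels cannot handle.
cross_layer = allocate_with_stride_order(shape, (1, 0, 2, 3))
assert not cross_layer[0].is_contiguous()

Under these assumptions, the fix trades the single cross-layer buffer for per-layer contiguity: the identity permutation guarantees that slicing out one layer yields a dense tensor, at the cost of opting MLA backends out of cross-layer KV cache allocation.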