feat(attention): extract KV-cache update from FlexAttention backend (#36263)

Signed-off-by: cong-or <conchubhar.gannon@gmail.com>
Author: cong-or
Date: 2026-03-09 03:40:12 +00:00
Committed by: GitHub
Parent: d62856b928
Commit: 747431044d


@@ -82,6 +82,8 @@ class FlexAttentionBackend(AttentionBackend):
     ]
     supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto", "bfloat16"]
+    forward_includes_kv_cache_update: bool = False
+
     @staticmethod
     def get_name() -> str:
         return "FLEX_ATTENTION"
@@ -827,6 +829,29 @@ class FlexAttentionImpl(AttentionImpl):
         assert tensor.ndim == 3
         return tensor[None, :, :, :]
 
+    def do_kv_cache_update(
+        self,
+        layer: torch.nn.Module,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> None:
+        if self.attn_type == AttentionType.ENCODER_ONLY:
+            return
+
+        key_cache, value_cache = kv_cache.unbind(0)
+        torch.ops._C_cache_ops.reshape_and_cache_flash(
+            key,
+            value,
+            key_cache,
+            value_cache,
+            slot_mapping,
+            self.kv_cache_dtype,
+            layer._k_scale,
+            layer._v_scale,
+        )
+
     def forward(
         self,
         layer: torch.nn.Module,
@@ -908,17 +933,6 @@ class FlexAttentionImpl(AttentionImpl):
             assert self.attn_type == AttentionType.DECODER
             key_cache, value_cache = kv_cache.unbind(0)
-            torch.ops._C_cache_ops.reshape_and_cache_flash(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
             # View out the block_size dim
             key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size)
             value_cache = value_cache.view(-1, self.num_kv_heads, self.head_size)
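
With the cache write removed from forward(), updating the KV cache and running attention become two separate calls on the impl. A rough usage sketch, assuming the caller already holds the layer module, the packed kv_cache tensor, and the attention metadata; the exact forward() argument list beyond what this diff shows is an assumption.

# Illustrative only: call the extracted do_kv_cache_update() first, then the
# (now cache-free) forward(); argument order for forward() is assumed.
impl.do_kv_cache_update(
    layer,
    key,
    value,
    kv_cache,
    attn_metadata.slot_mapping,
)
output = impl.forward(layer, query, key, value, kv_cache, attn_metadata)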