[Attention] Make local attention backend agnostic (#21093)
@@ -125,6 +125,21 @@ class FullAttentionSpec(AttentionSpec):
         return merged_spec
 
 
+@dataclass
+class ChunkedLocalAttentionSpec(AttentionSpec):
+    attention_chunk_size: int
+
+    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
+        max_model_len = vllm_config.model_config.max_model_len
+        return cdiv(max_model_len, self.block_size) * self.page_size_bytes
+
+    @property
+    def type_id(self) -> str:
+        return (
+            f"local_attention_{self.attention_chunk_size}_{self.block_size}_{self.page_size_bytes}"
+        )  # noqa
+
+
 @dataclass
 class SlidingWindowSpec(AttentionSpec):
     sliding_window: int
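For context, here is a minimal standalone sketch of the arithmetic behind `max_memory_usage_bytes`, with `cdiv` as vLLM's ceiling-division helper. All sizing numbers below are hypothetical and chosen only to illustrate the rounding; this is not the actual vLLM code path.

```python
def cdiv(a: int, b: int) -> int:
    # Ceiling division: number of fixed-size blocks needed to cover
    # `a` tokens when each block holds `b` tokens.
    return -(a // -b)

# Hypothetical sizing, for illustration only.
max_model_len = 8192    # model context length
block_size = 16         # tokens per KV-cache block
page_size_bytes = 65536 # bytes per KV-cache block (assumed)

# Worst-case KV-cache footprint for one request: every block the
# longest possible sequence can touch, rounded up to whole blocks.
num_blocks = cdiv(max_model_len, block_size)  # 512
print(num_blocks * page_size_bytes)           # 33554432 bytes (32 MiB)

# The type_id string encodes the chunk size together with the page
# geometry; presumably only specs whose type_id matches exactly can be
# treated as interchangeable (an assumption based on the fields included).
attention_chunk_size = 1024  # hypothetical local-attention chunk length
type_id = f"local_attention_{attention_chunk_size}_{block_size}_{page_size_bytes}"
print(type_id)  # local_attention_1024_16_65536
```

Note that the memory bound is still computed over the full `max_model_len`, the same upper bound a full-attention spec would use, rather than over `attention_chunk_size`; the chunk size enters only through `type_id`.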