[Core] Support Local Chunked Attention for Hybrid KV Cache (#19351)

Signed-off-by: Lucia Fang <fanglu@fb.com>
Signed-off-by: Lu Fang <fanglu@meta.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lu Fang <fanglu@meta.com>
Author:    Lucia Fang
Date:      2025-07-19 11:48:38 +08:00
Committer: GitHub
Parent:    466e878f2a
Commit:    9a9fda1423

9 changed files with 351 additions and 19 deletions
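
For context: chunked local attention (as used by Llama-4-style models) restricts each query token to keys inside the same fixed-size chunk, unlike sliding-window attention, where the window moves with the query position. A minimal sketch of the resulting mask; the function and variable names below are illustrative, not vLLM APIs:

import torch

def chunked_local_mask(seq_len: int, chunk_size: int) -> torch.Tensor:
    """True where query i may attend to key j: causal and same chunk."""
    idx = torch.arange(seq_len)
    causal = idx[:, None] >= idx[None, :]  # j <= i
    same_chunk = idx[:, None] // chunk_size == idx[None, :] // chunk_size
    return causal & same_chunk

# With chunk_size=4, token 5 attends only to tokens 4 and 5; tokens 0..3
# in the previous chunk are masked out even though they are causally valid.
print(chunked_local_mask(8, 4).int())

Supporting this pattern in the hybrid KV cache lets chunked-attention layers share the cache machinery with full-attention layers.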


@@ -538,6 +538,7 @@ def use_cascade_attention(
     num_kv_heads: int,
     use_alibi: bool,
     use_sliding_window: bool,
+    use_local_attention: bool,
     num_sms: int,
 ) -> bool:
     """Decide whether to use cascade attention.
@@ -553,7 +554,7 @@ def use_cascade_attention(
     if common_prefix_len < 256:
         return False
     # Cascade attention is currently not supported with these variants.
-    if use_alibi or use_sliding_window:
+    if use_alibi or use_sliding_window or use_local_attention:
         return False
     # Too few queries. Probably not worth using cascade attention.
     # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold.
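
Why the new guard: cascade attention computes attention over a shared common prefix once for all queries and combines it with per-request suffix attention, which assumes every query can attend to the entire shared prefix. Chunked local attention breaks that assumption, since a query only sees keys in its own chunk. An illustrative check (hypothetical helper, not vLLM code):

def prefix_visible(query_pos: int, common_prefix_len: int, chunk_size: int) -> bool:
    # A query can see any of the shared prefix only if it sits in the same
    # chunk as the prefix's last token (position common_prefix_len - 1).
    return query_pos // chunk_size == (common_prefix_len - 1) // chunk_size

# chunk_size=2048, a 4096-token shared prefix, query at position 5000:
# 5000 // 2048 == 2 but 4095 // 2048 == 1, so the prefix is entirely
# invisible to the query and cascading over it would give wrong results.
print(prefix_visible(5000, 4096, 2048))  # False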


@@ -120,6 +120,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
         num_kv_heads: int,
         use_alibi: bool,
         use_sliding_window: bool,
+        use_local_attention: bool,
         num_sms: int,
     ) -> bool:
         return False
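
The default here is deliberately conservative: a metadata builder reports no cascade support unless it overrides this hook. A self-contained sketch of that contract; the subclass, its trimmed parameter list, and the reuse of the 256-token threshold are assumptions for illustration, not the actual backend code:

import abc
from typing import Generic, TypeVar

M = TypeVar("M")

class AttentionMetadataBuilder(abc.ABC, Generic[M]):
    def use_cascade_attention(
        self,
        common_prefix_len: int,
        num_kv_heads: int,
        use_alibi: bool,
        use_sliding_window: bool,
        use_local_attention: bool,
        num_sms: int,
    ) -> bool:
        # Base-class default: opt out of cascade attention entirely.
        return False

class HypotheticalFlashBuilder(AttentionMetadataBuilder[dict]):
    def use_cascade_attention(
        self,
        common_prefix_len: int,
        num_kv_heads: int,
        use_alibi: bool,
        use_sliding_window: bool,
        use_local_attention: bool,
        num_sms: int,
    ) -> bool:
        # Mirror the guard added in this commit: any masking variant that
        # breaks the shared-prefix assumption disables cascade attention.
        if use_alibi or use_sliding_window or use_local_attention:
            return False
        return common_prefix_len >= 256

builder = HypotheticalFlashBuilder()
print(builder.use_cascade_attention(4096, 8, False, False, True, 108))  # False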