[Core] Support Local Chunked Attention for Hybrid KV Cache (#19351)

Signed-off-by: Lucia Fang <fanglu@fb.com>
Signed-off-by: Lu Fang <fanglu@meta.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lu Fang <fanglu@meta.com>
Author:    Lucia Fang
Date:      2025-07-19 11:48:38 +08:00
Committer: GitHub
Parent:    466e878f2a
Commit:    9a9fda1423

9 changed files with 351 additions and 19 deletions
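
For context: chunked local attention (as used by Llama-4-style models) restricts each query token to keys inside the same fixed-size chunk, unlike sliding-window attention, where the window moves with the query position. A minimal sketch of the resulting mask; the function and variable names below are illustrative, not vLLM APIs:

import torch

def chunked_local_mask(seq_len: int, chunk_size: int) -> torch.Tensor:
    """True where query i may attend to key j: causal and same chunk."""
    idx = torch.arange(seq_len)
    causal = idx[:, None] >= idx[None, :]  # j <= i
    same_chunk = idx[:, None] // chunk_size == idx[None, :] // chunk_size
    return causal & same_chunk

# With chunk_size=4, token 5 attends only to tokens 4 and 5; tokens 0..3
# in the previous chunk are masked out even though they are causally valid.
print(chunked_local_mask(8, 4).int())

Supporting this pattern in the hybrid KV cache lets chunked-attention layers share the cache machinery with full-attention layers.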


@@ -538,6 +538,7 @@ def use_cascade_attention(
     num_kv_heads: int,
     use_alibi: bool,
     use_sliding_window: bool,
+    use_local_attention: bool,
     num_sms: int,
 ) -> bool:
     """Decide whether to use cascade attention.
@@ -553,7 +554,7 @@ def use_cascade_attention(
     if common_prefix_len < 256:
         return False
     # Cascade attention is currently not supported with these variants.
-    if use_alibi or use_sliding_window:
+    if use_alibi or use_sliding_window or use_local_attention:
         return False
     # Too few queries. Probably not worth using cascade attention.
     # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold.
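
Why the new guard: cascade attention computes attention over a shared common prefix once for all queries and combines it with per-request suffix attention, which assumes every query can attend to the entire shared prefix. Chunked local attention breaks that assumption, since a query only sees keys in its own chunk. An illustrative check (hypothetical helper, not vLLM code):

def prefix_visible(query_pos: int, common_prefix_len: int, chunk_size: int) -> bool:
    # A query can see any of the shared prefix only if it sits in the same
    # chunk as the prefix's last token (position common_prefix_len - 1).
    return query_pos // chunk_size == (common_prefix_len - 1) // chunk_size

# chunk_size=2048, a 4096-token shared prefix, query at position 5000:
# 5000 // 2048 == 2 but 4095 // 2048 == 1, so the prefix is entirely
# invisible to the query and cascading over it would give wrong results.
print(prefix_visible(5000, 4096, 2048))  # False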


@@ -120,6 +120,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
         num_kv_heads: int,
         use_alibi: bool,
         use_sliding_window: bool,
+        use_local_attention: bool,
         num_sms: int,
     ) -> bool:
         return False
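
The default here is deliberately conservative: a metadata builder reports no cascade support unless it overrides this hook. A self-contained sketch of that contract; the subclass, its trimmed parameter list, and the reuse of the 256-token threshold are assumptions for illustration, not the actual backend code:

import abc
from typing import Generic, TypeVar

M = TypeVar("M")

class AttentionMetadataBuilder(abc.ABC, Generic[M]):
    def use_cascade_attention(
        self,
        common_prefix_len: int,
        num_kv_heads: int,
        use_alibi: bool,
        use_sliding_window: bool,
        use_local_attention: bool,
        num_sms: int,
    ) -> bool:
        # Base-class default: opt out of cascade attention entirely.
        return False

class HypotheticalFlashBuilder(AttentionMetadataBuilder[dict]):
    def use_cascade_attention(
        self,
        common_prefix_len: int,
        num_kv_heads: int,
        use_alibi: bool,
        use_sliding_window: bool,
        use_local_attention: bool,
        num_sms: int,
    ) -> bool:
        # Mirror the guard added in this commit: any masking variant that
        # breaks the shared-prefix assumption disables cascade attention.
        if use_alibi or use_sliding_window or use_local_attention:
            return False
        return common_prefix_len >= 256

builder = HypotheticalFlashBuilder()
print(builder.use_cascade_attention(4096, 8, False, False, True, 108))  # False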