[Attention] Support multiple attention metadata builders per kv_cache_spec + proper local attention no hybrid kv cache fix (#21588)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-06 21:40:52 -04:00
parent f825c6bd22
commit 1dc8a70b6d
13 changed files with 369 additions and 213 deletions
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -142,7 +142,7 @@ def get_attn_backend(
    dtype: torch.dtype,
    kv_cache_dtype: Optional[str],
    block_size: int,
-    is_attention_free: bool,
+    is_attention_free: bool = False,
    use_mla: bool = False,
 ) -> type[AttentionBackend]:
    """Selects which attention backend to use and lazily imports it."""