[Attention] Support multiple attention metadata builders per kv_cache_spec + proper local attention no hybrid kv cache fix (#21588)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-06 21:40:52 -04:00
parent f825c6bd22
commit 1dc8a70b6d
13 changed files with 369 additions and 213 deletions
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -417,12 +417,12 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
        return rnd_stride

    # Patch the attention backend class and re-trigger the KV cache creation.
-    for attn_backend in model_runner.attn_backends:
+    for attn_group in model_runner._attn_group_iterator():
+        attn_backend = attn_group.backend
        monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order",
                            rnd_stride_order)

-    model_runner.attn_backends = []
-    model_runner.attn_metadata_builders = []
+    model_runner.attn_groups = []
    model_runner.initialize_kv_cache(model_runner.kv_cache_config)

    # Shape is unchanged, but layout may differ