[Attention] Support multiple attention metadata builders per kv_cache_spec + proper local attention no hybrid kv cache fix (#21588)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson
2025-08-06 21:40:52 -04:00
committed by GitHub
parent f825c6bd22
commit 1dc8a70b6d
13 changed files with 369 additions and 213 deletions

View File

@@ -417,12 +417,12 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
         return rnd_stride
     # Patch the attention backend class and re-trigger the KV cache creation.
-    for attn_backend in model_runner.attn_backends:
+    for attn_group in model_runner._attn_group_iterator():
+        attn_backend = attn_group.backend
         monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order",
                             rnd_stride_order)
-    model_runner.attn_backends = []
-    model_runner.attn_metadata_builders = []
+    model_runner.attn_groups = []
     model_runner.initialize_kv_cache(model_runner.kv_cache_config)
     # Shape is unchanged, but layout may differ