[Attention] Refactor CUDA attention backend selection logic (#24794)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
commit b30dfa03c5 (parent 2e78150d24)
Author: Matthew Bonanni
Date: 2025-11-11 06:40:44 -06:00
Committed by: GitHub
61 changed files with 1338 additions and 1002 deletions

@@ -185,9 +185,7 @@ def _make_mock_backend_for_kernel_block_size(
     supported_sizes: list[int | MultipleOf],
 ):
     class _MockBackend:
-        @staticmethod
-        def get_supported_kernel_block_size():
-            return supported_sizes
+        supported_kernel_block_sizes = supported_sizes
     return _MockBackend()
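
Below is a minimal, self-contained sketch of the pattern this hunk moves to: the mock backend now advertises its supported kernel block sizes as class data (supported_kernel_block_sizes) rather than via a get_supported_kernel_block_size() static method, so selection logic can read the declared sizes straight off the backend class without calling a method. The MultipleOf stand-in and the pick_kernel_block_size helper below are illustrative assumptions, not vLLM's actual implementation.

    from dataclasses import dataclass


    @dataclass(frozen=True)
    class MultipleOf:
        # Stand-in marker meaning "any block size divisible by `base` is supported".
        base: int


    def _make_mock_backend(supported_sizes):
        class _MockBackend:
            # Declared as class data, mirroring the refactor above.
            supported_kernel_block_sizes = supported_sizes

        return _MockBackend()


    def pick_kernel_block_size(backend, requested: int) -> int | None:
        # Hypothetical helper: return `requested` if the backend supports it.
        for size in backend.supported_kernel_block_sizes:
            if isinstance(size, MultipleOf):
                if requested % size.base == 0:
                    return requested
            elif size == requested:
                return requested
        return None


    if __name__ == "__main__":
        backend = _make_mock_backend([64, MultipleOf(16)])
        assert pick_kernel_block_size(backend, 64) == 64
        assert pick_kernel_block_size(backend, 48) == 48  # 48 is a multiple of 16
        assert pick_kernel_block_size(backend, 50) is None

Exposing the sizes as an attribute keeps the mock a plain data declaration, which is all the selection code needs to inspect.
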
@@ -466,13 +464,20 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
     # This test checks if GPUModelRunner initializes correctly when an attention
     # backend enforces a non-default KV cache stride order.
     n_heads = model_runner.model_config.get_num_kv_heads(model_runner.parallel_config)
-    expected_kv_cache_shape = [
-        2,
-        NUM_BLOCKS,
-        BLOCK_SIZE,
-        n_heads,
-        model_runner.model_config.get_head_size(),
-    ]
+    head_size = model_runner.model_config.get_head_size()
+    # Get the expected shape from the backend's get_kv_cache_shape method
+    # to ensure compatibility with different backends (triton vs flexattention)
+    attn_backend = None
+    for attn_group in model_runner._attn_group_iterator():
+        attn_backend = attn_group.backend
+        break
+    assert attn_backend is not None, "No attention backend found"
+    expected_kv_cache_shape = list(
+        attn_backend.get_kv_cache_shape(NUM_BLOCKS, BLOCK_SIZE, n_heads, head_size)
+    )
     # TODO mla test
     default_stride = tuple(range(5))
     # Permutation that gets you back to expected kv shape
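
For context, a hedged sketch of why the test now asks the backend for its KV cache shape rather than hard-coding [2, NUM_BLOCKS, BLOCK_SIZE, n_heads, head_size]: a backend is free to report a different dimension order, and deriving the expectation from get_kv_cache_shape keeps the assertion valid for any of them. The two backend classes below are hypothetical examples, not the real vLLM backends.

    class _DefaultLayoutBackend:
        @staticmethod
        def get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size):
            # The layout the old test hard-coded.
            return (2, num_blocks, block_size, num_kv_heads, head_size)


    class _HeadsFirstLayoutBackend:
        @staticmethod
        def get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size):
            # A backend that orders heads before the token dimension.
            return (2, num_blocks, num_kv_heads, block_size, head_size)


    def expected_shape(backend, num_blocks, block_size, num_kv_heads, head_size):
        # Mirrors the refactored test: derive the expectation from the backend.
        return list(
            backend.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size)
        )


    if __name__ == "__main__":
        assert expected_shape(_DefaultLayoutBackend, 128, 16, 8, 64) == [2, 128, 16, 8, 64]
        assert expected_shape(_HeadsFirstLayoutBackend, 128, 16, 8, 64) == [2, 128, 8, 16, 64]

With the expectation computed this way, the same test body covers both the default layout and any backend-specific stride order.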