[Attention] Refactor CUDA attention backend selection logic (#24794)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
@@ -185,9 +185,7 @@ def _make_mock_backend_for_kernel_block_size(
     supported_sizes: list[int | MultipleOf],
 ):
     class _MockBackend:
-        @staticmethod
-        def get_supported_kernel_block_size():
-            return supported_sizes
+        supported_kernel_block_sizes = supported_sizes
 
     return _MockBackend()
 
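For context, this hunk switches the test's mock from a static `get_supported_kernel_block_size()` method to a `supported_kernel_block_sizes` class attribute, matching the refactored selection logic that reads the attribute directly. Below is a minimal, self-contained sketch of how such a mock might be built and queried; `MultipleOf` is stubbed locally and `pick_kernel_block_size` is a hypothetical helper written for illustration, not vLLM's actual API.

# Illustrative sketch only: `MultipleOf` is a local stub and
# `pick_kernel_block_size` is a hypothetical helper, not vLLM code.
from dataclasses import dataclass


@dataclass(frozen=True)
class MultipleOf:
    """Stand-in for an 'any multiple of N' block-size constraint."""
    base: int


def _make_mock_backend_for_kernel_block_size(
    supported_sizes: list[int | MultipleOf],
):
    class _MockBackend:
        # Class attribute, mirroring the refactored mock in the hunk above.
        supported_kernel_block_sizes = supported_sizes

    return _MockBackend()


def pick_kernel_block_size(backend, requested: int) -> bool:
    """Return True if `requested` satisfies any declared constraint."""
    for size in backend.supported_kernel_block_sizes:
        if isinstance(size, MultipleOf):
            if requested % size.base == 0:
                return True
        elif requested == size:
            return True
    return False


if __name__ == "__main__":
    backend = _make_mock_backend_for_kernel_block_size([64, MultipleOf(16)])
    assert pick_kernel_block_size(backend, 64)       # exact match
    assert pick_kernel_block_size(backend, 32)       # multiple of 16
    assert not pick_kernel_block_size(backend, 24)   # neither constraint holds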
@@ -466,13 +464,20 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
     # This test checks if GPUModelRunner initializes correctly when an attention
     # backend enforces a non-default KV cache stride order.
     n_heads = model_runner.model_config.get_num_kv_heads(model_runner.parallel_config)
-    expected_kv_cache_shape = [
-        2,
-        NUM_BLOCKS,
-        BLOCK_SIZE,
-        n_heads,
-        model_runner.model_config.get_head_size(),
-    ]
+    head_size = model_runner.model_config.get_head_size()
+
+    # Get the expected shape from the backend's get_kv_cache_shape method
+    # to ensure compatibility with different backends (triton vs flexattention)
+    attn_backend = None
+    for attn_group in model_runner._attn_group_iterator():
+        attn_backend = attn_group.backend
+        break
+
+    assert attn_backend is not None, "No attention backend found"
+    expected_kv_cache_shape = list(
+        attn_backend.get_kv_cache_shape(NUM_BLOCKS, BLOCK_SIZE, n_heads, head_size)
+    )
+
     # TODO mla test
     default_stride = tuple(range(5))
     # Permutation that gets you back to expected kv shape
||||