[CPU] Split attention dispatch by head_dim alignment (#32161)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
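For context on the test change below: the new HEAD_SIZES_VEC16 list adds 80 and 112, head dims that are multiples of 16 but not of 32, so the test now also covers the narrower-alignment path. The following is a minimal, hypothetical sketch of what dispatching attention by head_dim alignment could look like; the names (dispatch_attention, attention_vec32, attention_vec16) are illustrative placeholders, not vLLM's actual CPU backend API.

# Hypothetical sketch only; kernel names are placeholders, not vLLM's real API.
import torch


def attention_vec32(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # Stand-in for a kernel that assumes head_size % 32 == 0.
    return torch.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1) @ v


def attention_vec16(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # Stand-in for a kernel that only requires head_size % 16 == 0.
    return torch.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1) @ v


def dispatch_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """Pick the widest vectorized path the head dimension's alignment allows."""
    head_size = q.shape[-1]
    if head_size % 32 == 0:   # e.g. 96, 128
        return attention_vec32(q, k, v)
    if head_size % 16 == 0:   # e.g. 80, 112
        return attention_vec16(q, k, v)
    raise ValueError(f"unsupported head_size: {head_size}")

Under this sketch, head_size=112 would route to the 16-lane path, which is the case the HEAD_SIZES_VEC16 parametrization below adds to the test matrix.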
@@ -26,6 +26,7 @@ NUM_HEADS = [
     (9, 3),
 ]
 HEAD_SIZES = [96, 128]
+HEAD_SIZES_VEC16 = [96, 80, 112, 128]
 QTYPES = [torch.bfloat16, torch.half, torch.float32]
 SLIDING_WINDOWS = [None, 256]
 NUM_BLOCKS = [
@@ -432,7 +433,7 @@ def test_varlen_with_paged_kv_normal_amx(
 
 @pytest.mark.parametrize("seq_lens", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("head_size", HEAD_SIZES_VEC16)
 @pytest.mark.parametrize("block_size", [48])
 @pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])