Add attention benchmarking tools (#26835)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,40 @@
# Standard attention backend benchmark configuration

model:
  num_layers: 32
  num_q_heads: 32
  num_kv_heads: 8  # GQA with 4:1 ratio
  head_dim: 128
  block_size: 16

batch_specs:
  # Pure prefill
  - "q512"  # Small prefill (512 tokens)
  - "q2k"  # Medium prefill (2048 tokens)
  - "q4k"  # Large prefill (4096 tokens)
  - "q8k"  # Very large prefill (8192 tokens)

  # Pure decode
  - "8q1s1k"  # 8 requests, 1k KV cache each
  - "16q1s2k"  # 16 requests, 2k KV cache each
  - "32q1s1k"  # 32 requests, 1k KV cache each
  - "64q1s4k"  # 64 requests, 4k KV cache each

  # Mixed prefill/decode
  - "2q2k_8q1s1k"  # 2 prefill + 8 decode
  - "4q1k_16q1s2k"  # 4 prefill + 16 decode
  - "2q4k_32q1s1k"  # 2 large prefill + 32 decode

  # Context extension
  - "q1ks2k"  # 1k query, 2k sequence (chunked prefill)
  - "2q1ks4k"  # 2 requests: 1k query, 4k sequence

backends:
  - flash
  - triton
  - flashinfer

device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: false
Reference in New Issue
Block a user