[Attention] Add FlashInfer Sparse MLA backend (#33451)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2026-02-12 12:21:54 -05:00
parent 334c715e0f
commit f2c47886fd
24 changed files with 1181 additions and 408 deletions
--- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
+++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
@@ -6,7 +6,7 @@
 description: "Decode vs Prefill pipeline crossover analysis"

 # Test FlashAttn MLA
-backend: flashattn_mla
+backend: FLASH_ATTN_MLA

 # Mode: decode_vs_prefill comparison (special sweep mode)
 # For each batch spec, we'll test both decode and prefill pipelines
@@ -62,11 +62,10 @@ model:
  block_size: 128

 # Benchmark settings
-benchmark:
-  device: "cuda:0"
-  repeats: 15          # More repeats for spec decode variance
-  warmup_iters: 5
-  profile_memory: false
+device: "cuda:0"
+repeats: 15          # More repeats for spec decode variance
+warmup_iters: 5
+profile_memory: false

 # Output
 output: