[Attention] Add FlashInfer Sparse MLA backend (#33451)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2026-02-12 12:21:54 -05:00
parent 334c715e0f
commit f2c47886fd
24 changed files with 1181 additions and 408 deletions
--- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
@@ -41,18 +41,17 @@ batch_specs:

 # Backends that support query length > 1
 backends:
-  - flashattn_mla    # reorder_batch_threshold = 512
-  - flashmla          # reorder_batch_threshold = 1 (tunable)
+  - FLASH_ATTN_MLA    # reorder_batch_threshold = 512
+  - FLASHMLA          # reorder_batch_threshold = 1 (tunable)

 # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
-# - flashinfer_mla
+# - FLASHINFER_MLA

 # Benchmark settings
-benchmark:
-  device: "cuda:0"
-  repeats: 10  # More repeats for statistical significance
-  warmup_iters: 5
-  profile_memory: false
+device: "cuda:0"
+repeats: 10  # More repeats for statistical significance
+warmup_iters: 5
+profile_memory: false

 # Test these threshold values for optimization
 parameter_sweep: