[Attention] Add FlashInfer Sparse MLA backend (#33451)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
This commit is contained in:
Matthew Bonanni
2026-02-12 12:21:54 -05:00
committed by GitHub
parent 334c715e0f
commit f2c47886fd
24 changed files with 1181 additions and 408 deletions

View File

@@ -41,18 +41,17 @@ batch_specs:
# Backends that support query length > 1
backends:
- flashattn_mla # reorder_batch_threshold = 512
- flashmla # reorder_batch_threshold = 1 (tunable)
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
# - flashinfer_mla
# - FLASHINFER_MLA
# Benchmark settings
benchmark:
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep: