[Attention] Add FlashInfer Sparse MLA backend (#33451)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2026-02-12 12:21:54 -05:00
parent 334c715e0f
commit f2c47886fd
24 changed files with 1181 additions and 408 deletions
--- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
@@ -3,7 +3,7 @@
 model:
  name: "deepseek-v3"
  num_layers: 60
-  num_q_heads: 128
+  num_q_heads: 128  # Base value; model_parameter_sweep below varies this to simulate TP
  num_kv_heads: 1  # MLA uses single latent KV
  head_dim: 576
  kv_lora_rank: 512
@@ -12,6 +12,13 @@ model:
  v_head_dim: 128
  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128

+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+
 batch_specs:
  # Small batches, varying sequence lengths
  - "16q1s512"     # 16 requests, 512 KV cache
@@ -34,28 +41,30 @@ batch_specs:
  # Very large batches
  - "128q1s1k"     # 128 requests, 1k KV cache
  - "128q1s2k"     # 128 requests, 2k KV cache
+  - "128q1s4k"     # 128 requests, 4k KV cache
+  - "128q1s8k"     # 128 requests, 8k KV cache

  # Long context
  - "32q1s16k"     # 32 requests, 16k KV cache
  - "32q1s32k"     # 32 requests, 32k KV cache

 backends:
-  - cutlass_mla
-  - flashinfer_mla
-  - flashattn_mla  # Hopper only
-  - flashmla        # Hopper only
+  - CUTLASS_MLA
+  - FLASHINFER_MLA
+  - FLASH_ATTN_MLA  # Hopper only
+  - FLASHMLA        # Hopper only

 device: "cuda:0"
-repeats: 5
-warmup_iters: 3
+repeats: 100
+warmup_iters: 10
 profile_memory: true

 # Backend-specific tuning
-cutlass_mla:
+CUTLASS_MLA:
  num_kv_splits: auto  # or specific value like 4, 8, 16

-flashattn_mla:
+FLASH_ATTN_MLA:
  reorder_batch_threshold: 512

-flashmla:
+FLASHMLA:
  reorder_batch_threshold: 1
--- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
@@ -45,10 +45,10 @@ batch_specs:
  - "4q4k_60q1s4k"          # 4 prefill + 60 decode

 backends:
-  - cutlass_mla
-  - flashinfer_mla
-  - flashattn_mla   # Hopper only
-  - flashmla        # Hopper only
+  - CUTLASS_MLA
+  - FLASHINFER_MLA
+  - FLASH_ATTN_MLA   # Hopper only
+  - FLASHMLA         # Hopper only

 device: "cuda:0"
 repeats: 5
--- a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
@@ -0,0 +1,62 @@
+# MLA prefill benchmark configuration for sparse backends (pure prefill and extend; no decode)
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128
+
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+
+batch_specs:
+  # Pure prefill
+  - "1q512"
+  - "1q1k"
+  - "1q2k"
+  - "1q4k"
+  - "1q8k"
+
+  # Batched pure prefill
+  - "2q512"
+  - "2q1k"
+  - "2q2k"
+  - "2q4k"
+  - "2q8k"
+  - "4q512"
+  - "4q1k"
+  - "4q2k"
+  - "4q4k"
+  - "4q8k"
+  - "8q512"
+  - "8q1k"
+  - "8q2k"
+  - "8q4k"
+  - "8q8k"
+
+  # Extend
+  - "1q512s4k"
+  - "1q512s8k"
+  - "1q1ks8k"
+  - "1q2ks8k"
+  - "1q2ks16k"
+  - "1q4ks16k"
+
+backends:
+  - FLASHMLA_SPARSE
+  - FLASHINFER_MLA_SPARSE
+
+device: "cuda:0"
+repeats: 10
+warmup_iters: 3
+profile_memory: true
--- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
+++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
@@ -6,7 +6,7 @@
 description: "Decode vs Prefill pipeline crossover analysis"

 # Test FlashAttn MLA
-backend: flashattn_mla
+backend: FLASH_ATTN_MLA

 # Mode: decode_vs_prefill comparison (special sweep mode)
 # For each batch spec, we'll test both decode and prefill pipelines
@@ -62,11 +62,10 @@ model:
  block_size: 128

 # Benchmark settings
-benchmark:
-  device: "cuda:0"
-  repeats: 15          # More repeats for spec decode variance
-  warmup_iters: 5
-  profile_memory: false
+device: "cuda:0"
+repeats: 15          # More repeats for spec decode variance
+warmup_iters: 5
+profile_memory: false

 # Output
 output:
--- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
@@ -41,18 +41,17 @@ batch_specs:

 # Backends that support query length > 1
 backends:
-  - flashattn_mla    # reorder_batch_threshold = 512
-  - flashmla          # reorder_batch_threshold = 1 (tunable)
+  - FLASH_ATTN_MLA    # reorder_batch_threshold = 512
+  - FLASHMLA          # reorder_batch_threshold = 1 (tunable)

 # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
-# - flashinfer_mla
+# - FLASHINFER_MLA

 # Benchmark settings
-benchmark:
-  device: "cuda:0"
-  repeats: 10  # More repeats for statistical significance
-  warmup_iters: 5
-  profile_memory: false
+device: "cuda:0"
+repeats: 10  # More repeats for statistical significance
+warmup_iters: 5
+profile_memory: false

 # Test these threshold values for optimization
 parameter_sweep:
--- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml
+++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml
@@ -36,11 +36,11 @@ batch_specs:
  - "q1ks2k"          # 1k query, 2k sequence
  - "2q1ks4k"         # 2 requests: 1k query, 4k sequence

-# Available backends: flash, triton, flashinfer
+# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
 backends:
-  - flash
-  - triton
-  - flashinfer
+  - FLASH_ATTN
+  - TRITON_ATTN
+  - FLASHINFER

 device: "cuda:0"
 repeats: 5