[Attention] Add FlashInfer Sparse MLA backend (#33451)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_q_heads: 128 # Base value, can be swept for TP simulation
|
||||
num_kv_heads: 1 # MLA uses single latent KV
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
@@ -12,6 +12,13 @@ model:
|
||||
v_head_dim: 128
|
||||
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
|
||||
|
||||
batch_specs:
|
||||
# Small batches, varying sequence lengths
|
||||
- "16q1s512" # 16 requests, 512 KV cache
|
||||
@@ -34,28 +41,30 @@ batch_specs:
|
||||
# Very large batches
|
||||
- "128q1s1k" # 128 requests, 1k KV cache
|
||||
- "128q1s2k" # 128 requests, 2k KV cache
|
||||
- "128q1s4k" # 128 requests, 4k KV cache
|
||||
- "128q1s8k" # 128 requests, 8k KV cache
|
||||
|
||||
# Long context
|
||||
- "32q1s16k" # 32 requests, 16k KV cache
|
||||
- "32q1s32k" # 32 requests, 32k KV cache
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
- CUTLASS_MLA
|
||||
- FLASHINFER_MLA
|
||||
- FLASH_ATTN_MLA # Hopper only
|
||||
- FLASHMLA # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
repeats: 100
|
||||
warmup_iters: 10
|
||||
profile_memory: true
|
||||
|
||||
# Backend-specific tuning
|
||||
cutlass_mla:
|
||||
CUTLASS_MLA:
|
||||
num_kv_splits: auto # or specific value like 4, 8, 16
|
||||
|
||||
flashattn_mla:
|
||||
FLASH_ATTN_MLA:
|
||||
reorder_batch_threshold: 512
|
||||
|
||||
flashmla:
|
||||
FLASHMLA:
|
||||
reorder_batch_threshold: 1
|
||||
|
||||
@@ -45,10 +45,10 @@ batch_specs:
|
||||
- "4q4k_60q1s4k" # 4 prefill + 60 decode
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
- CUTLASS_MLA
|
||||
- FLASHINFER_MLA
|
||||
- FLASH_ATTN_MLA # Hopper only
|
||||
- FLASHMLA # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
|
||||
62
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
Normal file
62
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
# MLA prefill-only benchmark configuration for sparse backends
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
|
||||
|
||||
batch_specs:
|
||||
# Pure prefill
|
||||
- "1q512"
|
||||
- "1q1k"
|
||||
- "1q2k"
|
||||
- "1q4k"
|
||||
- "1q8k"
|
||||
|
||||
# Batched pure prefill
|
||||
- "2q512"
|
||||
- "2q1k"
|
||||
- "2q2k"
|
||||
- "2q4k"
|
||||
- "2q8k"
|
||||
- "4q512"
|
||||
- "4q1k"
|
||||
- "4q2k"
|
||||
- "4q4k"
|
||||
- "4q8k"
|
||||
- "8q512"
|
||||
- "8q1k"
|
||||
- "8q2k"
|
||||
- "8q4k"
|
||||
- "8q8k"
|
||||
|
||||
# Extend
|
||||
- "1q512s4k"
|
||||
- "1q512s8k"
|
||||
- "1q1ks8k"
|
||||
- "1q2ks8k"
|
||||
- "1q2ks16k"
|
||||
- "1q4ks16k"
|
||||
|
||||
backends:
|
||||
- FLASHMLA_SPARSE
|
||||
- FLASHINFER_MLA_SPARSE
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 10
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
@@ -6,7 +6,7 @@
|
||||
description: "Decode vs Prefill pipeline crossover analysis"
|
||||
|
||||
# Test FlashAttn MLA
|
||||
backend: flashattn_mla
|
||||
backend: FLASH_ATTN_MLA
|
||||
|
||||
# Mode: decode_vs_prefill comparison (special sweep mode)
|
||||
# For each batch spec, we'll test both decode and prefill pipelines
|
||||
@@ -62,11 +62,10 @@ model:
|
||||
block_size: 128
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 15 # More repeats for spec decode variance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
device: "cuda:0"
|
||||
repeats: 15 # More repeats for spec decode variance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Output
|
||||
output:
|
||||
|
||||
@@ -41,18 +41,17 @@ batch_specs:
|
||||
|
||||
# Backends that support query length > 1
|
||||
backends:
|
||||
- flashattn_mla # reorder_batch_threshold = 512
|
||||
- flashmla # reorder_batch_threshold = 1 (tunable)
|
||||
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
|
||||
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
|
||||
|
||||
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
|
||||
# - flashinfer_mla
|
||||
# - FLASHINFER_MLA
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 10 # More repeats for statistical significance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
device: "cuda:0"
|
||||
repeats: 10 # More repeats for statistical significance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Test these threshold values for optimization
|
||||
parameter_sweep:
|
||||
|
||||
@@ -36,11 +36,11 @@ batch_specs:
|
||||
- "q1ks2k" # 1k query, 2k sequence
|
||||
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
|
||||
|
||||
# Available backends: flash, triton, flashinfer
|
||||
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
|
||||
backends:
|
||||
- flash
|
||||
- triton
|
||||
- flashinfer
|
||||
- FLASH_ATTN
|
||||
- TRITON_ATTN
|
||||
- FLASHINFER
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
|
||||
Reference in New Issue
Block a user