Add attention benchmarking tools (#26835)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
61
benchmarks/attention_benchmarks/configs/mla_decode.yaml
Normal file
61
benchmarks/attention_benchmarks/configs/mla_decode.yaml
Normal file
@@ -0,0 +1,61 @@
|
||||
# MLA decode-only benchmark configuration
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1 # MLA uses single latent KV
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||
|
||||
batch_specs:
|
||||
# Small batches, varying sequence lengths
|
||||
- "16q1s512" # 16 requests, 512 KV cache
|
||||
- "16q1s1k" # 16 requests, 1k KV cache
|
||||
- "16q1s2k" # 16 requests, 2k KV cache
|
||||
- "16q1s4k" # 16 requests, 4k KV cache
|
||||
|
||||
# Medium batches
|
||||
- "32q1s1k" # 32 requests, 1k KV cache
|
||||
- "32q1s2k" # 32 requests, 2k KV cache
|
||||
- "32q1s4k" # 32 requests, 4k KV cache
|
||||
- "32q1s8k" # 32 requests, 8k KV cache
|
||||
|
||||
# Large batches
|
||||
- "64q1s1k" # 64 requests, 1k KV cache
|
||||
- "64q1s2k" # 64 requests, 2k KV cache
|
||||
- "64q1s4k" # 64 requests, 4k KV cache
|
||||
- "64q1s8k" # 64 requests, 8k KV cache
|
||||
|
||||
# Very large batches
|
||||
- "128q1s1k" # 128 requests, 1k KV cache
|
||||
- "128q1s2k" # 128 requests, 2k KV cache
|
||||
|
||||
# Long context
|
||||
- "32q1s16k" # 32 requests, 16k KV cache
|
||||
- "32q1s32k" # 32 requests, 32k KV cache
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
|
||||
# Backend-specific tuning
|
||||
cutlass_mla:
|
||||
num_kv_splits: auto # or specific value like 4, 8, 16
|
||||
|
||||
flashattn_mla:
|
||||
reorder_batch_threshold: 512
|
||||
|
||||
flashmla:
|
||||
reorder_batch_threshold: 1
|
||||
60
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
Normal file
60
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
Normal file
@@ -0,0 +1,60 @@
|
||||
# MLA mixed batch benchmark (prefill + decode)
|
||||
# Tests chunked prefill performance
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128
|
||||
|
||||
batch_specs:
|
||||
# Small prefill + decode
|
||||
- "1q1k_8q1s1k" # 1 prefill + 8 decode
|
||||
- "2q2k_16q1s1k" # 2 prefill + 16 decode
|
||||
- "4q1k_32q1s2k" # 4 prefill + 32 decode
|
||||
|
||||
# Medium prefill + decode
|
||||
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
|
||||
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
|
||||
- "8q2k_64q1s4k" # 8 prefill + 64 decode
|
||||
|
||||
# Large prefill + decode (chunked prefill stress test)
|
||||
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
|
||||
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
|
||||
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
|
||||
|
||||
# Context extension + decode
|
||||
# NOTE(review): rewritten from the "q<N>kv<M>" spelling to the q<N>s<M> form
# used by every other spec; assumes kv<M> meant total sequence length — confirm.
- "2q1ks2k_16q1s1k"  # 2 extend (1k query, 2k sequence) + 16 decode
- "4q2ks4k_32q1s2k"  # 4 extend (2k query, 4k sequence) + 32 decode
- "2q1ks8k_32q1s2k"  # 2 large extend (1k query, 8k sequence) + 32 decode
|
||||
|
||||
# Explicitly chunked prefill
|
||||
- "q8k" # Single 8k (8192-token) prefill. NOTE(review): the spec string carries no explicit chunking hint; presumably chunking is driven by the workspace size — confirm
|
||||
- "q16k" # Single 16k prefill. NOTE(review): the spec string carries no explicit chunking hint; presumably chunking is driven by the workspace size — confirm
|
||||
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
|
||||
|
||||
# High decode ratio (realistic serving)
|
||||
- "1q2k_63q1s1k" # 1 prefill + 63 decode
|
||||
- "2q2k_62q1s2k" # 2 prefill + 62 decode
|
||||
- "4q4k_60q1s4k" # 4 prefill + 60 decode
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
|
||||
# Analyze chunked prefill workspace size impact
|
||||
chunked_prefill:
|
||||
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
|
||||
@@ -0,0 +1,88 @@
|
||||
# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
|
||||
# Question: At what query length does the prefill pipeline become faster than the decode pipeline?
|
||||
# Methodology: For each query length, compare decode vs prefill performance to find crossover point
|
||||
# Applies to: FlashAttn MLA, FlashMLA
|
||||
|
||||
description: "Decode vs Prefill pipeline crossover analysis"
|
||||
|
||||
# Test FlashAttn MLA
|
||||
backend: flashattn_mla
|
||||
|
||||
# Mode: decode_vs_prefill comparison (special sweep mode)
|
||||
# For each batch spec, we'll test both decode and prefill pipelines
|
||||
mode: "decode_vs_prefill"
|
||||
|
||||
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
|
||||
# Each query length will be tested with BOTH decode and prefill pipelines:
|
||||
# - decode: threshold >= query_length (forces decode pipeline)
|
||||
# - prefill: threshold < query_length (forces prefill pipeline)
|
||||
#
|
||||
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
|
||||
# This tests different query lengths with fixed sequence length context
|
||||
#
|
||||
# Using batch_spec_ranges for automatic generation:
|
||||
batch_spec_ranges:
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 1
|
||||
stop: 16
|
||||
step: 1
|
||||
end_inclusive: false
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 16
|
||||
stop: 64
|
||||
step: 2
|
||||
end_inclusive: false
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 64
|
||||
stop: 1024
|
||||
step: 4
|
||||
end_inclusive: true
|
||||
|
||||
# Batch sizes to test (from old script)
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
|
||||
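# Model configuration (DeepSeek V2/V3 attention dimensions; num_layers is 10 here rather than the 60 used by the other DeepSeek configs — presumably to keep the sweep short, confirm)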
|
||||
model:
|
||||
num_layers: 10
|
||||
head_dim: 576
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
block_size: 128
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 15 # More repeats than the other configs, to average out run-to-run timing noise when comparing the decode and prefill pipelines
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Output
|
||||
output:
|
||||
csv: "reorder_threshold_results.csv"
|
||||
json: "reorder_threshold_results.json"
|
||||
|
||||
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
|
||||
# - For each batch size, find the crossover point where prefill becomes faster than decode
|
||||
# - Show decode vs prefill performance across all query lengths
|
||||
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
|
||||
# - Understand how crossover point varies with batch size
|
||||
# - Provide data-driven guidance for default threshold value
|
||||
#
|
||||
# Methodology (from old script):
|
||||
# - Each query length tested with BOTH pipelines:
|
||||
# * decode: threshold >= query_length (forces decode pipeline)
|
||||
# * prefill: threshold < query_length (forces prefill pipeline)
|
||||
# - Compare which is faster to find crossover point
|
||||
#
|
||||
@@ -0,0 +1,62 @@
|
||||
# Speculative decoding benchmark configuration
|
||||
# Tests reorder_batch_threshold optimization
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
|
||||
batch_specs:
|
||||
# Pure speculative decode (K-token verification)
|
||||
- "q2s1k" # 2-token spec, 1k KV
|
||||
- "q4s1k" # 4-token spec, 1k KV
|
||||
- "q8s1k" # 8-token spec, 1k KV
|
||||
- "q16s1k" # 16-token spec, 1k KV
|
||||
|
||||
# Speculative with different context lengths
|
||||
- "q4s2k" # 4-token spec, 2k KV
|
||||
- "q4s4k" # 4-token spec, 4k KV
|
||||
- "q8s2k" # 8-token spec, 2k KV
|
||||
- "q8s4k" # 8-token spec, 4k KV
|
||||
|
||||
# Mixed: speculative + regular decode
|
||||
- "32q4s1k" # 32 spec requests
|
||||
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
|
||||
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
|
||||
|
||||
# Mixed: speculative + prefill + decode
|
||||
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
|
||||
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
|
||||
|
||||
# Large batches with speculation
|
||||
- "64q4s1k" # 64 spec requests
|
||||
- "32q8s2k" # 32 spec (8-token)
|
||||
- "16q16s4k" # 16 spec (16-token)
|
||||
|
||||
# Backends that support query length > 1
|
||||
backends:
|
||||
- flashattn_mla # reorder_batch_threshold = 512
|
||||
- flashmla # reorder_batch_threshold = 1 (tunable)
|
||||
|
||||
# FlashInfer-MLA also supports uniform spec-as-decode, but through a different mechanism
|
||||
# - flashinfer_mla
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 10 # More repeats for statistical significance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Test these threshold values for optimization
|
||||
parameter_sweep:
|
||||
param_name: "reorder_batch_threshold"
|
||||
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
|
||||
include_auto: false
|
||||
label_format: "{backend}_threshold_{value}"
|
||||
@@ -0,0 +1,40 @@
|
||||
# Standard attention backend benchmark configuration
|
||||
|
||||
model:
|
||||
num_layers: 32
|
||||
num_q_heads: 32
|
||||
num_kv_heads: 8 # GQA with 4:1 ratio
|
||||
head_dim: 128
|
||||
block_size: 16
|
||||
|
||||
batch_specs:
|
||||
# Pure prefill
|
||||
- "q512" # Small prefill (512 tokens)
|
||||
- "q2k" # Medium prefill (2048 tokens)
|
||||
- "q4k" # Large prefill (4096 tokens)
|
||||
- "q8k" # Very large prefill (8192 tokens)
|
||||
|
||||
# Pure decode
|
||||
- "8q1s1k" # 8 requests, 1k KV cache each
|
||||
- "16q1s2k" # 16 requests, 2k KV cache each
|
||||
- "32q1s1k" # 32 requests, 1k KV cache each
|
||||
- "64q1s4k" # 64 requests, 4k KV cache each
|
||||
|
||||
# Mixed prefill/decode
|
||||
- "2q2k_8q1s1k" # 2 prefill + 8 decode
|
||||
- "4q1k_16q1s2k" # 4 prefill + 16 decode
|
||||
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
|
||||
|
||||
# Context extension
|
||||
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
|
||||
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
|
||||
|
||||
backends:
|
||||
- flash
|
||||
- triton
|
||||
- flashinfer
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: false
|
||||
Reference in New Issue
Block a user