# MLA prefill-only benchmark configuration for sparse backends

# Base model shape used for every run. num_q_heads is also the parameter
# varied by model_parameter_sweep below.
model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128
  num_kv_heads: 1
  # NOTE(review): 576 == kv_lora_rank (512) + qk_rope_head_dim (64).
  # Presumably these three must stay in sync - confirm with the consumer.
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128

# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  # Template with {backend} and {value} placeholders; presumably yields
  # labels such as "FLASHMLA_SPARSE_64h" - confirm against the runner.
  label_format: "{backend}_{value}h"

# NOTE(review): spec strings look like <batch>q<query_len>[s<seq_len>],
# with a "k" suffix on lengths - confirm against the batch-spec parser.
batch_specs:
  # Pure prefill
  - "1q512"
  - "1q1k"
  - "1q2k"
  - "1q4k"
  - "1q8k"

  # Batched pure prefill
  - "2q512"
  - "2q1k"
  - "2q2k"
  - "2q4k"
  - "2q8k"
  - "4q512"
  - "4q1k"
  - "4q2k"
  - "4q4k"
  - "4q8k"
  - "8q512"
  - "8q1k"
  - "8q2k"
  - "8q4k"
  - "8q8k"

  # Extend
  - "1q512s4k"
  - "1q512s8k"
  - "1q1ks8k"
  - "1q2ks8k"
  - "1q2ks16k"
  - "1q4ks16k"

# Sparse MLA attention backends to benchmark.
backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE

# Run settings; exact semantics are defined by the benchmark runner.
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true