---
# MLA decode-only benchmark configuration
model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128  # Base value, can be swept for TP simulation
  num_kv_heads: 1  # MLA uses single latent KV
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128

# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"

# Batch spec format assumed to be "<requests>q1s<kv_len>" — requests x 1 query
# token, <kv_len> KV-cache length (decode-only). TODO confirm against consumer.
batch_specs:
  # Small batches, varying sequence lengths
  - "16q1s512"  # 16 requests, 512 KV cache
  - "16q1s1k"  # 16 requests, 1k KV cache
  - "16q1s2k"  # 16 requests, 2k KV cache
  - "16q1s4k"  # 16 requests, 4k KV cache
  # Medium batches
  - "32q1s1k"  # 32 requests, 1k KV cache
  - "32q1s2k"  # 32 requests, 2k KV cache
  - "32q1s4k"  # 32 requests, 4k KV cache
  - "32q1s8k"  # 32 requests, 8k KV cache
  # Large batches
  - "64q1s1k"  # 64 requests, 1k KV cache
  - "64q1s2k"  # 64 requests, 2k KV cache
  - "64q1s4k"  # 64 requests, 4k KV cache
  - "64q1s8k"  # 64 requests, 8k KV cache
  # Very large batches
  - "128q1s1k"  # 128 requests, 1k KV cache
  - "128q1s2k"  # 128 requests, 2k KV cache
  - "128q1s4k"  # 128 requests, 4k KV cache
  - "128q1s8k"  # 128 requests, 8k KV cache
  # Long context
  - "32q1s16k"  # 32 requests, 16k KV cache
  - "32q1s32k"  # 32 requests, 32k KV cache

# Attention backends to benchmark against each other
backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE

device: "cuda:0"
repeats: 100  # timed iterations per configuration
warmup_iters: 10  # untimed iterations before measurement
profile_memory: true