# Speculative decoding benchmark configuration
# Tests the reorder_batch_threshold optimization

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128
  num_kv_heads: 1
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128

batch_specs:
  # Pure speculative decode (K-token verification)
  - "q2s1k"   # 2-token spec, 1k KV
  - "q4s1k"   # 4-token spec, 1k KV
  - "q8s1k"   # 8-token spec, 1k KV
  - "q16s1k"  # 16-token spec, 1k KV

  # Speculative with different context lengths
  - "q4s2k"   # 4-token spec, 2k KV
  - "q4s4k"   # 4-token spec, 4k KV
  - "q8s2k"   # 8-token spec, 2k KV
  - "q8s4k"   # 8-token spec, 4k KV

  # Mixed: speculative + regular decode
  - "32q4s1k"          # 32 spec requests
  - "16q4s1k_16q1s1k"  # 16 spec + 16 regular
  - "8q8s2k_24q1s2k"   # 8 spec (8-tok) + 24 regular

  # Mixed: speculative + prefill + decode
  - "2q1k_16q4s1k_16q1s1k"  # 2 prefill + 16 spec + 16 decode
  - "4q2k_32q4s2k_32q1s2k"  # 4 prefill + 32 spec + 32 decode

  # Large batches with speculation
  - "64q4s1k"   # 64 spec requests
  - "32q8s2k"   # 32 spec (8-token)
  - "16q16s4k"  # 16 spec (16-token)

# Backends that support query length > 1
backends:
  - FLASH_ATTN_MLA  # reorder_batch_threshold = 512
  - FLASHMLA        # reorder_batch_threshold = 1 (tunable)
  # FlashInfer-MLA also supports uniform spec-as-decode, but via a different mechanism
  # - FLASHINFER_MLA

# Benchmark settings
device: "cuda:0"
repeats: 10       # more repeats for statistical significance
warmup_iters: 5
profile_memory: false

# Threshold values to sweep for the optimization
parameter_sweep:
  param_name: "reorder_batch_threshold"
  values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
  include_auto: false
  label_format: "{backend}_threshold_{value}"
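
# --- Notes ---
# Batch-spec grammar, inferred from the inline annotations above; this is an
# assumption about the notation, not the benchmark's documented syntax:
#   [N]q<Q>s<KV>k : N requests, each verifying Q speculative query tokens
#                   against a KV cache of <KV>*1024 tokens
#                   (N defaults to 1 when omitted; q1s = regular decode)
#   [N]q<L>k      : N prefill requests with an <L>*1024-token query (no "s")
#   "_"           : joins groups into a single mixed batch
# Worked example: "8q8s2k_24q1s2k" is one batch of 8 requests verifying
# 8 spec tokens each at 2k KV, plus 24 regular decode requests at 2k KV.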
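#
# reorder_batch_threshold (hedged summary based on vLLM's v1 attention
# backends; verify against the backend source): requests whose query length
# is <= the threshold are reordered to the front of the batch and routed down
# the decode kernel path, so a speculative batch with Q <= threshold runs as
# "spec-as-decode". The sweep above probes where that routing stops paying
# off for each backend.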
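#
# Sweep labels follow label_format by simple substitution, e.g.
# "FLASH_ATTN_MLA_threshold_64" or "FLASHMLA_threshold_8"; presumably the
# harness emits one benchmark run per (backend, threshold) pair.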