[Benchmark] Improvements to attention benchmark script (#37115)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-03-16 18:22:40 -04:00
parent e5b807607c
commit a3a51d20e7
6 changed files with 311 additions and 68 deletions
--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -213,6 +213,9 @@ class BenchmarkConfig:
    profile_memory: bool = False
    use_cuda_graphs: bool = False

+    # KV cache data type: "auto" or "fp8"
+    kv_cache_dtype: str = "auto"
+
    # MLA-specific
    prefill_backend: str | None = None
    kv_lora_rank: int | None = None
@@ -369,6 +372,7 @@ class ResultsFormatter:
                    "backend",
                    "batch_spec",
                    "num_layers",
+                    "kv_cache_dtype",
                    "mean_time",
                    "std_time",
                    "throughput",
@@ -382,6 +386,7 @@ class ResultsFormatter:
                        "backend": r.config.backend,
                        "batch_spec": r.config.batch_spec,
                        "num_layers": r.config.num_layers,
+                        "kv_cache_dtype": r.config.kv_cache_dtype,
                        "mean_time": r.mean_time,
                        "std_time": r.std_time,
                        "throughput": r.throughput_tokens_per_sec or 0,