Vllm CPU benchmark suite improvement (#34128)

Signed-off-by: louie-tsai <louie.tsai@intel.com>
2026-02-12 00:04:44 -08:00
parent 386bfe5d08
commit 55a1a9563a
6 changed files with 799 additions and 251 deletions
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -9,8 +9,10 @@ import json
 import os
 from dataclasses import dataclass
 from importlib import util
+from pathlib import Path

 import pandas as pd
+import regex as re

 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -275,6 +277,131 @@ def _apply_two_decimals(
    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")


+# -----------------------------
+# Export helpers (Excel + CSV)
+# -----------------------------
+def _sanitize_sheet_name(name: str) -> str:
+    """
+    Excel sheet constraints:
+      - max 31 chars
+      - cannot contain: : \ / ? * [ ]
+      - cannot be empty
+    """
+    name = "sheet" if name is None else str(name)
+    name = re.sub(r"[:\\/?*\[\]]", "_", name)
+    name = name.strip().strip("'")
+    name = re.sub(r"\s+", " ", name)
+    if not name:
+        name = "sheet"
+    return name[:31]
+
+
+def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
+    d = dict(zip(group_cols, gkey_tuple))
+    model = d.get("Model", "model")
+    model_short = str(model).split("/")[-1]
+    ilen = d.get("Input Len", "")
+    olen = d.get("Output Len", "")
+    lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
+    return _sanitize_sheet_name(f"{model_short}{lens}")
+
+
+def _write_tables_to_excel_sheet(
+    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
+):
+    startrow = 0
+    for title, df in blocks:
+        pd.DataFrame([[title]]).to_excel(
+            writer, sheet_name=sheet, index=False, header=False, startrow=startrow
+        )
+        startrow += 1
+        df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
+        startrow += len(df) + 3
+
+
+def _safe_filename(s: str) -> str:
+    s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
+    return s[:180] if len(s) > 180 else s
+
+
+# -----------------------------
+# vLLM environment export helper
+# -----------------------------
+def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
+    """Parse vllm_env.txt into a flat table (Section, Key, Value).
+
+    Supports:
+      - section headers as standalone lines (no ':' or '=')
+      - key-value lines like 'OS: Ubuntu ...'
+      - env var lines like 'HF_HOME=/data/hf'
+    """
+    lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
+    section = "General"
+    rows: list[dict] = []
+
+    def set_section(s: str):
+        nonlocal section
+        s = (s or "").strip()
+        if s:
+            section = s
+
+    for raw in lines:
+        stripped = raw.strip()
+        if not stripped:
+            continue
+        # divider lines like =====
+        if set(stripped) <= {"="}:
+            continue
+
+        # section header heuristic: short standalone line
+        if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
+            if stripped.lower().startswith("collecting environment information"):
+                continue
+            set_section(stripped)
+            continue
+
+        # env var style: KEY=VALUE (and not a URL with :)
+        if "=" in stripped and ":" not in stripped:
+            k, v = stripped.split("=", 1)
+            k = k.strip()
+            v = v.strip()
+            if k:
+                rows.append({"Section": section, "Key": k, "Value": v})
+            continue
+
+        # key: value
+        if ":" in stripped:
+            k, v = stripped.split(":", 1)
+            k = k.strip()
+            v = v.strip()
+            if k:
+                rows.append({"Section": section, "Key": k, "Value": v})
+            continue
+
+    return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
+
+
+def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
+    """Load vllm_env.txt next to the *original* input JSON file.
+
+    Note: when only one -f is provided, the script may split JSON into ./splits/...,
+    but vllm_env.txt typically lives next to the original benchmark_results.json.
+    """
+    base_dir: Path | None = None
+    if getattr(args, "file", None):
+        base_dir = Path(args.file[0]).resolve().parent
+    elif files:
+        base_dir = Path(files[0]).resolve().parent
+    if base_dir is None:
+        return None
+
+    env_path = base_dir / "vllm_env.txt"
+    if not env_path.exists():
+        return None
+    df = _parse_vllm_env_txt(env_path)
+    return df
+
+
 # -----------------------------
 # Valid max concurrency summary helpers
 # -----------------------------
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(

    summary_df = pd.DataFrame(rows)

-    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
    for c in summary_df.columns:
        if c == "Configuration":
            continue
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(

    both_col = f"Max {conc_col} (Both)"

-    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
    formatters = {}
    for c in summary_df.columns:
        if c == "Configuration":
            continue
-        # default argument binds per-column formatter correctly
        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"

    styler = summary_df.style.format(formatters)
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
    return title + styler.to_html(table_attributes='border="1" class="dataframe"')


+def build_valid_max_concurrency_summary_df(
+    tput_group_df: pd.DataFrame | None,
+    ttft_group_df: pd.DataFrame | None,
+    tpot_group_df: pd.DataFrame | None,
+    conc_col: str,
+    args,
+) -> pd.DataFrame | None:
+    if ttft_group_df is None and tpot_group_df is None:
+        return None
+
+    ttft_cols = (
+        _config_value_columns(ttft_group_df, conc_col)
+        if ttft_group_df is not None
+        else []
+    )
+    tpot_cols = (
+        _config_value_columns(tpot_group_df, conc_col)
+        if tpot_group_df is not None
+        else []
+    )
+    tput_cols = (
+        _config_value_columns(tput_group_df, conc_col)
+        if tput_group_df is not None
+        else []
+    )
+
+    if ttft_group_df is not None and tpot_group_df is not None:
+        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
+        if tput_group_df is not None:
+            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
+    else:
+        cfg_cols = ttft_cols or tpot_cols
+
+    if not cfg_cols:
+        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+    rows = []
+    for cfg in cfg_cols:
+        ttft_max = (
+            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_max = (
+            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+        both = (
+            pd.NA
+            if (pd.isna(ttft_max) or pd.isna(tpot_max))
+            else min(ttft_max, tpot_max)
+        )
+
+        tput_at_both = (
+            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
+            if tput_group_df is not None
+            else pd.NA
+        )
+        ttft_at_both = (
+            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_at_both = (
+            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+
+        rows.append(
+            {
+                "Configuration": cfg,
+                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (Both)": both,
+                "Output Tput @ Both (tok/s)": tput_at_both,
+                "TTFT @ Both (ms)": ttft_at_both,
+                "TPOT @ Both (ms)": tpot_at_both,
+            }
+        )
+
+    df = pd.DataFrame(rows)
+    for c in df.columns:
+        if c != "Configuration":
+            df[c] = pd.to_numeric(df[c], errors="coerce")
+    return df
+
+
 # -----------------------------
 # Plot helper
 # -----------------------------
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )
+
+    # ---- NEW: export options ----
+    parser.add_argument(
+        "--excel-out",
+        type=str,
+        default="perf_comparison.xlsx",
+        help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
+    )
+    parser.add_argument(
+        "--csv-out-dir",
+        type=str,
+        default="",
+        help="If set, write per-group per-metric CSVs into this directory.",
+    )
+
    return parser


@@ -657,7 +885,6 @@ def maybe_write_plot(
        markers=True,
    )

-    # Ensure plot hover + y tick labels are also 2 decimals.
    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
    fig.update_yaxes(tickformat=".2f")

@@ -730,87 +957,151 @@ def write_report_group_first(
        for metric_label, (df, _) in metric_cache.items()
    }

-    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
-        main_fh.write('<meta charset="utf-8">\n')
-        for gkey in group_keys:
-            gkey_tuple = normalize_group_key(gkey)
-            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
-            sub_path = group_filename(gkey_tuple)
-            group_header = (
-                '<div style="font-size: 1.4em; font-weight: 700; '
-                'margin: 18px 0 10px 0;">'
-                f"{_html.escape(suffix)}"
-                "</div>\n"
-            )
+    csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
+    if csv_dir:
+        csv_dir.mkdir(parents=True, exist_ok=True)

-            main_fh.write(group_header)
-            with open(sub_path, "w", encoding="utf-8") as sub_fh:
-                sub_fh.write('<meta charset="utf-8">\n')
-                sub_fh.write(group_header)
-                tput_group_df = None
-                ttft_group_df = None
-                tpot_group_df = None
-                conc_col = args.xaxis
+    excel_path = args.excel_out or "perf_comparison.xlsx"
+    with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
+        # ---- Environment sheet (first) ----
+        env_sheet = _sanitize_sheet_name("Environment")
+        env_df = _load_env_df_for_inputs(args, files)
+        if env_df is None or env_df.empty:
+            pd.DataFrame(
+                [
+                    {
+                        "Section": "Environment",
+                        "Key": "vllm_env.txt",
+                        "Value": "NOT FOUND (or empty)",
+                    }
+                ]
+            ).to_excel(xw, sheet_name=env_sheet, index=False)
+        else:
+            env_df.to_excel(xw, sheet_name=env_sheet, index=False)
+        with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+            main_fh.write('<meta charset="utf-8">\n')
+            for gkey in group_keys:
+                gkey_tuple = normalize_group_key(gkey)
+                suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+                sub_path = group_filename(gkey_tuple)
+                group_header = (
+                    '<div style="font-size: 1.4em; font-weight: 700; '
+                    'margin: 18px 0 10px 0;">'
+                    f"{_html.escape(suffix)}"
+                    "</div>\n"
+                )

-                for metric_label in plan.data_cols:
-                    gb = metric_groupbys[metric_label]
-                    df_sorted, raw_data_cols = metric_cache[metric_label]
+                main_fh.write(group_header)

-                    try:
-                        group_df = gb.get_group(gkey)
-                    except KeyError:
-                        missing = (
-                            '<div style="font-size: 1.1em; font-weight: 600; '
-                            'margin: 10px 0;">'
-                            f"{_html.escape(metric_label)} — missing for this group"
-                            "</div>\n"
+                sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
+                sheet_base = sheet
+                dedup_i = 1
+                while sheet in xw.sheets:
+                    dedup_i += 1
+                    sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
+
+                excel_blocks: list[tuple[str, pd.DataFrame]] = []
+
+                with open(sub_path, "w", encoding="utf-8") as sub_fh:
+                    sub_fh.write('<meta charset="utf-8">\n')
+                    sub_fh.write(group_header)
+                    tput_group_df = None
+                    ttft_group_df = None
+                    tpot_group_df = None
+                    conc_col = args.xaxis
+
+                    for metric_label in plan.data_cols:
+                        gb = metric_groupbys[metric_label]
+                        df_sorted, raw_data_cols = metric_cache[metric_label]
+
+                        try:
+                            group_df = gb.get_group(gkey)
+                        except KeyError:
+                            missing = (
+                                '<div style="font-size: 1.1em; font-weight: 600; '
+                                'margin: 10px 0;">'
+                                f"{_html.escape(metric_label)} — missing for this group"
+                                "</div>\n"
+                            )
+                            main_fh.write(missing)
+                            sub_fh.write(missing)
+                            continue
+
+                        if conc_col not in group_df.columns:
+                            conc_col = _find_concurrency_col(group_df)
+
+                        mn = metric_label.lower().strip()
+                        if "tok/s" in mn:
+                            tput_group_df = group_df
+                        elif "ttft" in mn:
+                            ttft_group_df = group_df
+                        elif mn in ("p99", "median") or "tpot" in mn:
+                            tpot_group_df = group_df
+
+                        display_group = group_df.drop(
+                            columns=group_cols_canonical, errors="ignore"
                        )

-                        main_fh.write(missing)
-                        sub_fh.write(missing)
-                        continue
+                        html = render_metric_table_html(
+                            display_group, metric_label, suffix, args
+                        )
+                        main_fh.write(html)
+                        sub_fh.write(html)

-                    if conc_col not in group_df.columns:
-                        conc_col = _find_concurrency_col(group_df)
+                        maybe_write_plot(
+                            main_fh,
+                            sub_fh,
+                            group_df=group_df,
+                            raw_data_cols=raw_data_cols,
+                            metric_label=metric_label,
+                            y_axis_col=y_axis_col,
+                            args=args,
+                        )

-                    mn = metric_label.lower().strip()
-                    if "tok/s" in mn:
-                        tput_group_df = group_df
-                    elif "ttft" in mn:
-                        ttft_group_df = group_df
-                    elif mn in ("p99", "median") or "tpot" in mn:
-                        tpot_group_df = group_df
+                        excel_blocks.append(
+                            (metric_label, display_group.reset_index(drop=True))
+                        )
+                        if csv_dir:
+                            fn = _safe_filename(
+                                f"{sheet}__{metric_label}".replace(" ", "_").replace(
+                                    "/", "_"
+                                )
+                            )
+                            display_group.to_csv(csv_dir / f"{fn}.csv", index=False)

-                    display_group = group_df.drop(
-                        columns=group_cols_canonical, errors="ignore"
-                    )
-
-                    html = render_metric_table_html(
-                        display_group, metric_label, suffix, args
-                    )
-                    main_fh.write(html)
-                    sub_fh.write(html)
-
-                    maybe_write_plot(
-                        main_fh,
-                        sub_fh,
-                        group_df=group_df,
-                        raw_data_cols=raw_data_cols,
-                        metric_label=metric_label,
-                        y_axis_col=y_axis_col,
+                    summary_html = build_valid_max_concurrency_summary_html(
+                        tput_group_df=tput_group_df,
+                        ttft_group_df=ttft_group_df,
+                        tpot_group_df=tpot_group_df,
+                        conc_col=conc_col,
                        args=args,
                    )
+                    if summary_html:
+                        main_fh.write(summary_html)
+                        sub_fh.write(summary_html)

-                summary_html = build_valid_max_concurrency_summary_html(
-                    tput_group_df=tput_group_df,
-                    ttft_group_df=ttft_group_df,
-                    tpot_group_df=tpot_group_df,
-                    conc_col=conc_col,
-                    args=args,
-                )
-                if summary_html:
-                    main_fh.write(summary_html)
-                    sub_fh.write(summary_html)
+                    summary_df = build_valid_max_concurrency_summary_df(
+                        tput_group_df=tput_group_df,
+                        ttft_group_df=ttft_group_df,
+                        tpot_group_df=tpot_group_df,
+                        conc_col=conc_col,
+                        args=args,
+                    )
+                    if summary_df is not None:
+                        excel_blocks.append(
+                            ("Valid Max Concurrency Summary", summary_df)
+                        )
+                        if csv_dir:
+                            fn = _safe_filename(
+                                f"{sheet}__Valid_Max_Concurrency_Summary"
+                            )
+                            summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
+
+                _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
+
+    print(f"Wrote Excel: {excel_path}")
+    if csv_dir:
+        print(f"Wrote CSVs under: {csv_dir}")


 def main():