vllm/.buildkite/performance-benchmarks/scripts/compare-json-results.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

import argparse
import html as _html
import json
import os
from contextlib import nullcontext
from dataclasses import dataclass
from importlib import util
from pathlib import Path

import pandas as pd

# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
# (The float formatter is configured once here; it renders every float with
# two decimals wherever pandas uses the display options.)
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")

# plotly is an optional dependency. find_spec() on a dotted name imports the
# parent package first, so it raises (instead of returning None) when plotly
# itself is missing; treat that as "not available" rather than crashing.
try:
    plotly_found = util.find_spec("plotly.express") is not None
except (ImportError, ValueError):
    plotly_found = False

# Columns that identify one benchmark point; used as alignment keys and as the
# leading columns of the comparison tables.
DEFAULT_INFO_COLS = [
    "Model",
    "Dataset Name",
    "Input Len",
    "Output Len",
    #    "TP Size",
    #    "PP Size",
    "# of max concurrency.",
    "qps",
]


# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
    for c in [
        "# of max concurrency.",
        "# of max concurrency",
        "Max Concurrency",
        "max_concurrency",
        "Concurrency",
    ]:
        if c in df.columns:
            return c

    for c in df.columns:
        if "concurr" in str(c).lower():
            s = df[c]
            if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
                return c

    raise ValueError(
        "Cannot infer concurrency column. "
        "Please rename the column to one of the known names "
        "or add an explicit override (e.g., --concurrency-col)."
    )


def _normalize_concurrency_in_df(
    df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
    if canonical in df.columns:
        return df
    detected = _find_concurrency_col(df)
    if detected in df.columns and detected != canonical:
        return df.rename(columns={detected: canonical})
    df[canonical] = pd.NA
    return df


# -----------------------------
# Core data compare
# -----------------------------
def compare_data_columns(
    files: list[str],
    name_column: str,
    data_column: str,
    info_cols: list[str],
    drop_column: str,
    debug: bool = False,
):
    """
    Build one side-by-side comparison table for ``data_column`` across files.

    Rows are aligned by keys derived from info_cols instead of row order.
    - Each file is parsed once and its concurrency column is normalized to
      "# of max concurrency.".
    - Pick one canonical key list: subset of info_cols present in ALL files
      (falling back to the subset present in the first file).
    - For each file: set index to those keys, aggregate duplicates
      (mean for metric, first for names/meta).
    - Align every series on the UNION of keys (missing points become NaN),
      concat along axis=1, then reset_index so callers can group by columns.
    - From the second file on, add a "Ratio 1 vs N" column comparing file N
      with the first file: base / current when the metric name contains
      "P99" or "Median", current / base otherwise. Rows whose base value is
      0 are masked.
    - Rows with NaN in ``drop_column`` are dropped only for latency-like
      metrics (name contains ttft/tpot/p99/median); throughput rows may have
      NaN in P99/Median columns even if the column exists in the JSON.
    - If debug, add a <file_label>_name column per file (from name_column).

    Args:
        files: paths of JSON files readable by ``pd.read_json(orient="records")``.
        name_column: column holding the test name (used only with ``debug``).
        data_column: metric column to compare.
        info_cols: candidate key/identity columns, in display order.
        drop_column: column whose NaN rows are dropped for latency metrics.
        debug: include per-file name columns in the output.

    Returns:
        ``(concat_df, raw_data_cols)`` where ``concat_df`` has the info columns
        first and ``raw_data_cols`` lists the per-file metric column labels.

    Raises:
        ValueError: a file cannot be read, no key columns are available, or
            no input produced any data.
    """
    print("\ncompare_data_column:", data_column)

    frames = []
    raw_data_cols: list[str] = []

    # Parse and normalize every input exactly once; the same frames are used
    # both to pick the key columns and to build the aligned series below.
    loaded: list[pd.DataFrame] = []
    for f in files:
        try:
            df_tmp = pd.read_json(f, orient="records")
        except Exception as err:
            raise ValueError(f"Failed to read {f}") from err
        loaded.append(
            _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
        )

    # Determine key cols after normalizing concurrency
    cols_per_file: list[set] = [set(d.columns) for d in loaded]

    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
    if not key_cols:
        key_cols = [c for c in info_cols if c in cols_per_file[0]]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )

    # BUGFIX: only drop rows for latency-like metrics; throughput rows may have
    # NaN in P99/Median columns even if the column exists in the JSON.
    # (Depends only on data_column, so it is decided once for all files.)
    metric_lc = str(data_column).lower()
    is_latency_metric = any(
        tag in metric_lc for tag in ("ttft", "tpot", "p99", "median")
    )

    union_index = None
    metas: list[pd.DataFrame] = []
    staged: list[tuple[str, pd.Series, pd.Series | None]] = []

    for file, df in zip(files, loaded):
        if is_latency_metric and drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

        # Force numeric key columns to a numeric dtype so keys from different
        # files compare equal when aligned.
        for c in (
            "Input Len",
            "Output Len",
            "TP Size",
            "PP Size",
            "# of max concurrency.",
            "qps",
        ):
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")

        # Only reachable via the first-file fallback for key_cols.
        for c in key_cols:
            if c not in df.columns:
                df[c] = pd.NA

        df_idx = df.set_index(key_cols, drop=False)

        meta = df_idx[key_cols]
        if not meta.index.is_unique:
            meta = meta.groupby(level=key_cols, dropna=False).first()

        # Label the file by its directory path, or by its basename when the
        # path has no directory part.
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)

        if data_column in df_idx.columns:
            s = df_idx[data_column]
            if not s.index.is_unique:
                s = s.groupby(level=key_cols, dropna=False).mean()
        else:
            # keep NA series to preserve meta keys for union_index
            s = pd.Series(pd.NA, index=meta.index)
        s.name = file_label

        name_s = None
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"

        if union_index is None:
            union_index = meta.index
        else:
            union_index = union_index.union(meta.index)
        metas.append(meta)

        staged.append((file_label, s, name_s))

    if union_index is None:
        raise ValueError("No data found after loading inputs.")

    # meta first (union-aligned): build UNION meta across all files
    if metas:
        meta_union = pd.concat(metas, axis=0)
        # Collapse duplicates on the MultiIndex; keep first non-null per column
        meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
        frames.append(meta_union.reindex(union_index))

    # values + ratios (union-aligned)
    metric_series_aligned: list[pd.Series] = []
    for file_label, s, name_s in staged:
        s_aligned = s.reindex(union_index)
        frames.append(s_aligned)
        raw_data_cols.append(file_label)
        metric_series_aligned.append(s_aligned)

        if debug and name_s is not None:
            frames.append(name_s.reindex(union_index))

        if len(metric_series_aligned) >= 2:
            base = metric_series_aligned[0]
            current = metric_series_aligned[-1]
            # Ratio > 1 always means "current is better than base".
            if "P99" in str(data_column) or "Median" in str(data_column):
                ratio = base / current
            else:
                ratio = current / base
            ratio = ratio.mask(base == 0)
            ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
            frames.append(ratio)

    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)

    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]

    print(raw_data_cols)
    return concat_df, raw_data_cols


# -----------------------------
# Split helper
# -----------------------------
def split_json_by_tp_pp(
    input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
    """Split one results JSON into one file per (TP Size, PP Size) pair.

    The input may be a list of records or a dict wrapping such a list under
    "results", "serving_results", "benchmarks" or "data". When a test-name
    column exists, only rows whose name contains "serving" (case-insensitive)
    are kept. Missing or non-numeric TP/PP sizes default to 1.

    Each group is written to ``<output_root>/tp<T>_pp<P>/benchmark_results.json``.

    Returns:
        The written file paths, in group order.
    """
    with open(input_file, encoding="utf-8") as fh:
        payload = json.load(fh)

    # Unwrap {"results": [...]}-style containers (first matching key wins).
    if isinstance(payload, dict):
        wrapper_keys = ("results", "serving_results", "benchmarks", "data")
        wrapper = next(
            (k for k in wrapper_keys if isinstance(payload.get(k), list)), None
        )
        if wrapper is not None:
            payload = payload[wrapper]

    table = pd.DataFrame(payload)

    test_name_col = None
    for candidate in ("Test name", "test_name", "Test Name"):
        if candidate in table.columns:
            test_name_col = candidate
            break
    if test_name_col:
        names = table[test_name_col].astype(str)
        is_serving = names.str.contains(r"serving", case=False, na=False)
        table = table[is_serving].copy()

    aliases = {
        "tp_size": "TP Size",
        "tensor_parallel_size": "TP Size",
        "pp_size": "PP Size",
        "pipeline_parallel_size": "PP Size",
    }
    present_aliases = {
        old: new for old, new in aliases.items() if old in table.columns
    }
    table = table.rename(columns=present_aliases)

    for size_col in ("TP Size", "PP Size"):
        if size_col not in table.columns:
            table[size_col] = 1
        numeric = pd.to_numeric(table[size_col], errors="coerce")
        table[size_col] = numeric.fillna(1).astype(int)

    written: list[str] = []
    grouped = table.groupby(["TP Size", "PP Size"], dropna=False)
    for (tp, pp), part in grouped:
        out_dir = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, "benchmark_results.json")
        part.to_json(out_path, orient="records", indent=2, force_ascii=False)
        print(f"Saved: {out_path}")
        written.append(out_path)

    return written


# -----------------------------
# Styling helpers
# -----------------------------
def _highlight_threshold(
    df: pd.DataFrame,
    threshold: float,
    slack_pct: float = 0.0,
) -> pd.io.formats.style.Styler:
    """Return a Styler for ``df`` that colors metric cells against an SLA.

    Styled columns are the numeric columns that are neither identity columns
    (Model, Dataset Name, Input Len, Output Len, concurrency) nor "Ratio..."
    columns. A value <= ``threshold`` is green; a value above it but within
    ``threshold * (1 + slack_pct / 100)`` is orange; anything else (including
    NaN) is left unstyled. An unparsable ``slack_pct`` is treated as 0.
    """
    identity_cols = (
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        _find_concurrency_col(df),
    )
    metric_cols = [
        col
        for col in df.columns
        if col not in identity_cols
        and not str(col).startswith("Ratio")
        and pd.api.types.is_numeric_dtype(df[col])
    ]

    try:
        slack_pct = float(slack_pct or 0.0)
    except Exception:
        slack_pct = 0.0
    relaxed_limit = threshold * (1.0 + slack_pct / 100.0)

    strict_css = "background-color:#e6ffe6;font-weight:bold;"
    relaxed_css = "background-color:#ffe5cc;font-weight:bold;"

    def _css_for(value):
        if pd.isna(value):
            return ""
        if value <= threshold:
            # Meets the SLA outright.
            return strict_css
        # Over the SLA: acceptable only inside the slack band.
        return relaxed_css if value <= relaxed_limit else ""

    return df.style.map(_css_for, subset=metric_cols)


def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
    """Shade every column whose name contains "ratio" (case-insensitive).

    Both the data cells and the matching level-0 column headers get a yellow
    background. The styler is returned unchanged when there is no such column.
    """
    columns = list(styler.data.columns)
    ratio_positions = [
        pos for pos, col in enumerate(columns) if "ratio" in str(col).lower()
    ]
    if not ratio_positions:
        return styler

    ratio_cols = [columns[pos] for pos in ratio_positions]
    row_count = len(styler.data)

    def _shade_column(_column):
        # One CSS entry per row of the column being styled.
        return ["background-color: #fff3b0"] * row_count

    styler = styler.apply(_shade_column, subset=ratio_cols, axis=0)

    header_styles = []
    for pos in ratio_positions:
        header_styles.append(
            {
                "selector": f"th.col_heading.level0.col{pos}",
                "props": [("background-color", "#fff3b0")],
            }
        )
    # overwrite=False keeps table styles that were set earlier.
    return styler.set_table_styles(header_styles, overwrite=False)


def _apply_two_decimals(
    styler: pd.io.formats.style.Styler,
) -> pd.io.formats.style.Styler:
    df = styler.data
    num_cols = df.select_dtypes("number").columns
    if len(num_cols) == 0:
        return styler
    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")


# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
    """
    Excel sheet constraints:
      - max 31 chars
      - cannot contain: : \ / ? * [ ]
      - cannot be empty

    NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
    module's compile overhead/edge-cases on some systems.
    """
    name = "sheet" if name is None else str(name)

    # Replace illegal characters with underscore.
    trans = str.maketrans(
        {
            ":": "_",
            "\\": "_",
            "/": "_",
            "?": "_",
            "*": "_",
            "[": "_",
            "]": "_",
        }
    )
    name = name.translate(trans)

    # Strip quotes/spaces and collapse whitespace.
    name = name.strip().strip("'")
    name = " ".join(name.split())

    if not name:
        name = "sheet"
    return name[:31]


def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
    """Derive a sheet name of the form ``<model leaf>_<in>x<out>`` for a group.

    ``group_cols`` and ``gkey_tuple`` are paired positionally. The length
    suffix is added only when both "Input Len" and "Output Len" are present;
    the model name (last "/"-separated part) is cut so the suffix still fits
    into 31 characters. The result goes through ``_sanitize_sheet_name``.
    """
    fields = dict(zip(group_cols, gkey_tuple))

    # Always keep input/output lengths (these are important).
    in_len = fields.get("Input Len", "")
    out_len = fields.get("Output Len", "")
    if in_len != "" and out_len != "":
        suffix = f"_{in_len}x{out_len}"
    else:
        suffix = ""

    # Shorten model name aggressively to make room for the suffix.
    model_leaf = str(fields.get("Model", "model")).rsplit("/", 1)[-1]
    model_budget = max(1, 31 - len(suffix))

    return _sanitize_sheet_name(model_leaf[:model_budget] + suffix)


def _write_tables_to_excel_sheet(
    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
    """Write all ``(title, table)`` blocks to ``sheet`` in one to_excel() call.

    Pandas+openpyxl can be extremely slow when called many times per sheet, so
    the blocks are stacked into a single table whose first column, "Section",
    carries each block's title. An empty ``blocks`` list writes an empty sheet.
    """
    if blocks:
        labelled: list[pd.DataFrame] = []
        for section_title, table in blocks:
            part = table.copy()  # never touch the caller's frame
            part.insert(0, "Section", section_title)
            labelled.append(part)
        sheet_df = pd.concat(labelled, axis=0, ignore_index=True, sort=False)
    else:
        sheet_df = pd.DataFrame()

    sheet_df.to_excel(writer, sheet_name=sheet, index=False)


def _safe_filename(s: str) -> str:
    # Fast path without the third-party `regex` module.
    s = " ".join(str(s).strip().split())
    allowed = []
    for ch in s:
        if ch.isalnum() or ch in "._-":
            allowed.append(ch)
        else:
            allowed.append("_")
    out = "".join(allowed)
    return out[:180] if len(out) > 180 else out


# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
    """Parse vllm_env.txt into a flat table (Section, Key, Value).

    Supports:
      - section headers as standalone lines (no ':' or '=')
      - key-value lines like 'OS: Ubuntu ...'
      - env var lines like 'HF_HOME=/data/hf'
    """
    lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
    section = "General"
    rows: list[dict] = []

    def set_section(s: str):
        nonlocal section
        s = (s or "").strip()
        if s:
            section = s

    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            continue
        # divider lines like =====
        if set(stripped) <= {"="}:
            continue

        # section header heuristic: short standalone line
        if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
            if stripped.lower().startswith("collecting environment information"):
                continue
            set_section(stripped)
            continue

        # env var style: KEY=VALUE (and not a URL with :)
        if "=" in stripped and ":" not in stripped:
            k, v = stripped.split("=", 1)
            k = k.strip()
            v = v.strip()
            if k:
                rows.append({"Section": section, "Key": k, "Value": v})
            continue

        # key: value
        if ":" in stripped:
            k, v = stripped.split(":", 1)
            k = k.strip()
            v = v.strip()
            if k:
                rows.append({"Section": section, "Key": k, "Value": v})
            continue

    return pd.DataFrame(rows, columns=["Section", "Key", "Value"])


def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
    """Load vllm_env.txt next to the *original* input JSON file.

    Note: when only one -f is provided, the script may split JSON into ./splits/...,
    but vllm_env.txt typically lives next to the original benchmark_results.json.
    """
    base_dir: Path | None = None
    if getattr(args, "file", None):
        base_dir = Path(args.file[0]).resolve().parent
    elif files:
        base_dir = Path(files[0]).resolve().parent
    if base_dir is None:
        return None

    env_path = base_dir / "vllm_env.txt"
    if not env_path.exists():
        return None
    df = _parse_vllm_env_txt(env_path)
    return df


# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
    key_cols = [
        c
        for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
        if c in df.columns
    ]
    exclude = set(key_cols + [conc_col, "qps", "QPS"])

    cols: list[str] = []
    for c in df.columns:
        if c in exclude:
            continue
        lc = str(c).lower()
        if lc.startswith("ratio"):
            continue
        if lc.endswith("_name") or lc == "test name" or lc == "test_name":
            continue
        if pd.api.types.is_numeric_dtype(df[c]):
            cols.append(c)
    return cols


def _max_concurrency_ok(
    df: pd.DataFrame,
    conc_col: str,
    cfg_col: str,
    threshold: float,
    slack_pct: float = 0.0,
):
    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
        return pd.NA

    d = df[[conc_col, cfg_col]].copy()
    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
    d = d.dropna(subset=[conc_col, cfg_col])

    if d.empty:
        return pd.NA

    # Accept values up to (1 + slack_pct%) above the SLA.
    try:
        slack_pct = float(slack_pct or 0.0)
    except Exception:
        slack_pct = 0.0
    effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)

    ok = d[d[cfg_col] <= effective_limit]
    if ok.empty:
        return pd.NA

    return ok[conc_col].max()


def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
    if (
        df is None
        or conc_col not in df.columns
        or cfg_col not in df.columns
        or pd.isna(conc_value)
    ):
        return pd.NA

    d = df[[conc_col, cfg_col]].copy()
    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")

    conc_value = pd.to_numeric(conc_value, errors="coerce")
    if pd.isna(conc_value):
        return pd.NA

    hit = d[d[conc_col] == conc_value]
    if hit.empty:
        return pd.NA
    return hit[cfg_col].iloc[0]


def build_valid_max_concurrency_summary_html(
    tput_group_df: pd.DataFrame | None,
    ttft_group_df: pd.DataFrame | None,
    tpot_group_df: pd.DataFrame | None,
    conc_col: str,
    args,
) -> str:
    if ttft_group_df is None and tpot_group_df is None:
        return ""

    ttft_cols = (
        _config_value_columns(ttft_group_df, conc_col)
        if ttft_group_df is not None
        else []
    )
    tpot_cols = (
        _config_value_columns(tpot_group_df, conc_col)
        if tpot_group_df is not None
        else []
    )
    tput_cols = (
        _config_value_columns(tput_group_df, conc_col)
        if tput_group_df is not None
        else []
    )

    if ttft_group_df is not None and tpot_group_df is not None:
        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
        if tput_group_df is not None:
            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
    else:
        cfg_cols = ttft_cols or tpot_cols

    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

    # Display SLA ranges in the table header (SLA .. SLA*(1+slack))
    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"

    rows = []
    for cfg in cfg_cols:
        ttft_max = (
            _max_concurrency_ok(
                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
            )
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
            _max_concurrency_ok(
                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
            )
            if tpot_group_df is not None
            else pd.NA
        )
        both = (
            pd.NA
            if (pd.isna(ttft_max) or pd.isna(tpot_max))
            else min(ttft_max, tpot_max)
        )

        tput_at_both = (
            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
            if tput_group_df is not None
            else pd.NA
        )
        ttft_at_both = (
            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_at_both = (
            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
            if tpot_group_df is not None
            else pd.NA
        )

        rows.append(
            {
                "Configuration": cfg,
                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
                "TPOT @ Both (ms)": tpot_at_both,
            }
        )

    summary_df = pd.DataFrame(rows)

    for c in summary_df.columns:
        if c == "Configuration":
            continue
        summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")

    both_col = f"Max {conc_col} (Both)"

    formatters = {}
    for c in summary_df.columns:
        if c == "Configuration":
            continue
        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"

    styler = summary_df.style.format(formatters)

    def _green(v):
        return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""

    if both_col in summary_df.columns:
        styler = styler.map(_green, subset=[both_col])

    title = (
        '<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
        "Valid Max Concurrency Summary"
        "</div>\n"
    )
    return title + styler.to_html(table_attributes='border="1" class="dataframe"')


def build_valid_max_concurrency_summary_df(
    tput_group_df: pd.DataFrame | None,
    ttft_group_df: pd.DataFrame | None,
    tpot_group_df: pd.DataFrame | None,
    conc_col: str,
    args,
) -> pd.DataFrame | None:
    if ttft_group_df is None and tpot_group_df is None:
        return None

    ttft_cols = (
        _config_value_columns(ttft_group_df, conc_col)
        if ttft_group_df is not None
        else []
    )
    tpot_cols = (
        _config_value_columns(tpot_group_df, conc_col)
        if tpot_group_df is not None
        else []
    )
    tput_cols = (
        _config_value_columns(tput_group_df, conc_col)
        if tput_group_df is not None
        else []
    )

    if ttft_group_df is not None and tpot_group_df is not None:
        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
        if tput_group_df is not None:
            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
    else:
        cfg_cols = ttft_cols or tpot_cols

    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"

    rows = []
    for cfg in cfg_cols:
        ttft_max = (
            _max_concurrency_ok(
                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
            )
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
            _max_concurrency_ok(
                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
            )
            if tpot_group_df is not None
            else pd.NA
        )
        both = (
            pd.NA
            if (pd.isna(ttft_max) or pd.isna(tpot_max))
            else min(ttft_max, tpot_max)
        )

        tput_at_both = (
            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
            if tput_group_df is not None
            else pd.NA
        )
        ttft_at_both = (
            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_at_both = (
            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
            if tpot_group_df is not None
            else pd.NA
        )

        rows.append(
            {
                "Configuration": cfg,
                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
                "TPOT @ Both (ms)": tpot_at_both,
            }
        )

    df = pd.DataFrame(rows)
    for c in df.columns:
        if c != "Configuration":
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df


# -----------------------------
# Plot helper
# -----------------------------
def _add_limit_line(fig, y_value: float, label: str):
    """Draw a dashed horizontal reference line at ``y_value`` on ``fig``.

    The line is red when ``label`` contains "ttft" (case-insensitive) and blue
    otherwise. When plotly is available, an empty dashed trace named ``label``
    is added as well so the limit gets a legend entry.
    """
    color = "red" if "ttft" in label.lower() else "blue"

    fig.add_hline(
        y=y_value,
        line_dash="dash",
        line_color=color,
        annotation_text=f"{label}: {y_value} ms",
        annotation_position="top left",
    )
    if not plotly_found:
        return

    import plotly.graph_objects as go

    # A trace without points: draws nothing, but shows up in the legend.
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        mode="lines",
        line={"dash": "dash", "color": color},
        name=label,
    )
    fig.add_trace(legend_entry)


# -----------------------------
# Refactored main + group-first report
# -----------------------------
@dataclass(frozen=True)
class MetricPlan:
    """Immutable description of which metric columns a report compares."""

    # Metric column names, compared one after another (see choose_metrics).
    data_cols: list[str]
    # Column passed to compare_data_columns as its drop_column argument.
    drop_column: str


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the comparison script.

    Options cover the input files, debug/plot switches, the X axis column,
    the latency statistic, TTFT/TPOT SLA limits with their slack percentages,
    and the Excel/CSV export targets.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    add("-f", "--file", action="append", type=str, help="input file name")
    add("--debug", action="store_true", help="show all information for debugging")
    add(
        "--plot",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="plot perf diagrams or not --no-plot --plot",
    )
    add(
        "-x",
        "--xaxis",
        type=str,
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    add(
        "-l",
        "--latency",
        type=str,
        default="p99",
        help="take median|p99 for latency like TTFT/TPOT",
    )

    # ---- SLA limits, then SLA tolerance (slack) options ----
    float_options = (
        ("--ttft-max-ms", 3000.0, "Reference limit for TTFT plots (ms)"),
        ("--tpot-max-ms", 100.0, "Reference limit for TPOT plots (ms)"),
        ("--ttft-slack-pct", 5.0, "Allowed percentage above TTFT SLA (default: 5)."),
        ("--tpot-slack-pct", 5.0, "Allowed percentage above TPOT SLA (default: 5)."),
    )
    for flag, default, help_text in float_options:
        add(flag, type=float, default=default, help=help_text)

    # ---- export options ----
    add(
        "--excel-out",
        type=str,
        default="perf_comparison.xlsx",
        help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
    )
    add(
        "--csv-out-dir",
        type=str,
        default="",
        help="If set, write per-group per-metric CSVs into this directory.",
    )

    return parser


def choose_metrics(latency: str) -> MetricPlan:
    """Pick the metric columns for the requested latency statistic.

    Any ``latency`` containing "median" (case-insensitive) selects the Median
    columns; everything else, including None/empty, selects the P99 columns.
    The plan's ``drop_column`` is "P99" in both cases.
    """
    wants_median = "median" in (latency or "").lower()
    if wants_median:
        latency_cols = ["Median TTFT (ms)", "Median"]
    else:
        latency_cols = ["P99 TTFT (ms)", "P99"]

    return MetricPlan(
        data_cols=["Output Tput (tok/s)", *latency_cols],
        drop_column="P99",
    )


def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
    """Resolve the files to compare and the info columns that apply to them.

    With several ``-f`` inputs they are used as given. A single input is split
    per (TP Size, PP Size) into ``splits/`` and those two columns are removed
    from ``info_cols``.

    Raises:
        ValueError: if no input file was provided.
    """
    inputs = args.file
    if not inputs:
        raise ValueError("No input files provided. Use -f/--file.")

    if len(inputs) != 1:
        return inputs, info_cols

    split_files = split_json_by_tp_pp(inputs[0], output_root="splits")
    remaining_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
    return split_files, remaining_cols


def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
    """Return the info column to plot against.

    ``xaxis`` is used when it is one of ``info_cols``. Otherwise the fallback
    is, in order: the entry at the legacy position 6 (where the concurrency
    column sat when TP/PP sizes were part of the list), the canonical
    "# of max concurrency." column, and finally the last info column. The
    later fallbacks keep shorter lists from failing with an out-of-range index.

    Raises:
        IndexError: if ``info_cols`` is empty.
    """
    if xaxis in info_cols:
        return xaxis

    legacy_index = 6
    if len(info_cols) > legacy_index:
        return info_cols[legacy_index]

    canonical = "# of max concurrency."
    if canonical in info_cols:
        return canonical

    if not info_cols:
        raise IndexError("info_cols is empty; cannot choose an axis column")
    return info_cols[-1]


def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
    """Return the group-by columns: the first four info columns found in the table.

    Raises:
        ValueError: if none of those columns exists in ``output_df``.
    """
    filtered_info_cols = info_cols[:4]
    present = [col for col in filtered_info_cols if col in output_df.columns]
    if present:
        return present

    raise ValueError(
        f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
        f"but DataFrame has: {list(output_df.columns)}"
    )


def normalize_group_key(name):
    """Return a group key as a tuple, wrapping any non-tuple key in a 1-tuple."""
    if isinstance(name, tuple):
        return name
    return (name,)


def group_filename(name, prefix: str = "perf_comparison_") -> str:
    """Build the per-group HTML file name ``<prefix><key parts>.html``.

    Key parts are joined with "_"; commas inside a part also become "_" and
    every "/" becomes "-".
    """
    key_parts = normalize_group_key(name)
    joined = ",".join(str(part) for part in key_parts)
    # Single pass: "," -> "_" and "/" -> "-".
    safe = joined.translate(str.maketrans({",": "_", "/": "-"}))
    return f"{prefix}{safe}.html"


def build_group_suffix(group_cols: list[str], name) -> str:
    """Describe a group as "col : [ value ]" pairs joined by " , "."""
    key_parts = normalize_group_key(name)
    labelled = []
    for column, value in zip(group_cols, key_parts):
        labelled.append(f"{column} : [ {value} ] ")
    return " , ".join(labelled)


def render_metric_table_html(
    display_group: pd.DataFrame,
    metric_label: str,
    group_suffix: str,
    args,
) -> str:
    """Render one metric table of a group as an HTML fragment.

    The fragment is a title ``<div>`` (metric label and group description,
    HTML-escaped) followed by the table. TTFT metrics are highlighted against
    the TTFT SLA, metrics whose label contains tpot/median/p99 against the
    TPOT SLA, and other metrics get no SLA highlighting. Numbers are shown
    with two decimals and ratio columns are shaded.
    """
    heading = f"{_html.escape(metric_label)} — {_html.escape(group_suffix)}"
    title = (
        '<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
        + heading
        + "</div>\n"
    )

    # Choose the SLA (limit, slack) pair that applies to this metric, if any.
    label_lc = metric_label.lower()
    if "ttft" in label_lc:
        sla = (args.ttft_max_ms, args.ttft_slack_pct)
    elif any(tag in label_lc for tag in ("tpot", "median", "p99")):
        sla = (args.tpot_max_ms, args.tpot_slack_pct)
    else:
        sla = None

    if sla is None:
        styler = display_group.style
    else:
        styler = _highlight_threshold(display_group, *sla)

    styler = highlight_ratio_columns(_apply_two_decimals(styler))

    return title + styler.to_html(table_attributes='border="1" class="dataframe"')


def maybe_write_plot(
    main_fh,
    sub_fh,
    group_df: pd.DataFrame,
    raw_data_cols: list[str],
    metric_label: str,
    y_axis_col: str,
    args,
):
    """Write a Plotly line chart for one metric/group to both report files.

    Does nothing unless ``args.plot`` is truthy and plotly was found at
    import time. The selected columns are sorted by ``y_axis_col`` and melted
    so that each remaining column becomes one "Configuration" line.
    """
    if not args.plot or not plotly_found:
        return

    # Imported lazily so the script still works without plotly installed.
    import plotly.express as px

    ordered = group_df[raw_data_cols].sort_values(by=y_axis_col)
    long_form = ordered.melt(
        id_vars=y_axis_col,
        var_name="Configuration",
        value_name=metric_label,
    )

    fig = px.line(
        long_form,
        x=y_axis_col,
        y=metric_label,
        color="Configuration",
        title=f"{metric_label} vs {y_axis_col}",
        markers=True,
    )
    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
    fig.update_yaxes(tickformat=".2f")

    lowered = metric_label.lower()
    if "ttft" in lowered:
        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
    elif any(tag in lowered for tag in ("tpot", "median", "p99")):
        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")

    page = fig.to_html(full_html=True, include_plotlyjs="cdn")
    for handle in (main_fh, sub_fh):
        handle.write(page)


def build_group_keys(
    df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
):
    if sort_cols:
        df = df.sort_values(by=sort_cols)
    gb = df.groupby(group_cols, dropna=False)
    return [k for k, _ in gb]


def write_report_group_first(
    files: list[str], info_cols: list[str], plan: MetricPlan, args
):
    """Write the comparison report, iterating groups first and metrics second.

    For every metric in ``plan.data_cols`` the comparison table is computed
    once and cached. The group keys of the first metric then drive the output:
    for each group, all metric tables (and optional plots) are written to
    ``perf_comparison.html`` and to a per-group HTML file, followed by the
    valid-max-concurrency summary.

    Outputs:
        * ``perf_comparison.html`` and one HTML file per group in the current
          working directory.
        * An Excel workbook at ``args.excel_out`` (default
          ``perf_comparison.xlsx``) with an "Environment" sheet followed by
          one sheet per group, unless ``VLLM_COMPARE_DISABLE_EXCEL=1``.
        * One CSV per group/metric under ``args.csv_out_dir`` when it is set.

    Args:
        files: Input result files to compare.
        info_cols: Info column names; the first four are group-by candidates.
        plan: Metric plan providing ``data_cols`` and ``drop_column``.
        args: Parsed CLI arguments.

    Raises:
        ValueError: If no group column is shared by all metrics.
    """
    name_column = "Test name"
    y_axis_col = get_y_axis_col(info_cols, args.xaxis)

    print("comparing : " + ", ".join(files))

    # metric label -> (comparison table sorted by x-axis, raw data columns)
    metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
    group_cols_canonical: list[str] | None = None

    for metric_label in plan.data_cols:
        output_df, raw_data_cols = compare_data_columns(
            files,
            name_column,
            metric_label,
            info_cols,
            plan.drop_column,
            debug=args.debug,
        )

        raw_data_cols = list(raw_data_cols)
        raw_data_cols.insert(0, y_axis_col)

        # Keep only the group columns that every metric table provides.
        group_cols = get_group_cols(output_df, info_cols)
        if group_cols_canonical is None:
            group_cols_canonical = group_cols
        else:
            group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]

        metric_cache[metric_label] = (
            output_df.sort_values(by=args.xaxis),
            raw_data_cols,
        )

    if not group_cols_canonical:
        raise ValueError("No canonical group columns found across metrics.")

    # The first metric decides which groups are reported and in which order.
    first_metric = plan.data_cols[0]
    first_df_sorted, _ = metric_cache[first_metric]
    group_keys = build_group_keys(
        first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
    )

    metric_groupbys = {
        metric_label: df.groupby(group_cols_canonical, dropna=False)
        for metric_label, (df, _) in metric_cache.items()
    }

    csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
    if csv_dir:
        csv_dir.mkdir(parents=True, exist_ok=True)

    excel_path = args.excel_out or "perf_comparison.xlsx"
    disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"

    # Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
    excel_engine = (
        os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
    )
    if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
        excel_engine = "openpyxl"

    excel_engine_kwargs = {}
    if excel_engine == "xlsxwriter":
        # Reduce memory pressure & usually faster writes.
        excel_engine_kwargs = {"options": {"constant_memory": True}}

    # A null context yields ``None`` so the Excel steps can be skipped below.
    xw_ctx = (
        nullcontext(None)
        if disable_excel
        else pd.ExcelWriter(
            excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
        )
    )
    with xw_ctx as xw:
        used_sheets: set[str] = set()
        # ---- Environment sheet (first) ----
        env_sheet = _sanitize_sheet_name("Environment")
        env_df = _load_env_df_for_inputs(args, files)
        if xw is not None:
            if env_df is None or env_df.empty:
                pd.DataFrame(
                    [
                        {
                            "Section": "Environment",
                            "Key": "vllm_env.txt",
                            "Value": "NOT FOUND (or empty)",
                        }
                    ]
                ).to_excel(xw, sheet_name=env_sheet, index=False)
            else:
                env_df.to_excel(xw, sheet_name=env_sheet, index=False)
            used_sheets.add(env_sheet)
        with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
            main_fh.write('<meta charset="utf-8">\n')
            for gkey in group_keys:
                gkey_tuple = normalize_group_key(gkey)
                group_suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
                sub_path = group_filename(gkey_tuple)
                group_header = (
                    '<div style="font-size: 1.4em; font-weight: 700; '
                    'margin: 18px 0 10px 0;">'
                    f"{_html.escape(group_suffix)}"
                    "</div>\n"
                )

                main_fh.write(group_header)

                do_excel = xw is not None
                sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
                sheet_base = sheet
                if do_excel:
                    dedup_i = 1
                    while sheet in used_sheets:
                        dedup_i += 1
                        # Must not reuse ``group_suffix``: that text is still
                        # needed for the metric table titles rendered below.
                        dedup_suffix = f"_{dedup_i}"
                        # Ensure uniqueness even when sheet names are truncated.
                        base = str(sheet_base)
                        keep = max(1, 31 - len(dedup_suffix))
                        sheet = _sanitize_sheet_name(base[:keep] + dedup_suffix)
                    used_sheets.add(sheet)

                excel_blocks: list[tuple[str, pd.DataFrame]] = []

                with open(sub_path, "w", encoding="utf-8") as sub_fh:
                    sub_fh.write('<meta charset="utf-8">\n')
                    sub_fh.write(group_header)
                    tput_group_df = None
                    ttft_group_df = None
                    tpot_group_df = None
                    conc_col = args.xaxis

                    for metric_label in plan.data_cols:
                        gb = metric_groupbys[metric_label]
                        _, raw_data_cols = metric_cache[metric_label]

                        try:
                            group_df = gb.get_group(gkey)
                        except KeyError:
                            # This metric has no rows for the current group.
                            missing = (
                                '<div style="font-size: 1.1em; font-weight: 600; '
                                'margin: 10px 0;">'
                                f"{_html.escape(metric_label)} — missing for this group"
                                "</div>\n"
                            )
                            main_fh.write(missing)
                            sub_fh.write(missing)
                            continue

                        if conc_col not in group_df.columns:
                            conc_col = _find_concurrency_col(group_df)

                        # Remember the tables the summary below is built from.
                        mn = metric_label.lower().strip()
                        if "tok/s" in mn:
                            tput_group_df = group_df
                        elif "ttft" in mn:
                            ttft_group_df = group_df
                        elif mn in ("p99", "median") or "tpot" in mn:
                            tpot_group_df = group_df

                        display_group = group_df.drop(
                            columns=group_cols_canonical, errors="ignore"
                        )

                        html = render_metric_table_html(
                            display_group, metric_label, group_suffix, args
                        )
                        main_fh.write(html)
                        sub_fh.write(html)

                        maybe_write_plot(
                            main_fh,
                            sub_fh,
                            group_df=group_df,
                            raw_data_cols=raw_data_cols,
                            metric_label=metric_label,
                            y_axis_col=y_axis_col,
                            args=args,
                        )

                        excel_blocks.append(
                            (metric_label, group_df.reset_index(drop=True))
                        )
                        if csv_dir:
                            fn = _safe_filename(
                                f"{sheet}__{metric_label}".replace(" ", "_").replace(
                                    "/", "_"
                                )
                            )
                            group_df.to_csv(csv_dir / f"{fn}.csv", index=False)

                    summary_html = build_valid_max_concurrency_summary_html(
                        tput_group_df=tput_group_df,
                        ttft_group_df=ttft_group_df,
                        tpot_group_df=tpot_group_df,
                        conc_col=conc_col,
                        args=args,
                    )
                    if summary_html:
                        main_fh.write(summary_html)
                        sub_fh.write(summary_html)

                    summary_df = build_valid_max_concurrency_summary_df(
                        tput_group_df=tput_group_df,
                        ttft_group_df=ttft_group_df,
                        tpot_group_df=tpot_group_df,
                        conc_col=conc_col,
                        args=args,
                    )
                    if summary_df is not None:
                        excel_blocks.append(
                            ("Valid Max Concurrency Summary", summary_df)
                        )
                        if csv_dir:
                            fn = _safe_filename(
                                f"{sheet}__Valid_Max_Concurrency_Summary"
                            )
                            summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)

                if do_excel:
                    _write_tables_to_excel_sheet(xw, sheet, excel_blocks)

    if disable_excel:
        print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
    else:
        print(f"Wrote Excel: {excel_path}")
    if csv_dir:
        print(f"Wrote CSVs under: {csv_dir}")


def main():
    """Parse CLI arguments, resolve inputs and metrics, and write the report."""
    cli_args = build_parser().parse_args()
    default_cols = [*DEFAULT_INFO_COLS]
    metric_plan = choose_metrics(cli_args.latency)
    # Input preparation returns the info columns to use from here on.
    input_files, resolved_cols = prepare_input_files(cli_args, default_cols)
    write_report_group_first(input_files, resolved_cols, metric_plan, cli_args)


# Run the report generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()