Vllm CPU benchmark suite improvement (#34128)

Signed-off-by: louie-tsai <louie.tsai@intel.com>
2026-02-12 00:04:44 -08:00
parent 386bfe5d08
commit 55a1a9563a
6 changed files with 799 additions and 251 deletions
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -9,8 +9,10 @@ import json
 import os
 from dataclasses import dataclass
 from importlib import util
+from pathlib import Path

 import pandas as pd
+import regex as re

 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -275,6 +277,131 @@ def _apply_two_decimals(
    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")


+# -----------------------------
+# Export helpers (Excel + CSV)
+# -----------------------------
+def _sanitize_sheet_name(name: str) -> str:
+    r"""Return ``name`` coerced into a valid Excel worksheet title.
+
+    Excel rejects titles that are empty, longer than 31 characters, contain
+    any of ``: \ / ? * [ ]``, or begin/end with an apostrophe. ``None`` and
+    names that sanitize to nothing become ``"sheet"``.
+    """
+    name = "sheet" if name is None else str(name)
+    name = re.sub(r"[:\\/?*\[\]]", "_", name)
+    name = re.sub(r"\s+", " ", name)
+    # Trim before AND after truncating: the 31-char cut can expose a new
+    # trailing space or apostrophe.
+    name = name.strip(" '")[:31].rstrip(" '")
+    return name or "sheet"
+
+
+def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
+    """Build the Excel sheet title ``<model>[_<ilen>x<olen>]`` for one group."""
+    by_col = dict(zip(group_cols, gkey_tuple))
+    # Keep only the last path segment of an "org/model" style identifier.
+    title = str(by_col.get("Model", "model")).rsplit("/", 1)[-1]
+    in_len, out_len = by_col.get("Input Len", ""), by_col.get("Output Len", "")
+    suffix = "" if in_len == "" or out_len == "" else f"_{in_len}x{out_len}"
+    return _sanitize_sheet_name(title + suffix)
+
+
+def _write_tables_to_excel_sheet(
+    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
+):
+    """Stack each (title, table) down ``sheet``: title row, table, 2 blank rows."""
+    row = 0
+    common = {"sheet_name": sheet, "index": False}
+    for caption, table in blocks:
+        pd.DataFrame([[caption]]).to_excel(writer, header=False, startrow=row, **common)
+        table.to_excel(writer, startrow=row + 1, **common)
+        # caption row + header row + len(table) data rows + 2 spacer rows
+        row += len(table) + 4
+
+
+def _safe_filename(s: str) -> str:
+    """Replace each run of non word/dash/dot chars with "_"; cap at 180 chars."""
+    return re.sub(r"[^\w\-.]+", "_", str(s).strip())[:180]
+
+
+# -----------------------------
+# vLLM environment export helper
+# -----------------------------
+def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
+    """Parse vllm_env.txt into a flat table (Section, Key, Value).
+
+    Supports:
+      - section headers as standalone lines (no ':' or '=')
+      - key-value lines like 'OS: Ubuntu ...'
+      - env var lines like 'HF_HOME=/data/hf', including values that
+        contain ':' such as 'PATH=/usr/bin:/bin'
+
+    Args:
+        env_path: Text file to read; undecodable bytes are replaced.
+
+    Returns:
+        One row per parsed entry, tagged with the most recent section
+        header ("General" until a header is seen). Blank lines, '====='
+        dividers and undelimited lines longer than 64 chars are dropped.
+    """
+    lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
+    section = "General"
+    rows: list[dict] = []
+
+    for raw in lines:
+        stripped = raw.strip()
+        # skip blank lines and divider lines like =====
+        if not stripped or set(stripped) <= {"="}:
+            continue
+
+        eq_at = stripped.find("=")
+        colon_at = stripped.find(":")
+
+        # section header heuristic: short standalone line
+        if eq_at < 0 and colon_at < 0:
+            is_banner = stripped.lower().startswith(
+                "collecting environment information"
+            )
+            if len(stripped) <= 64 and not is_banner:
+                section = stripped
+            continue
+
+        # env var style: KEY=VALUE. A ':' may appear inside the value
+        # (PATH lists, proxy URLs), so also split on '=' when it precedes
+        # the first ':' and what precedes it is a bare identifier.
+        # Everything else that has a ':' is treated as 'key: value'.
+        env_style = colon_at < 0 or (
+            0 <= eq_at < colon_at and stripped[:eq_at].strip().isidentifier()
+        )
+        k, _, v = stripped.partition("=" if env_style else ":")
+        k, v = k.strip(), v.strip()
+        if k:
+            rows.append({"Section": section, "Key": k, "Value": v})
+
+    return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
+
+
+def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
+    """Return the parsed ``vllm_env.txt`` sitting beside the input JSON, if any.
+
+    The directory of ``args.file[0]`` (the *original* -f input) wins; the first
+    entry of ``files`` is only a fallback. Per the original author's note, a
+    single -f input may be split into ./splits/..., while vllm_env.txt
+    typically stays next to the original benchmark_results.json.
+
+    Returns None when neither source names a file or vllm_env.txt is absent.
+    """
+    cli_files = getattr(args, "file", None)
+    if cli_files:
+        anchor = cli_files[0]
+    elif files:
+        anchor = files[0]
+    else:
+        return None
+    env_path = Path(anchor).resolve().parent / "vllm_env.txt"
+    return _parse_vllm_env_txt(env_path) if env_path.exists() else None
+
+
 # -----------------------------
 # Valid max concurrency summary helpers
 # -----------------------------
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(

    summary_df = pd.DataFrame(rows)

-    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
    for c in summary_df.columns:
        if c == "Configuration":
            continue
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(

    both_col = f"Max {conc_col} (Both)"

-    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
    formatters = {}
    for c in summary_df.columns:
        if c == "Configuration":
            continue
-        # default argument binds per-column formatter correctly
        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"

    styler = summary_df.style.format(formatters)
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
    return title + styler.to_html(table_attributes='border="1" class="dataframe"')


+def build_valid_max_concurrency_summary_df(
+    tput_group_df: pd.DataFrame | None,
+    ttft_group_df: pd.DataFrame | None,
+    tpot_group_df: pd.DataFrame | None,
+    conc_col: str,
+    args,
+) -> pd.DataFrame | None:
+    """Build the per-configuration "valid max concurrency" summary table.
+
+    For each configuration column this records the value reported by
+    ``_max_concurrency_ok`` against ``args.ttft_max_ms`` and against
+    ``args.tpot_max_ms``, the smaller of the two ("Both"), and the
+    throughput / TTFT / TPOT values that ``_value_at_concurrency`` reports
+    at that "Both" level. A metric whose frame is None contributes
+    ``pd.NA``.
+
+    Returns:
+        None when both ``ttft_group_df`` and ``tpot_group_df`` are None;
+        otherwise a DataFrame with a "Configuration" column and every other
+        column coerced to numeric (unparseable values become NaN).
+    """
+    have_tput = tput_group_df is not None
+    have_ttft = ttft_group_df is not None
+    have_tpot = tpot_group_df is not None
+    if not (have_ttft or have_tpot):
+        return None
+
+    def config_columns(frame):
+        return [] if frame is None else _config_value_columns(frame, conc_col)
+
+    def value_at(frame, cfg, level):
+        if frame is None:
+            return pd.NA
+        return _value_at_concurrency(frame, conc_col, cfg, level)
+
+    ttft_cols = config_columns(ttft_group_df)
+    tpot_cols = config_columns(tpot_group_df)
+    tput_cols = config_columns(tput_group_df)
+
+    # With both latency metrics, use configurations common to TTFT and TPOT,
+    # narrowed to those also in throughput unless that leaves nothing.
+    if have_ttft and have_tpot:
+        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
+        if have_tput:
+            with_tput = [c for c in cfg_cols if c in tput_cols]
+            cfg_cols = with_tput or cfg_cols
+    else:
+        cfg_cols = ttft_cols or tpot_cols
+    # Last resort: every configuration seen anywhere, ordered by its str().
+    if not cfg_cols:
+        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+    records = []
+    for cfg in cfg_cols:
+        ttft_max = pd.NA
+        if have_ttft:
+            ttft_max = _max_concurrency_ok(
+                ttft_group_df, conc_col, cfg, args.ttft_max_ms
+            )
+        tpot_max = pd.NA
+        if have_tpot:
+            tpot_max = _max_concurrency_ok(
+                tpot_group_df, conc_col, cfg, args.tpot_max_ms
+            )
+        # "Both" is only defined when neither per-metric maximum is missing.
+        if pd.isna(ttft_max) or pd.isna(tpot_max):
+            both = pd.NA
+        else:
+            both = min(ttft_max, tpot_max)
+        tput_at_both = value_at(tput_group_df, cfg, both)
+        ttft_at_both = value_at(ttft_group_df, cfg, both)
+        tpot_at_both = value_at(tpot_group_df, cfg, both)
+
+        record = {"Configuration": cfg}
+        record[f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)"] = ttft_max
+        record[f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)"] = tpot_max
+        record[f"Max {conc_col} (Both)"] = both
+        record["Output Tput @ Both (tok/s)"] = tput_at_both
+        record["TTFT @ Both (ms)"] = ttft_at_both
+        record["TPOT @ Both (ms)"] = tpot_at_both
+        records.append(record)
+
+    summary = pd.DataFrame(records)
+    for col in summary.columns:
+        if col != "Configuration":
+            summary[col] = pd.to_numeric(summary[col], errors="coerce")
+    return summary
+
+
 # -----------------------------
 # Plot helper
 # -----------------------------
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )
+
+    # ---- NEW: export options ----
+    parser.add_argument(
+        "--excel-out",
+        type=str,
+        default="perf_comparison.xlsx",
+        help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
+    )
+    parser.add_argument(
+        "--csv-out-dir",
+        type=str,
+        default="",
+        help="If set, write per-group per-metric CSVs into this directory.",
+    )
+
    return parser


@@ -657,7 +885,6 @@ def maybe_write_plot(
        markers=True,
    )

-    # Ensure plot hover + y tick labels are also 2 decimals.
    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
    fig.update_yaxes(tickformat=".2f")

@@ -730,87 +957,151 @@ def write_report_group_first(
        for metric_label, (df, _) in metric_cache.items()
    }

-    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
-        main_fh.write('<meta charset="utf-8">\n')
-        for gkey in group_keys:
-            gkey_tuple = normalize_group_key(gkey)
-            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
-            sub_path = group_filename(gkey_tuple)
-            group_header = (
-                '<div style="font-size: 1.4em; font-weight: 700; '
-                'margin: 18px 0 10px 0;">'
-                f"{_html.escape(suffix)}"
-                "</div>\n"
-            )
+    csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
+    if csv_dir:
+        csv_dir.mkdir(parents=True, exist_ok=True)

-            main_fh.write(group_header)
-            with open(sub_path, "w", encoding="utf-8") as sub_fh:
-                sub_fh.write('<meta charset="utf-8">\n')
-                sub_fh.write(group_header)
-                tput_group_df = None
-                ttft_group_df = None
-                tpot_group_df = None
-                conc_col = args.xaxis
+    excel_path = args.excel_out or "perf_comparison.xlsx"
+    with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
+        # ---- Environment sheet (first) ----
+        env_sheet = _sanitize_sheet_name("Environment")
+        env_df = _load_env_df_for_inputs(args, files)
+        if env_df is None or env_df.empty:
+            pd.DataFrame(
+                [
+                    {
+                        "Section": "Environment",
+                        "Key": "vllm_env.txt",
+                        "Value": "NOT FOUND (or empty)",
+                    }
+                ]
+            ).to_excel(xw, sheet_name=env_sheet, index=False)
+        else:
+            env_df.to_excel(xw, sheet_name=env_sheet, index=False)
+        with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+            main_fh.write('<meta charset="utf-8">\n')
+            for gkey in group_keys:
+                gkey_tuple = normalize_group_key(gkey)
+                suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+                sub_path = group_filename(gkey_tuple)
+                group_header = (
+                    '<div style="font-size: 1.4em; font-weight: 700; '
+                    'margin: 18px 0 10px 0;">'
+                    f"{_html.escape(suffix)}"
+                    "</div>\n"
+                )

-                for metric_label in plan.data_cols:
-                    gb = metric_groupbys[metric_label]
-                    df_sorted, raw_data_cols = metric_cache[metric_label]
+                main_fh.write(group_header)

-                    try:
-                        group_df = gb.get_group(gkey)
-                    except KeyError:
-                        missing = (
-                            '<div style="font-size: 1.1em; font-weight: 600; '
-                            'margin: 10px 0;">'
-                            f"{_html.escape(metric_label)} — missing for this group"
-                            "</div>\n"
+                sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
+                sheet_base = sheet
+                dedup_i = 1
+                while sheet in xw.sheets:
+                    dedup_i += 1
+                    sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
+
+                excel_blocks: list[tuple[str, pd.DataFrame]] = []
+
+                with open(sub_path, "w", encoding="utf-8") as sub_fh:
+                    sub_fh.write('<meta charset="utf-8">\n')
+                    sub_fh.write(group_header)
+                    tput_group_df = None
+                    ttft_group_df = None
+                    tpot_group_df = None
+                    conc_col = args.xaxis
+
+                    for metric_label in plan.data_cols:
+                        gb = metric_groupbys[metric_label]
+                        df_sorted, raw_data_cols = metric_cache[metric_label]
+
+                        try:
+                            group_df = gb.get_group(gkey)
+                        except KeyError:
+                            missing = (
+                                '<div style="font-size: 1.1em; font-weight: 600; '
+                                'margin: 10px 0;">'
+                                f"{_html.escape(metric_label)} — missing for this group"
+                                "</div>\n"
+                            )
+                            main_fh.write(missing)
+                            sub_fh.write(missing)
+                            continue
+
+                        if conc_col not in group_df.columns:
+                            conc_col = _find_concurrency_col(group_df)
+
+                        mn = metric_label.lower().strip()
+                        if "tok/s" in mn:
+                            tput_group_df = group_df
+                        elif "ttft" in mn:
+                            ttft_group_df = group_df
+                        elif mn in ("p99", "median") or "tpot" in mn:
+                            tpot_group_df = group_df
+
+                        display_group = group_df.drop(
+                            columns=group_cols_canonical, errors="ignore"
                        )

-                        main_fh.write(missing)
-                        sub_fh.write(missing)
-                        continue
+                        html = render_metric_table_html(
+                            display_group, metric_label, suffix, args
+                        )
+                        main_fh.write(html)
+                        sub_fh.write(html)

-                    if conc_col not in group_df.columns:
-                        conc_col = _find_concurrency_col(group_df)
+                        maybe_write_plot(
+                            main_fh,
+                            sub_fh,
+                            group_df=group_df,
+                            raw_data_cols=raw_data_cols,
+                            metric_label=metric_label,
+                            y_axis_col=y_axis_col,
+                            args=args,
+                        )

-                    mn = metric_label.lower().strip()
-                    if "tok/s" in mn:
-                        tput_group_df = group_df
-                    elif "ttft" in mn:
-                        ttft_group_df = group_df
-                    elif mn in ("p99", "median") or "tpot" in mn:
-                        tpot_group_df = group_df
+                        excel_blocks.append(
+                            (metric_label, display_group.reset_index(drop=True))
+                        )
+                        if csv_dir:
+                            fn = _safe_filename(
+                                f"{sheet}__{metric_label}".replace(" ", "_").replace(
+                                    "/", "_"
+                                )
+                            )
+                            display_group.to_csv(csv_dir / f"{fn}.csv", index=False)

-                    display_group = group_df.drop(
-                        columns=group_cols_canonical, errors="ignore"
-                    )
-
-                    html = render_metric_table_html(
-                        display_group, metric_label, suffix, args
-                    )
-                    main_fh.write(html)
-                    sub_fh.write(html)
-
-                    maybe_write_plot(
-                        main_fh,
-                        sub_fh,
-                        group_df=group_df,
-                        raw_data_cols=raw_data_cols,
-                        metric_label=metric_label,
-                        y_axis_col=y_axis_col,
+                    summary_html = build_valid_max_concurrency_summary_html(
+                        tput_group_df=tput_group_df,
+                        ttft_group_df=ttft_group_df,
+                        tpot_group_df=tpot_group_df,
+                        conc_col=conc_col,
                        args=args,
                    )
+                    if summary_html:
+                        main_fh.write(summary_html)
+                        sub_fh.write(summary_html)

-                summary_html = build_valid_max_concurrency_summary_html(
-                    tput_group_df=tput_group_df,
-                    ttft_group_df=ttft_group_df,
-                    tpot_group_df=tpot_group_df,
-                    conc_col=conc_col,
-                    args=args,
-                )
-                if summary_html:
-                    main_fh.write(summary_html)
-                    sub_fh.write(summary_html)
+                    summary_df = build_valid_max_concurrency_summary_df(
+                        tput_group_df=tput_group_df,
+                        ttft_group_df=ttft_group_df,
+                        tpot_group_df=tpot_group_df,
+                        conc_col=conc_col,
+                        args=args,
+                    )
+                    if summary_df is not None:
+                        excel_blocks.append(
+                            ("Valid Max Concurrency Summary", summary_df)
+                        )
+                        if csv_dir:
+                            fn = _safe_filename(
+                                f"{sheet}__Valid_Max_Concurrency_Summary"
+                            )
+                            summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
+
+                _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
+
+    print(f"Wrote Excel: {excel_path}")
+    if csv_dir:
+        print(f"Wrote CSVs under: {csv_dir}")


 def main():
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,6 +1,4 @@
 #!/bin/bash
-
-# This script should be run inside the CI process
 # This script assumes that we are already inside the vllm/ directory
 # Benchmarking results will be available inside vllm/benchmarks/results/

@@ -9,6 +7,11 @@
 set -x
 set -o pipefail

+# Environment-driven debug controls (like ON_CPU=1)
+DRY_RUN="${DRY_RUN:-0}"
+MODEL_FILTER="${MODEL_FILTER:-}"
+DTYPE_FILTER="${DTYPE_FILTER:-}"
+
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
@@ -112,13 +115,12 @@ json2envs() {
 }

 wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
  local timeout_val="1200"
  timeout "$timeout_val" bash -c '
-    until curl -X POST localhost:8000/v1/completions; do
+    until curl -sf http://localhost:8000/v1/models >/dev/null; do
      sleep 1
-    done' && return 0 || return 1
+    done
+  '
 }

 kill_processes_launched_by_current_bash() {
@@ -252,37 +254,16 @@ run_benchmark_tests() {
  done
 }

-run_latency_tests() {
-  run_benchmark_tests "latency" "$1"
-}
+run_latency_tests() { run_benchmark_tests "latency" "$1"; }
+run_startup_tests() { run_benchmark_tests "startup" "$1"; }
+run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }

-run_startup_tests() {
-  run_benchmark_tests "startup" "$1"
-}
-
-run_throughput_tests() {
-  run_benchmark_tests "throughput" "$1"
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-  #
-  # Supported JSON formats:
-  # 1) Plain format: top-level array
-  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #
-  # 2) Default parameters field + plain format tests
-  #    {
-  #      "defaults": { ... },
-  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #    }
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '
+merge_serving_tests_stream() {
+  # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
+  # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
+  local serving_test_file="$1"
+  # shellcheck disable=SC2016
+  local merged='
    if type == "array" then
      # Plain format: test cases array
      .[]
@@ -304,7 +285,50 @@ run_serving_tests() {
    else
      error("Unsupported serving test file format: must be array or object with .tests")
    end
-  ' "$serving_test_file" | while read -r params; do
+  '
+
+  jq -c "$merged" "$serving_test_file" | \
+  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
+    jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
+      select((($model|length)==0)
+             or ((.server_parameters.model // "") == $model)
+             or ((.client_parameters.model // "") == $model))
+      | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
+    '
+  else
+    cat
+  fi
+}
+
+run_serving_tests() {
+  # run serving tests using `vllm bench serve` command
+  # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # In dry-run mode, if filters are provided but no tests match, warn on stderr and return early with status 0.
+  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
+    local count
+    count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
+    if [[ "$count" -eq 0 ]]; then
+      echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
+      return 0
+    fi
+  fi
+
+  # Iterate over serving tests (merged + optional filtered stream)
+  merge_serving_tests_stream "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -373,7 +397,7 @@ run_serving_tests() {
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
-    if [[ -z "${REMOTE_HOST}" ]]; then
+    if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
@@ -384,6 +408,9 @@ run_serving_tests() {
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
+    elif [[ "${DRY_RUN:-0}" == "1" ]]; then
+        # dry-run: don't start server
+        echo "Dry Run."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
@@ -402,9 +429,7 @@ run_serving_tests() {
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
        qps="inf"
-        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
@@ -425,7 +450,9 @@ run_serving_tests() {
        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

-        bash -c "$client_command"
+        if [[ "${DRY_RUN:-0}" != "1" ]]; then
+          bash -c "$client_command"
+        fi

        # record the benchmarking commands
        jq_output=$(jq -n \
@@ -443,12 +470,15 @@ run_serving_tests() {
    done

    # clean up
-    kill -9 $server_pid
-    kill_gpu_processes
+    if [[ "${DRY_RUN:-0}" != "1" ]]; then
+      kill -9 $server_pid
+      kill_gpu_processes
+    fi
  done
 }

 main() {
+
  local ARCH
  ARCH=''
  if [[ "$ON_CPU" == "1" ]]; then
@@ -458,7 +488,13 @@ main() {
     check_gpus
     ARCH="$arch_suffix"
  fi
-  check_hf_token
+
+  # DRY_RUN does not execute vLLM; do not require HF_TOKEN.
+  if [[ "${DRY_RUN:-0}" != "1" ]]; then
+    check_hf_token
+  else
+    echo "DRY_RUN=1 -> skip HF_TOKEN validation"
+  fi

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -479,11 +515,16 @@ main() {

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
-
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
+
+  if [[ "${DRY_RUN:-0}" == "1" ]]; then
+    echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
+    exit 0
+  fi
+
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
@@ -0,0 +1,41 @@
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [
+      32,
+      64,
+      128
+    ],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "dtype": "bfloat16",
+      "model": "jinaai/jina-embeddings-v3",
+      "trust_remote_code": ""
+    },
+    "client_parameters": {
+      "model": "jinaai/jina-embeddings-v3",
+      "backend": "openai-embeddings",
+      "endpoint": "/v1/embeddings",
+      "dataset_name": "sharegpt",
+      "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_jina_embed_v3_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {}
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -0,0 +1,283 @@
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp1_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp2_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp4_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama3B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_granite2B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen1.7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen4B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen8B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_glm9B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_gemma7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "google/gemma-7b",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "google/gemma-7b",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -148,136 +148,6 @@
        "random-input-len": 2048,
        "random-output-len": 128
      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp1_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp2_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 2
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp4_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama3B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_granite2B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen1.7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen4B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen8B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_glm9B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_gemma7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "google/gemma-7b",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "google/gemma-7b",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
    }
  ]
 }
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -176,7 +176,7 @@ For the full and up-to-date list of models validated on CPU platforms, please se

 ### How to find benchmark configuration examples for supported CPU models?

-For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](../../../.buildkite/performance-benchmarks/tests/serving-tests-cpu.json)
+For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in `serving-tests-cpu.json`. The full test cases for Text-only, Multi-Modal, and Embedded models are defined in `serving-tests-cpu-text.json`, `serving-tests-cpu-multimodal.json`, and `serving-tests-cpu-embed.json`, respectively.  
 For details on how these optimized configurations are determined, see: [performance-benchmark-details](../../../.buildkite/performance-benchmarks/README.md#performance-benchmark-details).
 To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](../../benchmarking/dashboard.md#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.  

@@ -199,6 +199,28 @@ lscpu | grep "NUMA node(s):" | awk '{print $3}'
 For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu)
 , which publishes default-model CPU results produced using the same Benchmark Suite.

+#### Dry-Run
+
+For users who only need the optimized runtime configurations without running the benchmarks, a Dry-Run mode is provided.
+When the environment variable `DRY_RUN=1` is passed to `run-performance-benchmarks.sh`,
+all commands will be generated under `./benchmark/results/`.
+
+```bash
+ON_CPU=1 DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+By providing a different JSON file, users can get runtime configurations for other models, such as Embedded models.
+
+```bash
+ON_CPU=1 SERVING_JSON=serving-tests-cpu-embed.json DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+When `MODEL_FILTER` and `DTYPE_FILTER` are provided, only commands for the matching model ID and data type are generated.
+
+```bash
+ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-llama/Llama-3.1-8B-Instruct DTYPE_FILTER=bfloat16  bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
 ### How to decide `VLLM_CPU_OMP_THREADS_BIND`?

 - Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.