diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
index b3d0a2d3b..ead097411 100644
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -9,8 +9,10 @@ import json
import os
from dataclasses import dataclass
from importlib import util
+from pathlib import Path
import pandas as pd
+import regex as re
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@@ -275,6 +277,131 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
+# -----------------------------
+# Export helpers (Excel + CSV)
+# -----------------------------
+def _sanitize_sheet_name(name: str) -> str:
+    r"""Return ``name`` rewritten as a valid Excel worksheet title.
+
+    Excel sheet constraints enforced here:
+    - max 31 chars
+    - cannot contain: : \ / ? * [ ]
+    - cannot be empty
+    - cannot begin or end with an apostrophe
+
+    Args:
+        name: Proposed title; ``None`` is treated as ``"sheet"`` and any
+            other value is converted with ``str()``.
+
+    Returns:
+        A title of at most 31 characters, or ``"sheet"`` when nothing usable
+        remains after sanitizing.
+    """
+    name = "sheet" if name is None else str(name)
+    name = re.sub(r"[:\\/?*\[\]]", "_", name)
+    # Collapse whitespace runs first so the edge trimming below sees the
+    # final characters of the title.
+    name = re.sub(r"\s+", " ", name)
+    # Trim before AND after truncating: cutting at 31 characters can expose a
+    # new trailing space or apostrophe.
+    name = name.strip(" '")[:31].rstrip(" '")
+    return name or "sheet"
+
+
+def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
+    """Derive the base Excel sheet name for one report group.
+
+    The name is the last ``/``-separated component of the group's "Model"
+    value, followed by ``_<Input Len>x<Output Len>`` when both lengths are
+    present, and is passed through ``_sanitize_sheet_name``.
+
+    Args:
+        group_cols: Names of the grouping columns.
+        gkey_tuple: Group key values, paired positionally with ``group_cols``.
+
+    Returns:
+        The sanitized sheet name (not de-duplicated against other groups).
+    """
+    values = dict(zip(group_cols, gkey_tuple))
+    short_model = str(values.get("Model", "model")).split("/")[-1]
+    in_len = values.get("Input Len", "")
+    out_len = values.get("Output Len", "")
+    if in_len != "" and out_len != "":
+        label = f"{short_model}_{in_len}x{out_len}"
+    else:
+        label = f"{short_model}"
+    return _sanitize_sheet_name(label)
+
+
+def _write_tables_to_excel_sheet(
+    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
+):
+    """Stack titled tables vertically on a single worksheet.
+
+    Each block is written as a one-cell title row, then the table (with its
+    header row) directly beneath it; the next block starts two empty rows
+    after the table's last data row.
+
+    Args:
+        writer: Open Excel writer to write into.
+        sheet: Name of the target worksheet.
+        blocks: ``(title, table)`` pairs, written in order.
+    """
+    title_row = 0
+    for title, table in blocks:
+        heading = pd.DataFrame([[title]])
+        heading.to_excel(
+            writer, sheet_name=sheet, index=False, header=False, startrow=title_row
+        )
+        table_row = title_row + 1
+        table.to_excel(writer, sheet_name=sheet, index=False, startrow=table_row)
+        # header row + data rows + two spacer rows
+        title_row = table_row + len(table) + 3
+
+
+def _safe_filename(s: str) -> str:
+    """Turn ``s`` into a filesystem-friendly file stem.
+
+    Surrounding whitespace is stripped, every run of characters other than
+    word characters, ``-`` and ``.`` becomes a single ``_``, and the result
+    is capped at 180 characters.
+    """
+    cleaned = re.sub(r"[^\w\-.]+", "_", str(s).strip())
+    # Slicing is a no-op for shorter strings, so no length check is needed.
+    return cleaned[:180]
+
+
+# -----------------------------
+# vLLM environment export helper
+# -----------------------------
+def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
+    """Parse vllm_env.txt into a flat table (Section, Key, Value).
+
+    Supports:
+    - section headers as standalone lines (no ':' or '=')
+    - key-value lines like 'OS: Ubuntu ...'
+    - env var lines like 'HF_HOME=/data/hf'
+
+    A line containing both delimiters is split on whichever appears first,
+    so 'LD_LIBRARY_PATH=/a:/b' stays an env var and 'Docs: http://x/?a=b'
+    stays a key-value line.
+
+    Args:
+        env_path: Path of the text file to read (decoded as UTF-8, with
+            undecodable bytes replaced).
+
+    Returns:
+        A DataFrame with columns ``Section``, ``Key`` and ``Value``; rows
+        seen before any section header belong to section ``"General"``.
+    """
+    text = env_path.read_text(encoding="utf-8", errors="replace")
+    section = "General"
+    rows: list[dict] = []
+
+    for raw in text.splitlines():
+        stripped = raw.strip()
+        # Skip blank lines and divider lines like =====
+        if not stripped or set(stripped) <= {"="}:
+            continue
+
+        eq_pos = stripped.find("=")
+        colon_pos = stripped.find(":")
+
+        if eq_pos < 0 and colon_pos < 0:
+            # Section header heuristic: short standalone line. Longer
+            # delimiter-free lines are ignored.
+            is_banner = stripped.lower().startswith(
+                "collecting environment information"
+            )
+            if len(stripped) <= 64 and not is_banner:
+                section = stripped
+            continue
+
+        # Split on the delimiter that occurs first in the line.
+        if colon_pos < 0 or 0 <= eq_pos < colon_pos:
+            sep = "="
+        else:
+            sep = ":"
+        key, _, value = stripped.partition(sep)
+        key = key.strip()
+        if key:
+            rows.append({"Section": section, "Key": key, "Value": value.strip()})
+
+    return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
+
+
+def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
+    """Load vllm_env.txt next to the *original* input JSON file.
+
+    Note: when only one -f is provided, the script may split JSON into ./splits/...,
+    but vllm_env.txt typically lives next to the original benchmark_results.json.
+
+    Args:
+        args: Parsed CLI arguments; the first entry of ``args.file`` is used
+            when that attribute is present and non-empty.
+        files: Input files actually processed; the first entry is the
+            fallback when ``args.file`` is unavailable.
+
+    Returns:
+        The parsed environment table, or ``None`` when no input path is known
+        or no regular file named ``vllm_env.txt`` sits beside it.
+    """
+    original_inputs = getattr(args, "file", None)
+    if original_inputs:
+        anchor = original_inputs[0]
+    elif files:
+        anchor = files[0]
+    else:
+        return None
+
+    env_path = Path(anchor).resolve().parent / "vllm_env.txt"
+    # is_file() rather than exists(): a directory with this name must not be
+    # handed to the text parser.
+    if not env_path.is_file():
+        return None
+    return _parse_vllm_env_txt(env_path)
+
+
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows)
- # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns:
if c == "Configuration":
continue
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)"
- # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {}
for c in summary_df.columns:
if c == "Configuration":
continue
- # default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters)
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+def build_valid_max_concurrency_summary_df(
+    tput_group_df: pd.DataFrame | None,
+    ttft_group_df: pd.DataFrame | None,
+    tpot_group_df: pd.DataFrame | None,
+    conc_col: str,
+    args,
+) -> pd.DataFrame | None:
+    """Build the "valid max concurrency" summary as a DataFrame.
+
+    One row is produced per configuration column. Each row holds the values
+    returned by ``_max_concurrency_ok`` for the TTFT limit
+    (``args.ttft_max_ms``) and the TPOT limit (``args.tpot_max_ms``), the
+    smaller of the two ("Both"), and the throughput/TTFT/TPOT values that
+    ``_value_at_concurrency`` reports at that "Both" concurrency.
+
+    Args:
+        tput_group_df: Throughput table for the group, or ``None``.
+        ttft_group_df: TTFT table for the group, or ``None``.
+        tpot_group_df: TPOT table for the group, or ``None``.
+        conc_col: Name of the concurrency column.
+        args: Parsed CLI arguments providing ``ttft_max_ms`` and
+            ``tpot_max_ms``.
+
+    Returns:
+        The summary table with every column except "Configuration" coerced
+        to numeric, or ``None`` when both latency tables are missing.
+    """
+    have_ttft = ttft_group_df is not None
+    have_tpot = tpot_group_df is not None
+    have_tput = tput_group_df is not None
+    if not (have_ttft or have_tpot):
+        return None
+
+    def _cols(frame):
+        # Configuration columns of a table; a missing table has none.
+        return _config_value_columns(frame, conc_col) if frame is not None else []
+
+    def _at(frame, cfg, conc):
+        # Value of `cfg` at concurrency `conc`; NA for a missing table.
+        if frame is None:
+            return pd.NA
+        return _value_at_concurrency(frame, conc_col, cfg, conc)
+
+    ttft_cols = _cols(ttft_group_df)
+    tpot_cols = _cols(tpot_group_df)
+    tput_cols = _cols(tput_group_df)
+
+    if have_ttft and have_tpot:
+        # Keep configurations present in both latency tables, narrowed to the
+        # throughput table too unless that would leave nothing.
+        configs = [c for c in ttft_cols if c in tpot_cols]
+        if have_tput:
+            configs = [c for c in configs if c in tput_cols] or configs
+    else:
+        configs = ttft_cols or tpot_cols
+
+    if not configs:
+        configs = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+    records = []
+    for cfg in configs:
+        ttft_max = pd.NA
+        if have_ttft:
+            ttft_max = _max_concurrency_ok(
+                ttft_group_df, conc_col, cfg, args.ttft_max_ms
+            )
+        tpot_max = pd.NA
+        if have_tpot:
+            tpot_max = _max_concurrency_ok(
+                tpot_group_df, conc_col, cfg, args.tpot_max_ms
+            )
+
+        if pd.isna(ttft_max) or pd.isna(tpot_max):
+            both = pd.NA
+        else:
+            both = min(ttft_max, tpot_max)
+
+        tput_at_both = _at(tput_group_df, cfg, both)
+        ttft_at_both = _at(ttft_group_df, cfg, both)
+        tpot_at_both = _at(tpot_group_df, cfg, both)
+
+        records.append(
+            {
+                "Configuration": cfg,
+                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (Both)": both,
+                "Output Tput @ Both (tok/s)": tput_at_both,
+                "TTFT @ Both (ms)": ttft_at_both,
+                "TPOT @ Both (ms)": tpot_at_both,
+            }
+        )
+
+    summary = pd.DataFrame(records)
+    numeric_cols = [c for c in summary.columns if c != "Configuration"]
+    for col in numeric_cols:
+        summary[col] = pd.to_numeric(summary[col], errors="coerce")
+    return summary
+
+
# -----------------------------
# Plot helper
# -----------------------------
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
+
+ # ---- NEW: export options ----
+ parser.add_argument(
+ "--excel-out",
+ type=str,
+ default="perf_comparison.xlsx",
+ help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
+ )
+ parser.add_argument(
+ "--csv-out-dir",
+ type=str,
+ default="",
+ help="If set, write per-group per-metric CSVs into this directory.",
+ )
+
return parser
@@ -657,7 +885,6 @@ def maybe_write_plot(
markers=True,
)
- # Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}")
fig.update_yaxes(tickformat=".2f")
@@ -730,87 +957,151 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items()
}
- with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
- main_fh.write('\n')
- for gkey in group_keys:
- gkey_tuple = normalize_group_key(gkey)
- suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
- sub_path = group_filename(gkey_tuple)
- group_header = (
- '
'
- f"{_html.escape(suffix)}"
- "
\n"
- )
+ csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
+ if csv_dir:
+ csv_dir.mkdir(parents=True, exist_ok=True)
- main_fh.write(group_header)
- with open(sub_path, "w", encoding="utf-8") as sub_fh:
- sub_fh.write('\n')
- sub_fh.write(group_header)
- tput_group_df = None
- ttft_group_df = None
- tpot_group_df = None
- conc_col = args.xaxis
+ excel_path = args.excel_out or "perf_comparison.xlsx"
+ with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
+ # ---- Environment sheet (first) ----
+ env_sheet = _sanitize_sheet_name("Environment")
+ env_df = _load_env_df_for_inputs(args, files)
+ if env_df is None or env_df.empty:
+ pd.DataFrame(
+ [
+ {
+ "Section": "Environment",
+ "Key": "vllm_env.txt",
+ "Value": "NOT FOUND (or empty)",
+ }
+ ]
+ ).to_excel(xw, sheet_name=env_sheet, index=False)
+ else:
+ env_df.to_excel(xw, sheet_name=env_sheet, index=False)
+ with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+ main_fh.write('\n')
+ for gkey in group_keys:
+ gkey_tuple = normalize_group_key(gkey)
+ suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+ sub_path = group_filename(gkey_tuple)
+ group_header = (
+ ''
+ f"{_html.escape(suffix)}"
+ "
\n"
+ )
- for metric_label in plan.data_cols:
- gb = metric_groupbys[metric_label]
- df_sorted, raw_data_cols = metric_cache[metric_label]
+ main_fh.write(group_header)
- try:
- group_df = gb.get_group(gkey)
- except KeyError:
- missing = (
- ''
- f"{_html.escape(metric_label)} — missing for this group"
- "
\n"
+ sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
+ sheet_base = sheet
+ dedup_i = 1
+ while sheet in xw.sheets:
+ dedup_i += 1
+ sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
+
+ excel_blocks: list[tuple[str, pd.DataFrame]] = []
+
+ with open(sub_path, "w", encoding="utf-8") as sub_fh:
+ sub_fh.write('\n')
+ sub_fh.write(group_header)
+ tput_group_df = None
+ ttft_group_df = None
+ tpot_group_df = None
+ conc_col = args.xaxis
+
+ for metric_label in plan.data_cols:
+ gb = metric_groupbys[metric_label]
+ df_sorted, raw_data_cols = metric_cache[metric_label]
+
+ try:
+ group_df = gb.get_group(gkey)
+ except KeyError:
+ missing = (
+ ''
+ f"{_html.escape(metric_label)} — missing for this group"
+ "
\n"
+ )
+ main_fh.write(missing)
+ sub_fh.write(missing)
+ continue
+
+ if conc_col not in group_df.columns:
+ conc_col = _find_concurrency_col(group_df)
+
+ mn = metric_label.lower().strip()
+ if "tok/s" in mn:
+ tput_group_df = group_df
+ elif "ttft" in mn:
+ ttft_group_df = group_df
+ elif mn in ("p99", "median") or "tpot" in mn:
+ tpot_group_df = group_df
+
+ display_group = group_df.drop(
+ columns=group_cols_canonical, errors="ignore"
)
- main_fh.write(missing)
- sub_fh.write(missing)
- continue
+ html = render_metric_table_html(
+ display_group, metric_label, suffix, args
+ )
+ main_fh.write(html)
+ sub_fh.write(html)
- if conc_col not in group_df.columns:
- conc_col = _find_concurrency_col(group_df)
+ maybe_write_plot(
+ main_fh,
+ sub_fh,
+ group_df=group_df,
+ raw_data_cols=raw_data_cols,
+ metric_label=metric_label,
+ y_axis_col=y_axis_col,
+ args=args,
+ )
- mn = metric_label.lower().strip()
- if "tok/s" in mn:
- tput_group_df = group_df
- elif "ttft" in mn:
- ttft_group_df = group_df
- elif mn in ("p99", "median") or "tpot" in mn:
- tpot_group_df = group_df
+ excel_blocks.append(
+ (metric_label, display_group.reset_index(drop=True))
+ )
+ if csv_dir:
+ fn = _safe_filename(
+ f"{sheet}__{metric_label}".replace(" ", "_").replace(
+ "/", "_"
+ )
+ )
+ display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
- display_group = group_df.drop(
- columns=group_cols_canonical, errors="ignore"
- )
-
- html = render_metric_table_html(
- display_group, metric_label, suffix, args
- )
- main_fh.write(html)
- sub_fh.write(html)
-
- maybe_write_plot(
- main_fh,
- sub_fh,
- group_df=group_df,
- raw_data_cols=raw_data_cols,
- metric_label=metric_label,
- y_axis_col=y_axis_col,
+ summary_html = build_valid_max_concurrency_summary_html(
+ tput_group_df=tput_group_df,
+ ttft_group_df=ttft_group_df,
+ tpot_group_df=tpot_group_df,
+ conc_col=conc_col,
args=args,
)
+ if summary_html:
+ main_fh.write(summary_html)
+ sub_fh.write(summary_html)
- summary_html = build_valid_max_concurrency_summary_html(
- tput_group_df=tput_group_df,
- ttft_group_df=ttft_group_df,
- tpot_group_df=tpot_group_df,
- conc_col=conc_col,
- args=args,
- )
- if summary_html:
- main_fh.write(summary_html)
- sub_fh.write(summary_html)
+ summary_df = build_valid_max_concurrency_summary_df(
+ tput_group_df=tput_group_df,
+ ttft_group_df=ttft_group_df,
+ tpot_group_df=tpot_group_df,
+ conc_col=conc_col,
+ args=args,
+ )
+ if summary_df is not None:
+ excel_blocks.append(
+ ("Valid Max Concurrency Summary", summary_df)
+ )
+ if csv_dir:
+ fn = _safe_filename(
+ f"{sheet}__Valid_Max_Concurrency_Summary"
+ )
+ summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
+
+ _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
+
+ print(f"Wrote Excel: {excel_path}")
+ if csv_dir:
+ print(f"Wrote CSVs under: {csv_dir}")
def main():
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index d62c01bc7..7dabcf517 100755
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,6 +1,4 @@
#!/bin/bash
-
-# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
@@ -9,6 +7,11 @@
set -x
set -o pipefail
+# Environment-driven debug controls (like ON_CPU=1)
+DRY_RUN="${DRY_RUN:-0}"
+MODEL_FILTER="${MODEL_FILTER:-}"
+DTYPE_FILTER="${DTYPE_FILTER:-}"
+
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
@@ -112,13 +115,12 @@ json2envs() {
}
wait_for_server() {
- # wait for vllm server to start
- # return 1 if vllm server crashes
local timeout_val="1200"
timeout "$timeout_val" bash -c '
- until curl -X POST localhost:8000/v1/completions; do
+ until curl -sf http://localhost:8000/v1/models >/dev/null; do
sleep 1
- done' && return 0 || return 1
+ done
+ '
}
kill_processes_launched_by_current_bash() {
@@ -252,37 +254,16 @@ run_benchmark_tests() {
done
}
-run_latency_tests() {
- run_benchmark_tests "latency" "$1"
-}
+run_latency_tests() { run_benchmark_tests "latency" "$1"; }
+run_startup_tests() { run_benchmark_tests "startup" "$1"; }
+run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
-run_startup_tests() {
- run_benchmark_tests "startup" "$1"
-}
-
-run_throughput_tests() {
- run_benchmark_tests "throughput" "$1"
-}
-
-run_serving_tests() {
- # run serving tests using `vllm bench serve` command
- # $1: a json file specifying serving test cases
- #
- # Supported JSON formats:
- # 1) Plain format: top-level array
- # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
- #
- # 2) Default parameters field + plain format tests
- # {
- # "defaults": { ... },
- # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
- # }
-
- local serving_test_file
- serving_test_file=$1
-
- # Iterate over serving tests
- jq -c '
+merge_serving_tests_stream() {
+ # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
+ # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
+ local serving_test_file="$1"
+ # shellcheck disable=SC2016
+ local merged='
if type == "array" then
# Plain format: test cases array
.[]
@@ -304,7 +285,50 @@ run_serving_tests() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
- ' "$serving_test_file" | while read -r params; do
+ '
+
+ jq -c "$merged" "$serving_test_file" | \
+ if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
+ jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
+ select((($model|length)==0)
+ or ((.server_parameters.model // "") == $model)
+ or ((.client_parameters.model // "") == $model))
+ | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
+ '
+ else
+ cat
+ fi
+}
+
+run_serving_tests() {
+ # run serving tests using `vllm bench serve` command
+ # $1: a json file specifying serving test cases
+ #
+ # Supported JSON formats:
+ # 1) Plain format: top-level array
+ # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ #
+ # 2) Default parameters field + plain format tests
+ # {
+ # "defaults": { ... },
+ # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ # }
+
+ local serving_test_file
+ serving_test_file=$1
+
+ # In dry-run mode, if filters are provided but no tests match, report it on stderr and return early (status 0).
+ if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
+ local count
+ count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
+ if [[ "$count" -eq 0 ]]; then
+ echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
+ return 0
+ fi
+ fi
+
+ # Iterate over serving tests (merged + optional filtered stream)
+ merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -373,7 +397,7 @@ run_serving_tests() {
echo "Server command: $server_command"
# support remote vllm server
client_remote_args=""
- if [[ -z "${REMOTE_HOST}" ]]; then
+ if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -384,6 +408,9 @@ run_serving_tests() {
echo ""
echo "vLLM failed to start within the timeout period."
fi
+ elif [[ "${DRY_RUN:-0}" == "1" ]]; then
+ # dry-run: don't start server
+ echo "Dry Run."
else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
@@ -402,9 +429,7 @@ run_serving_tests() {
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
- echo "qps was $qps"
qps="inf"
- echo "now qps is $qps"
fi
# iterate over different max_concurrency
@@ -425,7 +450,9 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
- bash -c "$client_command"
+ if [[ "${DRY_RUN:-0}" != "1" ]]; then
+ bash -c "$client_command"
+ fi
# record the benchmarking commands
jq_output=$(jq -n \
@@ -443,12 +470,15 @@ run_serving_tests() {
done
# clean up
- kill -9 $server_pid
- kill_gpu_processes
+ if [[ "${DRY_RUN:-0}" != "1" ]]; then
+ kill -9 $server_pid
+ kill_gpu_processes
+ fi
done
}
main() {
+
local ARCH
ARCH=''
if [[ "$ON_CPU" == "1" ]]; then
@@ -458,7 +488,13 @@ main() {
check_gpus
ARCH="$arch_suffix"
fi
- check_hf_token
+
+ # DRY_RUN does not execute vLLM; do not require HF_TOKEN.
+ if [[ "${DRY_RUN:-0}" != "1" ]]; then
+ check_hf_token
+ else
+ echo "DRY_RUN=1 -> skip HF_TOKEN validation"
+ fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -479,11 +515,16 @@ main() {
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
-
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
- run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
+ run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
+
+ if [[ "${DRY_RUN:-0}" == "1" ]]; then
+ echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
+ exit 0
+ fi
+
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
new file mode 100644
index 000000000..6d3455c47
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
@@ -0,0 +1,41 @@
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [
+ 32,
+ 64,
+ 128
+ ],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "dtype": "bfloat16",
+ "model": "jinaai/jina-embeddings-v3",
+ "trust_remote_code": ""
+ },
+ "client_parameters": {
+ "model": "jinaai/jina-embeddings-v3",
+ "backend": "openai-embeddings",
+ "endpoint": "/v1/embeddings",
+ "dataset_name": "sharegpt",
+ "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_jina_embed_v3_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {}
+ }
+ ]
+}
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
new file mode 100644
index 000000000..25ed7415e
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -0,0 +1,283 @@
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "num_prompts": 200
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama3B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_granite2B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen4B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen8B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_glm9B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_gemma7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "google/gemma-7b",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "google/gemma-7b",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ }
+ ]
+}
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index 25ed7415e..e34ddcb6d 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -148,136 +148,6 @@
"random-input-len": 2048,
"random-output-len": 128
}
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 2
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 4
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_llama3B_tp1_random_128_128",
- "server_parameters": {
- "model": "meta-llama/Llama-3.2-3B-Instruct",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.2-3B-Instruct",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_granite2B_tp1_random_128_128",
- "server_parameters": {
- "model": "ibm-granite/granite-3.2-2b-instruct",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "ibm-granite/granite-3.2-2b-instruct",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_qwen1.7B_tp1_random_128_128",
- "server_parameters": {
- "model": "Qwen/Qwen3-1.7B",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "Qwen/Qwen3-1.7B",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_qwen4B_tp1_random_128_128",
- "server_parameters": {
- "model": "Qwen/Qwen3-4B",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "Qwen/Qwen3-4B",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_qwen8B_tp1_random_128_128",
- "server_parameters": {
- "model": "Qwen/Qwen3-8B",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "Qwen/Qwen3-8B",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_glm9B_tp1_random_128_128",
- "server_parameters": {
- "model": "zai-org/glm-4-9b-hf",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "zai-org/glm-4-9b-hf",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
- },
- {
- "test_name": "serving_gemma7B_tp1_random_128_128",
- "server_parameters": {
- "model": "google/gemma-7b",
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "model": "google/gemma-7b",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128
- }
}
]
}
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index aaa9b28ab..431de0d6a 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -176,7 +176,7 @@ For the full and up-to-date list of models validated on CPU platforms, please se
### How to find benchmark configuration examples for supported CPU models?
-For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](../../../.buildkite/performance-benchmarks/tests/serving-tests-cpu.json)
+For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [serving-tests-cpu.json](../../../.buildkite/performance-benchmarks/tests/serving-tests-cpu.json). The full sets of test cases for text-only, multimodal, and embedding models are defined in `serving-tests-cpu-text.json`, `serving-tests-cpu-multimodal.json`, and `serving-tests-cpu-embed.json`, respectively.
For details on how these optimized configurations are determined, see: [performance-benchmark-details](../../../.buildkite/performance-benchmarks/README.md#performance-benchmark-details).
To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](../../benchmarking/dashboard.md#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
@@ -199,6 +199,28 @@ lscpu | grep "NUMA node(s):" | awk '{print $3}'
For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu)
, which publishes default-model CPU results produced using the same Benchmark Suite.
+#### Dry-Run
+
+For users who only need the optimized runtime configurations and do not want to run the benchmarks, a dry-run mode is provided.
+When the environment variable `DRY_RUN=1` is set for `run-performance-benchmarks.sh`,
+the serving benchmark commands are generated under `./benchmarks/results/` without being executed.
+
+```bash
+ON_CPU=1 DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+By providing a different JSON file via `SERVING_JSON`, users can get runtime configurations for other models, such as embedding models.
+
+```bash
+ON_CPU=1 SERVING_JSON=serving-tests-cpu-embed.json DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+By setting `MODEL_FILTER` and/or `DTYPE_FILTER`, only the commands for the matching model ID and data type are generated.
+
+```bash
+ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-llama/Llama-3.1-8B-Instruct DTYPE_FILTER=bfloat16 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.