diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index b3d0a2d3b..ead097411 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -9,8 +9,10 @@ import json import os from dataclasses import dataclass from importlib import util +from pathlib import Path import pandas as pd +import regex as re pd.options.display.float_format = "{:.2f}".format plotly_found = util.find_spec("plotly.express") is not None @@ -275,6 +277,131 @@ def _apply_two_decimals( return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") +# ----------------------------- +# Export helpers (Excel + CSV) +# ----------------------------- +def _sanitize_sheet_name(name: str) -> str: + """ + Excel sheet constraints: + - max 31 chars + - cannot contain: : \ / ? * [ ] + - cannot be empty + """ + name = "sheet" if name is None else str(name) + name = re.sub(r"[:\\/?*\[\]]", "_", name) + name = name.strip().strip("'") + name = re.sub(r"\s+", " ", name) + if not name: + name = "sheet" + return name[:31] + + +def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str: + d = dict(zip(group_cols, gkey_tuple)) + model = d.get("Model", "model") + model_short = str(model).split("/")[-1] + ilen = d.get("Input Len", "") + olen = d.get("Output Len", "") + lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else "" + return _sanitize_sheet_name(f"{model_short}{lens}") + + +def _write_tables_to_excel_sheet( + writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]] +): + startrow = 0 + for title, df in blocks: + pd.DataFrame([[title]]).to_excel( + writer, sheet_name=sheet, index=False, header=False, startrow=startrow + ) + startrow += 1 + df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow) + startrow += len(df) + 3 + + +def _safe_filename(s: str) -> str: + s = 
re.sub(r"[^\w\-.]+", "_", str(s).strip()) + return s[:180] if len(s) > 180 else s + + +# ----------------------------- +# vLLM environment export helper +# ----------------------------- +def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame: + """Parse vllm_env.txt into a flat table (Section, Key, Value). + + Supports: + - section headers as standalone lines (no ':' or '=') + - key-value lines like 'OS: Ubuntu ...' + - env var lines like 'HF_HOME=/data/hf' + """ + lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines() + section = "General" + rows: list[dict] = [] + + def set_section(s: str): + nonlocal section + s = (s or "").strip() + if s: + section = s + + for raw in lines: + stripped = raw.strip() + if not stripped: + continue + # divider lines like ===== + if set(stripped) <= {"="}: + continue + + # section header heuristic: short standalone line + if ":" not in stripped and "=" not in stripped and len(stripped) <= 64: + if stripped.lower().startswith("collecting environment information"): + continue + set_section(stripped) + continue + + # env var style: KEY=VALUE (and not a URL with :) + if "=" in stripped and ":" not in stripped: + k, v = stripped.split("=", 1) + k = k.strip() + v = v.strip() + if k: + rows.append({"Section": section, "Key": k, "Value": v}) + continue + + # key: value + if ":" in stripped: + k, v = stripped.split(":", 1) + k = k.strip() + v = v.strip() + if k: + rows.append({"Section": section, "Key": k, "Value": v}) + continue + + return pd.DataFrame(rows, columns=["Section", "Key", "Value"]) + + +def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None: + """Load vllm_env.txt next to the *original* input JSON file. + + Note: when only one -f is provided, the script may split JSON into ./splits/..., + but vllm_env.txt typically lives next to the original benchmark_results.json. 
+ """ + base_dir: Path | None = None + if getattr(args, "file", None): + base_dir = Path(args.file[0]).resolve().parent + elif files: + base_dir = Path(files[0]).resolve().parent + if base_dir is None: + return None + + env_path = base_dir / "vllm_env.txt" + if not env_path.exists(): + return None + df = _parse_vllm_env_txt(env_path) + return df + + # ----------------------------- # Valid max concurrency summary helpers # ----------------------------- @@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html( summary_df = pd.DataFrame(rows) - # --- Coerce numeric columns so Styler doesn't miss them due to object dtype --- for c in summary_df.columns: if c == "Configuration": continue @@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html( both_col = f"Max {conc_col} (Both)" - # --- Strict 2-decimal formatting for ALL non-Configuration columns --- formatters = {} for c in summary_df.columns: if c == "Configuration": continue - # default argument binds per-column formatter correctly formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" styler = summary_df.style.format(formatters) @@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html( return title + styler.to_html(table_attributes='border="1" class="dataframe"') +def build_valid_max_concurrency_summary_df( + tput_group_df: pd.DataFrame | None, + ttft_group_df: pd.DataFrame | None, + tpot_group_df: pd.DataFrame | None, + conc_col: str, + args, +) -> pd.DataFrame | None: + if ttft_group_df is None and tpot_group_df is None: + return None + + ttft_cols = ( + _config_value_columns(ttft_group_df, conc_col) + if ttft_group_df is not None + else [] + ) + tpot_cols = ( + _config_value_columns(tpot_group_df, conc_col) + if tpot_group_df is not None + else [] + ) + tput_cols = ( + _config_value_columns(tput_group_df, conc_col) + if tput_group_df is not None + else [] + ) + + if ttft_group_df is not None and tpot_group_df is not None: + cfg_cols = [c for c in ttft_cols if c in 
tpot_cols] + if tput_group_df is not None: + cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols + else: + cfg_cols = ttft_cols or tpot_cols + + if not cfg_cols: + cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) + + rows = [] + for cfg in cfg_cols: + ttft_max = ( + _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) + if ttft_group_df is not None + else pd.NA + ) + tpot_max = ( + _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) + if tpot_group_df is not None + else pd.NA + ) + both = ( + pd.NA + if (pd.isna(ttft_max) or pd.isna(tpot_max)) + else min(ttft_max, tpot_max) + ) + + tput_at_both = ( + _value_at_concurrency(tput_group_df, conc_col, cfg, both) + if tput_group_df is not None + else pd.NA + ) + ttft_at_both = ( + _value_at_concurrency(ttft_group_df, conc_col, cfg, both) + if ttft_group_df is not None + else pd.NA + ) + tpot_at_both = ( + _value_at_concurrency(tpot_group_df, conc_col, cfg, both) + if tpot_group_df is not None + else pd.NA + ) + + rows.append( + { + "Configuration": cfg, + f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max, + f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max, + f"Max {conc_col} (Both)": both, + "Output Tput @ Both (tok/s)": tput_at_both, + "TTFT @ Both (ms)": ttft_at_both, + "TPOT @ Both (ms)": tpot_at_both, + } + ) + + df = pd.DataFrame(rows) + for c in df.columns: + if c != "Configuration": + df[c] = pd.to_numeric(df[c], errors="coerce") + return df + + # ----------------------------- # Plot helper # ----------------------------- @@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser: default=100.0, help="Reference limit for TPOT plots (ms)", ) + + # ---- NEW: export options ---- + parser.add_argument( + "--excel-out", + type=str, + default="perf_comparison.xlsx", + help="Write one sheet per (Model, Dataset, Input Len, Output Len).", + ) + parser.add_argument( + "--csv-out-dir", + type=str, + default="", + help="If 
set, write per-group per-metric CSVs into this directory.", + ) + return parser @@ -657,7 +885,6 @@ def maybe_write_plot( markers=True, ) - # Ensure plot hover + y tick labels are also 2 decimals. fig.update_traces(hovertemplate="%{y:.2f}") fig.update_yaxes(tickformat=".2f") @@ -730,87 +957,151 @@ def write_report_group_first( for metric_label, (df, _) in metric_cache.items() } - with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: - main_fh.write('\n') - for gkey in group_keys: - gkey_tuple = normalize_group_key(gkey) - suffix = build_group_suffix(group_cols_canonical, gkey_tuple) - sub_path = group_filename(gkey_tuple) - group_header = ( - '
' - f"{_html.escape(suffix)}" - "
\n" - ) + csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None + if csv_dir: + csv_dir.mkdir(parents=True, exist_ok=True) - main_fh.write(group_header) - with open(sub_path, "w", encoding="utf-8") as sub_fh: - sub_fh.write('\n') - sub_fh.write(group_header) - tput_group_df = None - ttft_group_df = None - tpot_group_df = None - conc_col = args.xaxis + excel_path = args.excel_out or "perf_comparison.xlsx" + with pd.ExcelWriter(excel_path, engine="openpyxl") as xw: + # ---- Environment sheet (first) ---- + env_sheet = _sanitize_sheet_name("Environment") + env_df = _load_env_df_for_inputs(args, files) + if env_df is None or env_df.empty: + pd.DataFrame( + [ + { + "Section": "Environment", + "Key": "vllm_env.txt", + "Value": "NOT FOUND (or empty)", + } + ] + ).to_excel(xw, sheet_name=env_sheet, index=False) + else: + env_df.to_excel(xw, sheet_name=env_sheet, index=False) + with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: + main_fh.write('\n') + for gkey in group_keys: + gkey_tuple = normalize_group_key(gkey) + suffix = build_group_suffix(group_cols_canonical, gkey_tuple) + sub_path = group_filename(gkey_tuple) + group_header = ( + '
' + f"{_html.escape(suffix)}" + "
\n" + ) - for metric_label in plan.data_cols: - gb = metric_groupbys[metric_label] - df_sorted, raw_data_cols = metric_cache[metric_label] + main_fh.write(group_header) - try: - group_df = gb.get_group(gkey) - except KeyError: - missing = ( - '
' - f"{_html.escape(metric_label)} — missing for this group" - "
\n" + sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple) + sheet_base = sheet + dedup_i = 1 + while sheet in xw.sheets: + dedup_i += 1 + sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}") + + excel_blocks: list[tuple[str, pd.DataFrame]] = [] + + with open(sub_path, "w", encoding="utf-8") as sub_fh: + sub_fh.write('\n') + sub_fh.write(group_header) + tput_group_df = None + ttft_group_df = None + tpot_group_df = None + conc_col = args.xaxis + + for metric_label in plan.data_cols: + gb = metric_groupbys[metric_label] + df_sorted, raw_data_cols = metric_cache[metric_label] + + try: + group_df = gb.get_group(gkey) + except KeyError: + missing = ( + '
' + f"{_html.escape(metric_label)} — missing for this group" + "
\n" + ) + main_fh.write(missing) + sub_fh.write(missing) + continue + + if conc_col not in group_df.columns: + conc_col = _find_concurrency_col(group_df) + + mn = metric_label.lower().strip() + if "tok/s" in mn: + tput_group_df = group_df + elif "ttft" in mn: + ttft_group_df = group_df + elif mn in ("p99", "median") or "tpot" in mn: + tpot_group_df = group_df + + display_group = group_df.drop( + columns=group_cols_canonical, errors="ignore" ) - main_fh.write(missing) - sub_fh.write(missing) - continue + html = render_metric_table_html( + display_group, metric_label, suffix, args + ) + main_fh.write(html) + sub_fh.write(html) - if conc_col not in group_df.columns: - conc_col = _find_concurrency_col(group_df) + maybe_write_plot( + main_fh, + sub_fh, + group_df=group_df, + raw_data_cols=raw_data_cols, + metric_label=metric_label, + y_axis_col=y_axis_col, + args=args, + ) - mn = metric_label.lower().strip() - if "tok/s" in mn: - tput_group_df = group_df - elif "ttft" in mn: - ttft_group_df = group_df - elif mn in ("p99", "median") or "tpot" in mn: - tpot_group_df = group_df + excel_blocks.append( + (metric_label, display_group.reset_index(drop=True)) + ) + if csv_dir: + fn = _safe_filename( + f"{sheet}__{metric_label}".replace(" ", "_").replace( + "/", "_" + ) + ) + display_group.to_csv(csv_dir / f"{fn}.csv", index=False) - display_group = group_df.drop( - columns=group_cols_canonical, errors="ignore" - ) - - html = render_metric_table_html( - display_group, metric_label, suffix, args - ) - main_fh.write(html) - sub_fh.write(html) - - maybe_write_plot( - main_fh, - sub_fh, - group_df=group_df, - raw_data_cols=raw_data_cols, - metric_label=metric_label, - y_axis_col=y_axis_col, + summary_html = build_valid_max_concurrency_summary_html( + tput_group_df=tput_group_df, + ttft_group_df=ttft_group_df, + tpot_group_df=tpot_group_df, + conc_col=conc_col, args=args, ) + if summary_html: + main_fh.write(summary_html) + sub_fh.write(summary_html) - summary_html = 
build_valid_max_concurrency_summary_html( - tput_group_df=tput_group_df, - ttft_group_df=ttft_group_df, - tpot_group_df=tpot_group_df, - conc_col=conc_col, - args=args, - ) - if summary_html: - main_fh.write(summary_html) - sub_fh.write(summary_html) + summary_df = build_valid_max_concurrency_summary_df( + tput_group_df=tput_group_df, + ttft_group_df=ttft_group_df, + tpot_group_df=tpot_group_df, + conc_col=conc_col, + args=args, + ) + if summary_df is not None: + excel_blocks.append( + ("Valid Max Concurrency Summary", summary_df) + ) + if csv_dir: + fn = _safe_filename( + f"{sheet}__Valid_Max_Concurrency_Summary" + ) + summary_df.to_csv(csv_dir / f"{fn}.csv", index=False) + + _write_tables_to_excel_sheet(xw, sheet, excel_blocks) + + print(f"Wrote Excel: {excel_path}") + if csv_dir: + print(f"Wrote CSVs under: {csv_dir}") def main(): diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index d62c01bc7..7dabcf517 100755 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -1,6 +1,4 @@ #!/bin/bash - -# This script should be run inside the CI process # This script assumes that we are already inside the vllm/ directory # Benchmarking results will be available inside vllm/benchmarks/results/ @@ -9,6 +7,11 @@ set -x set -o pipefail +# Environment-driven debug controls (like ON_CPU=1) +DRY_RUN="${DRY_RUN:-0}" +MODEL_FILTER="${MODEL_FILTER:-}" +DTYPE_FILTER="${DTYPE_FILTER:-}" + check_gpus() { if command -v nvidia-smi; then # check the number of GPUs and GPU type. 
@@ -112,13 +115,12 @@ json2envs() { } wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes local timeout_val="1200" timeout "$timeout_val" bash -c ' - until curl -X POST localhost:8000/v1/completions; do + until curl -sf http://localhost:8000/v1/models >/dev/null; do sleep 1 - done' && return 0 || return 1 + done + ' } kill_processes_launched_by_current_bash() { @@ -252,37 +254,16 @@ run_benchmark_tests() { done } -run_latency_tests() { - run_benchmark_tests "latency" "$1" -} +run_latency_tests() { run_benchmark_tests "latency" "$1"; } +run_startup_tests() { run_benchmark_tests "startup" "$1"; } +run_throughput_tests() { run_benchmark_tests "throughput" "$1"; } -run_startup_tests() { - run_benchmark_tests "startup" "$1" -} - -run_throughput_tests() { - run_benchmark_tests "throughput" "$1" -} - -run_serving_tests() { - # run serving tests using `vllm bench serve` command - # $1: a json file specifying serving test cases - # - # Supported JSON formats: - # 1) Plain format: top-level array - # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] - # - # 2) Default parameters field + plain format tests - # { - # "defaults": { ... }, - # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] - # } - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c ' +merge_serving_tests_stream() { + # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode. + # This helper does NOT modify JSON; it only filters the stream in dry-run mode. 
+ local serving_test_file="$1" + # shellcheck disable=SC2016 + local merged=' if type == "array" then # Plain format: test cases array .[] @@ -304,7 +285,50 @@ run_serving_tests() { else error("Unsupported serving test file format: must be array or object with .tests") end - ' "$serving_test_file" | while read -r params; do + ' + + jq -c "$merged" "$serving_test_file" | \ + if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then + jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" ' + select((($model|length)==0) + or ((.server_parameters.model // "") == $model) + or ((.client_parameters.model // "") == $model)) + | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype)) + ' + else + cat + fi +} + +run_serving_tests() { + # run serving tests using `vllm bench serve` command + # $1: a json file specifying serving test cases + # + # Supported JSON formats: + # 1) Plain format: top-level array + # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # + # 2) Default parameters field + plain format tests + # { + # "defaults": { ... }, + # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # } + + local serving_test_file + serving_test_file=$1 + + # In dry-run mode, if filters are provided but no tests match, fail fast. + if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then + local count + count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ') + if [[ "$count" -eq 0 ]]; then + echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2 + return 0 + fi + fi + + # Iterate over serving tests (merged + optional filtered stream) + merge_serving_tests_stream "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') if [[ ! 
"$test_name" =~ ^serving_ ]]; then @@ -373,7 +397,7 @@ run_serving_tests() { echo "Server command: $server_command" # support remote vllm server client_remote_args="" - if [[ -z "${REMOTE_HOST}" ]]; then + if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then bash -c "$server_command" & server_pid=$! # wait until the server is alive @@ -384,6 +408,9 @@ run_serving_tests() { echo "" echo "vLLM failed to start within the timeout period." fi + elif [[ "${DRY_RUN:-0}" == "1" ]]; then + # dry-run: don't start server + echo "Dry Run." else server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" if [[ ${REMOTE_PORT} ]]; then @@ -402,9 +429,7 @@ run_serving_tests() { for qps in $qps_list; do # remove the surrounding single quote from qps if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" qps="inf" - echo "now qps is $qps" fi # iterate over different max_concurrency @@ -425,7 +450,9 @@ run_serving_tests() { echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" - bash -c "$client_command" + if [[ "${DRY_RUN:-0}" != "1" ]]; then + bash -c "$client_command" + fi # record the benchmarking commands jq_output=$(jq -n \ @@ -443,12 +470,15 @@ run_serving_tests() { done # clean up - kill -9 $server_pid - kill_gpu_processes + if [[ "${DRY_RUN:-0}" != "1" ]]; then + kill -9 $server_pid + kill_gpu_processes + fi done } main() { + local ARCH ARCH='' if [[ "$ON_CPU" == "1" ]]; then @@ -458,7 +488,13 @@ main() { check_gpus ARCH="$arch_suffix" fi - check_hf_token + + # DRY_RUN does not execute vLLM; do not require HF_TOKEN. 
+ if [[ "${DRY_RUN:-0}" != "1" ]]; then + check_hf_token + else + echo "DRY_RUN=1 -> skip HF_TOKEN validation" + fi # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) @@ -479,11 +515,16 @@ main() { # dump vllm info via vllm collect-env env_output=$(vllm collect-env) - echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" # benchmarking - run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $? + + if [[ "${DRY_RUN:-0}" == "1" ]]; then + echo "DRY_RUN=1 -> skip latency/startup/throughput suites" + exit 0 + fi + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json new file mode 100644 index 000000000..6d3455c47 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json @@ -0,0 +1,41 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [ + 32, + 64, + 128 + ], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "dtype": "bfloat16", + "model": "jinaai/jina-embeddings-v3", + "trust_remote_code": "" + }, + "client_parameters": { + "model": "jinaai/jina-embeddings-v3", + "backend": "openai-embeddings", + "endpoint": "/v1/embeddings", + "dataset_name": "sharegpt", + "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": 
"serving_jina_embed_v3_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": {} + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json new file mode 100644 index 000000000..25ed7415e --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json @@ -0,0 +1,283 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + 
"tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 2 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 4 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama3B_tp1_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_granite2B_tp1_random_128_128", + "server_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen1.7B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-1.7B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-1.7B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen4B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-4B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + 
"model": "Qwen/Qwen3-4B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen8B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_glm9B_tp1_random_128_128", + "server_parameters": { + "model": "zai-org/glm-4-9b-hf", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "zai-org/glm-4-9b-hf", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_gemma7B_tp1_random_128_128", + "server_parameters": { + "model": "google/gemma-7b", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "google/gemma-7b", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index 25ed7415e..e34ddcb6d 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -148,136 +148,6 @@ "random-input-len": 2048, "random-output-len": 128 } - }, - { - "test_name": "serving_llama8B_int4_tp1_random_128_128", - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_random_128_128", - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 2 - }, - "client_parameters": { - "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_random_128_128", - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 4 - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama3B_tp1_random_128_128", - "server_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_granite2B_tp1_random_128_128", - "server_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen1.7B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-1.7B", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen4B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-4B", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen8B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-8B", - "dataset_name": "random", - "random-input-len": 128, - 
"random-output-len": 128 - } - }, - { - "test_name": "serving_glm9B_tp1_random_128_128", - "server_parameters": { - "model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "zai-org/glm-4-9b-hf", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_gemma7B_tp1_random_128_128", - "server_parameters": { - "model": "google/gemma-7b", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "google/gemma-7b", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } } ] } diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index aaa9b28ab..431de0d6a 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -176,7 +176,7 @@ For the full and up-to-date list of models validated on CPU platforms, please se ### How to find benchmark configuration examples for supported CPU models? -For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](../../../.buildkite/performance-benchmarks/tests/serving-tests-cpu.json) +For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in cpu test cases as serving-tests-cpu.json. Full test cases for Text-only models, Multi-Modal models and Embedded models are in cpu Text-Only test cases as serving-tests-cpu-text.json, cpu Multi-Modal test cases as serving-tests-cpu-multimodal.json and cpu Embedded test cases as serving-tests-cpu-embed.json. 
For details on how these optimized configurations are determined, see: [performance-benchmark-details](../../../.buildkite/performance-benchmarks/README.md#performance-benchmark-details). To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](../../benchmarking/dashboard.md#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment. @@ -199,6 +199,28 @@ lscpu | grep "NUMA node(s):" | awk '{print $3}' For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu) , which publishes default-model CPU results produced using the same Benchmark Suite. +#### Dry-Run + +For users who only need the optimized runtime configurations without running the benchmark, a Dry-Run mode is provided. +By setting the environment variable `DRY_RUN=1` when invoking `run-performance-benchmarks.sh`, +all commands will be generated under `./benchmarks/results/` without being executed. + +```bash +ON_CPU=1 DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +``` + +By providing a different JSON file, users can get runtime configurations for different models, such as Embedded Models. + +```bash +ON_CPU=1 SERVING_JSON=serving-tests-cpu-embed.json DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +``` + +By providing `MODEL_FILTER` and `DTYPE_FILTER`, only commands for the matching model ID and data type will be generated. + +```bash +ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-llama/Llama-3.1-8B-Instruct DTYPE_FILTER=bfloat16 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +``` + ### How to decide `VLLM_CPU_OMP_THREADS_BIND`? - Default `auto` thread-binding is recommended for most cases. 
Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.