Vllm CPU benchmark suite improvement (#34128)

Signed-off-by: louie-tsai <louie.tsai@intel.com>
2026-02-12 00:04:44 -08:00
parent 386bfe5d08
commit 55a1a9563a
6 changed files with 799 additions and 251 deletions
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,6 +1,4 @@
 #!/bin/bash
-
-# This script should be run inside the CI process
 # This script assumes that we are already inside the vllm/ directory
 # Benchmarking results will be available inside vllm/benchmarks/results/

@@ -9,6 +7,11 @@
 set -x
 set -o pipefail

+# Environment-driven debug controls (like ON_CPU=1)
+DRY_RUN="${DRY_RUN:-0}"
+MODEL_FILTER="${MODEL_FILTER:-}"
+DTYPE_FILTER="${DTYPE_FILTER:-}"
+
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
@@ -112,13 +115,12 @@ json2envs() {
 }

 wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
  local timeout_val="1200"
  timeout "$timeout_val" bash -c '
-    until curl -X POST localhost:8000/v1/completions; do
+    until curl -sf http://localhost:8000/v1/models >/dev/null; do
      sleep 1
-    done' && return 0 || return 1
+    done
+  '
 }

 kill_processes_launched_by_current_bash() {
@@ -252,37 +254,16 @@ run_benchmark_tests() {
  done
 }

-run_latency_tests() {
-  run_benchmark_tests "latency" "$1"
-}
+run_latency_tests() { run_benchmark_tests "latency" "$1"; }
+run_startup_tests() { run_benchmark_tests "startup" "$1"; }
+run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }

-run_startup_tests() {
-  run_benchmark_tests "startup" "$1"
-}
-
-run_throughput_tests() {
-  run_benchmark_tests "throughput" "$1"
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-  #
-  # Supported JSON formats:
-  # 1) Plain format: top-level array
-  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #
-  # 2) Default parameters field + plain format tests
-  #    {
-  #      "defaults": { ... },
-  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #    }
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '
+merge_serving_tests_stream() {
+  # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
+  # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
+  local serving_test_file="$1"
+  # shellcheck disable=SC2016
+  local merged='
    if type == "array" then
      # Plain format: test cases array
      .[]
@@ -304,7 +285,50 @@ run_serving_tests() {
    else
      error("Unsupported serving test file format: must be array or object with .tests")
    end
-  ' "$serving_test_file" | while read -r params; do
+  '
+
+  jq -c "$merged" "$serving_test_file" | \
+  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
+    jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
+      select((($model|length)==0)
+             or ((.server_parameters.model // "") == $model)
+             or ((.client_parameters.model // "") == $model))
+      | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
+    '
+  else
+    cat
+  fi
+}
+
+run_serving_tests() {
+  # run serving tests using `vllm bench serve` command
+  # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # In dry-run mode, if filters are provided but no tests match, fail fast.
+  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
+    local count
+    count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
+    if [[ "$count" -eq 0 ]]; then
+      echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
+      return 0
+    fi
+  fi
+
+  # Iterate over serving tests (merged + optional filtered stream)
+  merge_serving_tests_stream "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -373,7 +397,7 @@ run_serving_tests() {
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
-    if [[ -z "${REMOTE_HOST}" ]]; then
+    if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
@@ -384,6 +408,9 @@ run_serving_tests() {
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
+    elif [[ "${DRY_RUN:-0}" == "1" ]]; then
+        # dry-run: don't start server
+        echo "Dry Run."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
@@ -402,9 +429,7 @@ run_serving_tests() {
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
        qps="inf"
-        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
@@ -425,7 +450,9 @@ run_serving_tests() {
        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

-        bash -c "$client_command"
+        if [[ "${DRY_RUN:-0}" != "1" ]]; then
+          bash -c "$client_command"
+        fi

        # record the benchmarking commands
        jq_output=$(jq -n \
@@ -443,12 +470,15 @@ run_serving_tests() {
    done

    # clean up
-    kill -9 $server_pid
-    kill_gpu_processes
+    if [[ "${DRY_RUN:-0}" != "1" ]]; then
+      kill -9 $server_pid
+      kill_gpu_processes
+    fi
  done
 }

 main() {
+
  local ARCH
  ARCH=''
  if [[ "$ON_CPU" == "1" ]]; then
@@ -458,7 +488,13 @@ main() {
     check_gpus
     ARCH="$arch_suffix"
  fi
-  check_hf_token
+
+  # DRY_RUN does not execute vLLM; do not require HF_TOKEN.
+  if [[ "${DRY_RUN:-0}" != "1" ]]; then
+    check_hf_token
+  else
+    echo "DRY_RUN=1 -> skip HF_TOKEN validation"
+  fi

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -479,11 +515,16 @@ main() {

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
-
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
+
+  if [[ "${DRY_RUN:-0}" == "1" ]]; then
+    echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
+    exit 0
+  fi
+
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"