more models for vLLM Benchmark Suite (#35086)

Signed-off-by: louie-tsai <louie.tsai@intel.com>
2026-03-11 20:36:51 -07:00
parent 8647c6cf51
commit 17852aa503
8 changed files with 801 additions and 120 deletions
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
 MODEL_FILTER="${MODEL_FILTER:-}"
 DTYPE_FILTER="${DTYPE_FILTER:-}"

+# Adaptive search controls
+ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
+SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
+SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
+ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
+ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
+
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
@@ -183,6 +190,304 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

+# -------------------------------
+# Adaptive concurrency helpers
+# -------------------------------
+result_json_path_for_serving() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency=$3
+  echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
+}
+
+extract_metric_ms() {
+  local metric_name=$1
+  local json_file=$2
+
+  [[ -f "$json_file" ]] || return 0
+
+  if [[ "$metric_name" == "ttft" ]]; then
+    jq -r '
+      [
+        .ttft_ms.p99?,
+        .metrics.ttft_ms.p99?,
+        .ttft.p99?,
+        .metrics.ttft.p99?,
+        .p99_ttft_ms?,
+        .ttft_ms.mean?,
+        .metrics.ttft_ms.mean?,
+        .ttft.mean?,
+        .metrics.ttft.mean?,
+        .mean_ttft_ms?
+      ] | map(select(. != null)) | .[0] // empty
+    ' "$json_file"
+  else
+    jq -r '
+      [
+        .tpot_ms.p99?,
+        .metrics.tpot_ms.p99?,
+        .tpot.p99?,
+        .metrics.tpot.p99?,
+        .p99_tpot_ms?,
+        .itl_ms.p99?,
+        .metrics.itl_ms.p99?,
+        .inter_token_latency_ms.p99?,
+        .tpot_ms.mean?,
+        .metrics.tpot_ms.mean?,
+        .tpot.mean?,
+        .metrics.tpot.mean?,
+        .itl_ms.mean?,
+        .metrics.itl_ms.mean?,
+        .mean_tpot_ms?,
+        .mean_itl_ms?
+      ] | map(select(. != null)) | .[0] // empty
+    ' "$json_file"
+  fi
+}
+
+evaluate_sla_from_json() {
+  local json_file=$1
+  local ttft
+  local tpot
+  local pass
+
+  [[ -f "$json_file" ]] || return 2
+
+  ttft=$(extract_metric_ms ttft "$json_file")
+  tpot=$(extract_metric_ms tpot "$json_file")
+
+  [[ -n "$ttft" && -n "$tpot" ]] || return 2
+
+  pass=$(jq -n \
+    --argjson ttft "$ttft" \
+    --argjson tpot "$tpot" \
+    --argjson sla_ttft "$SLA_TTFT_MS" \
+    --argjson sla_tpot "$SLA_TPOT_MS" \
+    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
+
+  [[ "$pass" == "true" ]]
+}
+
+write_adaptive_summary_json() {
+  local summary_file=$1
+  local test_name=$2
+  local qps=$3
+  local static_last_pass=$4
+  local static_first_fail=$5
+  local final_last_pass=$6
+  local final_first_fail=$7
+
+  jq -n \
+    --arg test_name "$test_name" \
+    --arg qps "$qps" \
+    --argjson sla_ttft "$SLA_TTFT_MS" \
+    --argjson sla_tpot "$SLA_TPOT_MS" \
+    --arg static_last_pass "${static_last_pass:-}" \
+    --arg static_first_fail "${static_first_fail:-}" \
+    --arg final_last_pass "${final_last_pass:-}" \
+    --arg final_first_fail "${final_first_fail:-}" \
+    '{
+      test_name: $test_name,
+      qps: $qps,
+      sla_ttft_ms: $sla_ttft,
+      sla_tpot_ms: $sla_tpot,
+      static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
+      static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
+      final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
+      final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
+    }' > "$summary_file"
+}
+
+run_single_serving_probe() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency=$3
+  local tp=$4
+  local compilation_config_mode=$5
+  local optimization_level=$6
+  local client_args_effective=$7
+  local client_remote_args=$8
+  local server_command=$9
+
+  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
+  local result_json
+  local num_prompts_arg=""
+  local client_command
+
+  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
+
+  if [[ -f "$result_json" ]]; then
+    evaluate_sla_from_json "$result_json"
+    return $?
+  fi
+
+  if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
+    if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
+    if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
+    num_prompts_arg="--num-prompts $num_prompts"
+  fi
+
+  client_command="vllm bench serve \
+    --save-result \
+    --result-dir $RESULTS_FOLDER \
+    --result-filename ${new_test_name}.json \
+    --request-rate $qps \
+    --max-concurrency $max_concurrency \
+    $num_prompts_arg \
+    --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
+    $client_args_effective $client_remote_args "
+
+  echo "Adaptive probe: $client_command"
+
+  if [[ "${DRY_RUN:-0}" != "1" ]]; then
+    bash -c "$client_command"
+  fi
+
+  jq_output=$(jq -n \
+    --arg server "$server_command" \
+    --arg client "$client_command" \
+    --arg gpu "$gpu_type" \
+    '{
+      server_command: $server,
+      client_command: $client,
+      gpu_type: $gpu,
+      adaptive_search: true
+    }')
+  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+
+  evaluate_sla_from_json "$result_json"
+}
+
+adaptive_refine_from_static_results() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency_list_raw=$3
+  local tp=$4
+  local compilation_config_mode=$5
+  local optimization_level=$6
+  local client_args_effective=$7
+  local client_remote_args=$8
+  local server_command=$9
+
+  local sorted_points
+  local point
+  local rc
+  local static_last_pass=""
+  local static_first_fail=""
+  local largest_static=""
+  local step_hint=1
+  local previous_point=""
+  local low
+  local high
+  local mid
+  local probes=0
+  local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
+
+  [[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
+  [[ "${DRY_RUN:-0}" != "1" ]] || return 0
+
+  sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
+  [[ -n "$sorted_points" ]] || return 0
+
+  while read -r point; do
+    [[ -z "$point" ]] && continue
+    largest_static="$point"
+    evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
+    rc=$?
+    if (( rc == 0 )); then
+      static_last_pass="$point"
+    elif (( rc == 1 )); then
+      if [[ -n "$static_last_pass" ]]; then
+        static_first_fail="$point"
+        break
+      fi
+    fi
+
+    if [[ -n "$previous_point" ]]; then
+      step_hint=$(( point - previous_point ))
+      if (( step_hint < 1 )); then step_hint=1; fi
+    fi
+    previous_point="$point"
+  done <<< "$sorted_points"
+
+  if [[ -z "$static_last_pass" ]]; then
+    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
+    return 0
+  fi
+
+  if [[ -n "$static_first_fail" ]]; then
+    low=$static_last_pass
+    high=$static_first_fail
+    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+      mid=$(( (low + high) / 2 ))
+      probes=$(( probes + 1 ))
+      run_single_serving_probe \
+        "$test_name" "$qps" "$mid" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
+      rc=$?
+      if (( rc == 0 )); then
+        low=$mid
+      elif (( rc == 1 )); then
+        high=$mid
+      else
+        break
+      fi
+    done
+    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
+    return 0
+  fi
+
+  low=$largest_static
+  high=""
+  while (( probes < ADAPTIVE_MAX_PROBES )); do
+    point=$(( low + step_hint ))
+    if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
+      point=$ADAPTIVE_MAX_CONCURRENCY
+    fi
+    (( point > low )) || break
+    probes=$(( probes + 1 ))
+    run_single_serving_probe \
+      "$test_name" "$qps" "$point" "$tp" \
+      "$compilation_config_mode" "$optimization_level" \
+      "$client_args_effective" "$client_remote_args" "$server_command"
+    rc=$?
+    if (( rc == 0 )); then
+      low=$point
+      (( point == ADAPTIVE_MAX_CONCURRENCY )) && break
+      step_hint=$(( step_hint * 2 ))
+      if (( step_hint < 1 )); then step_hint=1; fi
+    elif (( rc == 1 )); then
+      high=$point
+      break
+    else
+      break
+    fi
+  done
+
+  if [[ -n "$high" ]]; then
+    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+      mid=$(( (low + high) / 2 ))
+      probes=$(( probes + 1 ))
+      run_single_serving_probe \
+        "$test_name" "$qps" "$mid" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
+      rc=$?
+      if (( rc == 0 )); then
+        low=$mid
+      elif (( rc == 1 )); then
+        high=$mid
+      else
+        break
+      fi
+    done
+  fi
+
+  write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
+}
+
 run_benchmark_tests() {
  # run benchmark tests using `vllm bench <test_type>` command
  # $1: test type (latency or throughput)
@@ -347,10 +652,48 @@ run_serving_tests() {
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')

-    server_args=$(json2args "$server_params")
+    # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
+    server_model=$(echo "$server_params" | jq -r '.model // empty')
+    if [[ -z "$server_model" || "$server_model" == "null" ]]; then
+      echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
+      exit 1
+    fi
+    server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
+    server_args=$(json2args "$server_params_no_model")
+
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")

+    # ------------------------------------------------------------
+    # Option 1: Dynamic num-prompts scaling based on max_concurrency
+    #
+    # If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
+    #   num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
+    #
+    # If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
+    # unchanged (i.e., whatever is in serving-tests-*.json).
+    # ------------------------------------------------------------
+    PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}"  # no default on purpose
+    MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
+    MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
+
+    if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
+      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
+      # Handles: --num-prompts 123   and   --num-prompts=123
+      client_args_no_np="$(
+        printf ' %s ' "$client_args" \
+        | sed -E \
+          -e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
+          -e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
+      )"
+      # normalize whitespace
+      client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
+      client_args_no_np="$(echo "$client_args_no_np" | xargs)"
+      client_args_effective="$client_args_no_np"
+    else
+      client_args_effective="$client_args"
+    fi
    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -382,14 +725,13 @@ run_serving_tests() {
    fi

    # check if server model and client model is aligned
-    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

-    server_command="$server_envs vllm serve \
+    server_command="$server_envs vllm serve $server_model \
      $server_args"

    # run the server
@@ -436,6 +778,14 @@ run_serving_tests() {
      for max_concurrency in $max_concurrency_list; do
        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
        echo " new test name $new_test_name"
+        # If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
+        num_prompts_arg=""
+        if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+          num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
+          if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
+          if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
+          num_prompts_arg="--num-prompts $num_prompts"
+        fi
        # pass the tensor parallel size, the compilation mode, and the optimization
        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
@@ -444,8 +794,9 @@ run_serving_tests() {
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
+          $num_prompts_arg \
          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
-          $client_args $client_remote_args "
+          $client_args_effective $client_remote_args "

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"
@@ -467,6 +818,11 @@ run_serving_tests() {
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

      done
+
+      adaptive_refine_from_static_results \
+        "$test_name" "$qps" "$max_concurrency_list" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
    done

    # clean up
@@ -532,6 +888,7 @@ main() {
  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+  python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json

  upload_to_buildkite
 }