more models for vLLM Benchmark Suite (#35086)

Signed-off-by: louie-tsai <louie.tsai@intel.com>
This commit is contained in:
Louie Tsai
2026-03-11 20:36:51 -07:00
committed by GitHub
parent 8647c6cf51
commit 17852aa503
8 changed files with 801 additions and 120 deletions

View File

@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
# Filters default to the empty string when the caller does not provide them.
: "${MODEL_FILTER:=}"
: "${DTYPE_FILTER:=}"

# Adaptive search controls (all overridable from the environment).
# "1" turns on adaptive_refine_from_static_results; any other value disables it.
: "${ENABLE_ADAPTIVE_CONCURRENCY:=0}"
# SLA thresholds, in milliseconds, compared against the extracted TTFT / TPOT
# values by evaluate_sla_from_json.
: "${SLA_TTFT_MS:=3000}"
: "${SLA_TPOT_MS:=100}"
# Upper bound on the number of extra probes run per (test, qps) pair.
: "${ADAPTIVE_MAX_PROBES:=8}"
# Highest concurrency the upward search is allowed to probe.
: "${ADAPTIVE_MAX_CONCURRENCY:=1024}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
@@ -183,6 +190,304 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
# Print the first non-null latency value found in a benchmark result JSON.
# Arguments: $1 - metric selector: "ttft" picks the TTFT candidates, any other
#                 value picks the TPOT / inter-token-latency candidates
#            $2 - path to the result JSON
# Outputs:   the raw value on stdout, or nothing when no candidate key is set
# Returns:   0 without output when the file does not exist; otherwise jq's status
extract_metric_ms() {
  local metric_name=$1
  local json_file=$2
  local candidate_paths

  if [[ ! -f "$json_file" ]]; then
    return 0
  fi

  # Candidates are listed in priority order: every p99 spelling comes before
  # the mean spellings, so a p99 value wins whenever one is present.
  case "$metric_name" in
    ttft)
      candidate_paths='
        .ttft_ms.p99?,
        .metrics.ttft_ms.p99?,
        .ttft.p99?,
        .metrics.ttft.p99?,
        .p99_ttft_ms?,
        .ttft_ms.mean?,
        .metrics.ttft_ms.mean?,
        .ttft.mean?,
        .metrics.ttft.mean?,
        .mean_ttft_ms?
      '
      ;;
    *)
      candidate_paths='
        .tpot_ms.p99?,
        .metrics.tpot_ms.p99?,
        .tpot.p99?,
        .metrics.tpot.p99?,
        .p99_tpot_ms?,
        .itl_ms.p99?,
        .metrics.itl_ms.p99?,
        .inter_token_latency_ms.p99?,
        .tpot_ms.mean?,
        .metrics.tpot_ms.mean?,
        .tpot.mean?,
        .metrics.tpot.mean?,
        .itl_ms.mean?,
        .metrics.itl_ms.mean?,
        .mean_tpot_ms?,
        .mean_itl_ms?
      '
      ;;
  esac

  # Collect every candidate, drop the nulls, and emit the first survivor.
  jq -r "[${candidate_paths}]"' | map(select(. != null)) | .[0] // empty' "$json_file"
}
# Check one serving result file against the SLA thresholds.
# Globals:   SLA_TTFT_MS, SLA_TPOT_MS (read)
# Arguments: $1 - path to the result JSON
# Returns:   0 - both TTFT and TPOT are within their thresholds
#            1 - at least one metric exceeds its threshold
#            2 - the result cannot be judged: file missing, a metric missing,
#                or the comparison itself could not be carried out
evaluate_sla_from_json() {
  local json_file=$1
  local ttft
  local tpot
  local pass

  [[ -f "$json_file" ]] || return 2

  ttft=$(extract_metric_ms ttft "$json_file")
  tpot=$(extract_metric_ms tpot "$json_file")
  [[ -n "$ttft" && -n "$tpot" ]] || return 2

  # If jq cannot run the comparison (metric or SLA value is not valid JSON,
  # jq unavailable, ...) nothing was actually measured against the SLA, so
  # report "indeterminate" instead of letting the empty output read as a
  # violation that callers would then bisect on.
  pass=$(jq -n \
    --argjson ttft "$ttft" \
    --argjson tpot "$tpot" \
    --argjson sla_ttft "$SLA_TTFT_MS" \
    --argjson sla_tpot "$SLA_TPOT_MS" \
    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)') || return 2

  [[ "$pass" == "true" ]]
}
# Write the adaptive-search summary for one (test, qps) pair as JSON.
# Globals:   SLA_TTFT_MS, SLA_TPOT_MS (read; must be valid JSON values)
# Arguments: $1 - output file
#            $2 - test name
#            $3 - qps (stored as a string)
#            $4 - last passing concurrency among the static points
#            $5 - first failing concurrency among the static points
#            $6 - last passing concurrency after refinement
#            $7 - first failing concurrency after refinement
#            Arguments 4-7 may be empty; an empty value is written as null,
#            anything else is converted with jq's tonumber.
# Outputs:   overwrites $1 with the summary object
write_adaptive_summary_json() {
  local summary_file=$1
  local test_name=$2
  local qps=$3
  local static_last_pass=$4
  local static_first_fail=$5
  local final_last_pass=$6
  local final_first_fail=$7

  local -a jq_vars=(
    --arg test_name "$test_name"
    --arg qps "$qps"
    --argjson sla_ttft "$SLA_TTFT_MS"
    --argjson sla_tpot "$SLA_TPOT_MS"
    --arg static_last_pass "$static_last_pass"
    --arg static_first_fail "$static_first_fail"
    --arg final_last_pass "$final_last_pass"
    --arg final_first_fail "$final_first_fail"
  )

  # The four boundary values arrive as strings so that "" can stand for
  # "unknown"; number_or_null maps that to null and everything else to a number.
  local summary_filter='
    def number_or_null: if . == "" then null else tonumber end;
    {
      test_name: $test_name,
      qps: $qps,
      sla_ttft_ms: $sla_ttft,
      sla_tpot_ms: $sla_tpot,
      static_last_pass: ($static_last_pass | number_or_null),
      static_first_fail: ($static_first_fail | number_or_null),
      final_last_pass: ($final_last_pass | number_or_null),
      final_first_fail: ($final_first_fail | number_or_null)
    }'

  jq -n "${jq_vars[@]}" "$summary_filter" > "$summary_file"
}
# Run (or reuse) one extra `vllm bench serve` probe at a given concurrency and
# report whether it meets the SLA.
# Globals:   RESULTS_FOLDER, PROMPTS_PER_CONCURRENCY, MIN_NUM_PROMPTS,
#            MAX_NUM_PROMPTS, DRY_RUN, gpu_type (all read)
# Arguments: $1 - test name
#            $2 - qps
#            $3 - max concurrency to probe
#            $4 - tensor parallel size (recorded as metadata)
#            $5 - compilation config mode (recorded as metadata)
#            $6 - optimization level (recorded as metadata)
#            $7 - client arguments
#            $8 - remote client arguments
#            $9 - server command (recorded in the .commands file)
# Outputs:   logs the client command; writes <name>.commands into RESULTS_FOLDER
# Returns:   the status of evaluate_sla_from_json for the probe's result file
run_single_serving_probe() {
  local test_name=$1
  local qps=$2
  local max_concurrency=$3
  local tp=$4
  local compilation_config_mode=$5
  local optimization_level=$6
  local client_args_effective=$7
  local client_remote_args=$8
  local server_command=$9
  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
  local result_json
  # num_prompts and jq_output are also used as plain variables by the static
  # sweep in run_serving_tests; keep them local so a probe never clobbers them.
  local num_prompts
  local num_prompts_arg=""
  local client_command
  local jq_output
  # Same fallbacks as run_serving_tests, so the clamp below stays sane even if
  # this helper is called before those globals have been initialised.
  local min_num_prompts=${MIN_NUM_PROMPTS:-1}
  local max_num_prompts=${MAX_NUM_PROMPTS:-1000000}

  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")

  # A result file from an earlier run is reused instead of re-benchmarking.
  if [[ -f "$result_json" ]]; then
    evaluate_sla_from_json "$result_json"
    return $?
  fi

  if [[ -n "${PROMPTS_PER_CONCURRENCY:-}" ]]; then
    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
    if (( num_prompts < min_num_prompts )); then num_prompts=$min_num_prompts; fi
    if (( num_prompts > max_num_prompts )); then num_prompts=$max_num_prompts; fi
    num_prompts_arg="--num-prompts $num_prompts"
  fi

  # Continuation lines stay flush-left on purpose: inside double quotes any
  # leading whitespace would become part of the logged/recorded command.
  client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "

  echo "Adaptive probe: $client_command"

  if [[ "${DRY_RUN:-0}" != "1" ]]; then
    bash -c "$client_command"
  fi

  # Record the exact server/client commands next to the result.
  jq_output=$(jq -n \
    --arg server "$server_command" \
    --arg client "$client_command" \
    --arg gpu "$gpu_type" \
    '{
      server_command: $server,
      client_command: $client,
      gpu_type: $gpu,
      adaptive_search: true
    }')
  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

  evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
@@ -347,10 +652,48 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from the JSON-derived args (avoid duplicates).
# Handles both forms: `--num-prompts 123` and `--num-prompts=123`.
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -382,14 +725,13 @@ run_serving_tests() {
fi
# check that the server model and the client model are the same
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
server_command="$server_envs vllm serve \
server_command="$server_envs vllm serve $server_model \
$server_args"
# run the server
@@ -436,6 +778,14 @@ run_serving_tests() {
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \
@@ -444,8 +794,9 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args $client_remote_args "
$client_args_effective $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -467,6 +818,11 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done
# clean up
@@ -532,6 +888,7 @@ main() {
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite
}