diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index 6b6a7e472..3f1ea67a4 100755 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -25,9 +25,9 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - + declare -g arch_suffix='' - + if command -v nvidia-smi; then declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') elif command -v amd-smi; then @@ -181,19 +181,20 @@ upload_to_buildkite() { $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" } -run_latency_tests() { - # run latency tests using `vllm bench latency` command - # $1: a json file specifying latency test cases +run_benchmark_tests() { + # run benchmark tests using `vllm bench ` command + # $1: test type (latency or throughput) + # $2: a json file specifying test cases - local latency_test_file - latency_test_file=$1 + local test_type=$1 + local test_file=$2 - # Iterate over latency tests - jq -c '.[]' "$latency_test_file" | while read -r params; do + # Iterate over tests + jq -c '.[]' "$test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') - if [[ ! "$test_name" =~ ^latency_ ]]; then - echo "In latency-test.json, test_name must start with \"latency_\"." + if [[ ! "$test_name" =~ ^${test_type}_ ]]; then + echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"." exit 1 fi @@ -204,15 +205,15 @@ run_latency_tests() { fi # get arguments - latency_params=$(echo "$params" | jq -r '.parameters') - latency_args=$(json2args "$latency_params") - latency_environment_variables=$(echo "$params" | jq -r '.environment_variables') - latency_envs=$(json2envs "$latency_environment_variables") + bench_params=$(echo "$params" | jq -r '.parameters') + bench_args=$(json2args "$bench_params") + bench_environment_variables=$(echo "$params" | jq -r '.environment_variables') + bench_envs=$(json2envs "$bench_environment_variables") # check if there is enough GPU to run the test - tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') + tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size') if [[ "$ON_CPU" == "1" ]]; then - pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1') + pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." @@ -225,97 +226,42 @@ run_latency_tests() { fi fi - latency_command=" $latency_envs vllm bench latency \ + bench_command=" $bench_envs vllm bench $test_type \ --output-json $RESULTS_FOLDER/${test_name}.json \ - $latency_args" + $bench_args" echo "Running test case $test_name" - echo "Latency command: $latency_command" + echo "${test_type^} command: $bench_command" - # recoding benchmarking command ang GPU command + # recording benchmarking command and GPU command jq_output=$(jq -n \ - --arg latency "$latency_command" \ + --arg command "$bench_command" \ --arg gpu "$gpu_type" \ + --arg test_type "$test_type" \ '{ - latency_command: $latency, + ($test_type + "_command"): $command, gpu_type: $gpu }') echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" # run the benchmark - eval "$latency_command" + eval "$bench_command" kill_gpu_processes done } +run_latency_tests() { + run_benchmark_tests "latency" "$1" +} + +run_startup_tests() { + run_benchmark_tests "startup" "$1" +} + run_throughput_tests() { - # run throughput tests using `vllm bench throughput` - # $1: a json file specifying throughput test cases - - local throughput_test_file - throughput_test_file=$1 - - # Iterate over throughput tests - jq -c '.[]' "$throughput_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - if [[ ! "$test_name" =~ ^throughput_ ]]; then - echo "In throughput-test.json, test_name must start with \"throughput_\"." - exit 1 - fi - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # get arguments - throughput_params=$(echo "$params" | jq -r '.parameters') - throughput_args=$(json2args "$throughput_params") - throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables') - throughput_envs=$(json2envs "$throughput_environment_variables") - - # check if there is enough GPU to run the test - tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [[ "$ON_CPU" == "1" ]]; then - pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1') - world_size=$(($tp*$pp)) - if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then - echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." - continue - fi - else - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - fi - - throughput_command=" $throughput_envs vllm bench throughput \ - --output-json $RESULTS_FOLDER/${test_name}.json \ - $throughput_args" - - echo "Running test case $test_name" - echo "Throughput command: $throughput_command" - # recoding benchmarking command ang GPU command - jq_output=$(jq -n \ - --arg command "$throughput_command" \ - --arg gpu "$gpu_type" \ - '{ - throughput_command: $command, - gpu_type: $gpu - }') - echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" - - # run the benchmark - eval "$throughput_command" - - kill_gpu_processes - - done + run_benchmark_tests "throughput" "$1" } run_serving_tests() { @@ -534,6 +480,7 @@ main() { # benchmarking run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" + run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" # postprocess benchmarking results