#!/bin/bash
# This script tunes server parameter combinations to maximize throughput for a given requirement.
# The server parameters currently tuned are max_num_seqs and max_num_batched_tokens.
# It also supports additional requirements: e2e latency and prefix cache hit rate.
# Prerequisites:
# 1. Check out your branch and install/update the correct running environment. For TPU, activate the conda env and install the corresponding torch and torch_xla versions.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set the variables (ALL REQUIRED):
#   BASE: your directory for the vllm repo
#   MODEL: the model served by vllm
#   TP: tensor parallel size
#   DOWNLOAD_DIR: directory to download and load model weights
#   INPUT_LEN: request input length
#   OUTPUT_LEN: request output length
#   MIN_CACHE_HIT_PCT: prefix cache hit rate
#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there is no latency requirement, set it to a large number like 1000000000.
#   NUM_SEQS_LIST: a list of `max-num-seqs` values to loop over.
#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` values to loop over.
#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium-size input/output lengths; for extra short contexts (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script. It may take a long time; run it inside tmux so the run survives a disconnection (see the example command after this list).
# 5. The final result will be saved in the RESULT file.
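# For example, one minimal way to launch the tuning under tmux (the session name and script
# filename below are illustrative; adjust them to your setup):
#   tmux new-session -d -s auto_tune 'bash auto_tune.sh'
#   tmux attach -t auto_tune   # re-attach later to check progress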
# Example use cases:
# 1. Given input_len=1800, output_len=20, what are the best max_num_seqs and max_num_batched_tokens to get the highest throughput?
#    Use: INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we require e2e latency to be lower than 500ms, what are the best server parameters?
#    Use: INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach a 60% prefix cache hit rate, what are the best server parameters?
#    Use: INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
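# Note: the prefix cache target is realized by giving the benchmark a fixed shared prefix of
# length INPUT_LEN * MIN_CACHE_HIT_PCT / 100 via --random-prefix-len (see run_benchmark below).
# For example, INPUT_LEN=1800 with MIN_CACHE_HIT_PCT=60 gives a 1080-token shared prefix.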
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"

echo "result file: $RESULT"
echo "model: $MODEL"

rm -rf "$LOG_FOLDER"
mkdir -p "$LOG_FOLDER"

cd "$BASE/vllm"
pip install -q datasets

current_hash=$(git rev-parse HEAD)
echo "hash: $current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
start_server() {
    local gpu_memory_utilization=$1
    local max_num_seqs=$2
    local max_num_batched_tokens=$3
    local vllm_log=$4

    pkill -f vllm

    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization $gpu_memory_utilization \
        --max-num-seqs $max_num_seqs \
        --max-num-batched-tokens $max_num_batched_tokens \
        --tensor-parallel-size $TP \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir "$DOWNLOAD_DIR" \
        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &

    # wait for up to 10 minutes for the server to come up
    server_started=0
    for i in {1..60}; do
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
            server_started=1
            break
        else
            sleep 10
        fi
    done
    if (( ! server_started )); then
        echo "server did not start within 10 minutes. Please check the server log at $vllm_log."
        return 1
    else
        return 0
    fi
}
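
# To check a running server by hand, poll the same health endpoint the function above uses
# (port 8004; a healthy server should return HTTP 200):
#   curl -s -o /dev/null -w "%{http_code}\n" http://0.0.0.0:8004/health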
run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    local gpu_memory_utilization=$3
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f "$vllm_log"
    pkill -f vllm

    echo "starting server..."
    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
    result=$?
    if [[ "$result" -eq 1 ]]; then
        echo "server failed to start. gpu_memory_utilization: $gpu_memory_utilization, max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    else
        echo "server started."
    fi
    echo

    echo "run benchmark test..."
    meet_latency_requirement=0
    # get a baseline qps by sending requests with request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL \
        --dataset-name random \
        --random-input-len $INPUT_LEN \
        --random-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 1000 \
        --random-prefix-len $prefix_len \
        --port 8004 &> "$bm_log"

    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
        request_rate=inf
    fi

    if (( ! meet_latency_requirement )); then
        # start from request-rate int(throughput) + 1 and decrease by 1 until the latency requirement is met
        request_rate=$(( ${throughput%.*} + 1 ))
        while (( request_rate > 0 )); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL \
                --dataset-name random \
                --random-input-len $INPUT_LEN \
                --random-output-len $OUTPUT_LEN \
                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --random-prefix-len $prefix_len \
                --port 8004 &> "$bm_log"
            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$((request_rate-1))
        done
    fi

    # write the results and update the best result.
    if (( meet_latency_requirement )); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
            best_throughput=$throughput
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi
    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    pkill vllm
    sleep 10
    printf '=%.0s' $(seq 1 20)
    return 0
}

read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"

# First find the largest gpu-memory-utilization that serves without HBM OOM,
# starting from 0.98 and decreasing by 0.01 down to 0.90.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
    result=$?
    if [[ "$result" -eq 0 ]]; then
        find_gpu_memory_utilization=1
        break
    else
        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
    fi
done

if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve the model."
else
    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
    exit 1
fi

for num_seqs in "${num_seqs_list[@]}"; do
    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
    done
done

echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"