diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index 6b6a7e472..3f1ea67a4 100755
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -25,9 +25,9 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
-  
+
   declare -g arch_suffix=''
-  
+
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
@@ -181,19 +181,20 @@ upload_to_buildkite() {
   $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
 
-run_latency_tests() {
-  # run latency tests using `vllm bench latency` command
-  # $1: a json file specifying latency test cases
+run_benchmark_tests() {
+  # run benchmark tests using `vllm bench <test_type>` command
+  # $1: test type (latency or throughput)
+  # $2: a json file specifying test cases
 
-  local latency_test_file
-  latency_test_file=$1
+  local test_type=$1
+  local test_file=$2
 
-  # Iterate over latency tests
-  jq -c '.[]' "$latency_test_file" | while read -r params; do
+  # Iterate over tests
+  jq -c '.[]' "$test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^latency_ ]]; then
-      echo "In latency-test.json, test_name must start with \"latency_\"."
+    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
+      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
       exit 1
     fi
 
@@ -204,15 +205,15 @@ run_latency_tests() {
     fi
 
     # get arguments
-    latency_params=$(echo "$params" | jq -r '.parameters')
-    latency_args=$(json2args "$latency_params")
-    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    latency_envs=$(json2envs "$latency_environment_variables")
+    bench_params=$(echo "$params" | jq -r '.parameters')
+    bench_args=$(json2args "$bench_params")
+    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    bench_envs=$(json2envs "$bench_environment_variables")
 
     # check if there is enough GPU to run the test
-    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
     if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
+      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
       world_size=$(($tp*$pp))
       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -225,97 +226,42 @@ run_latency_tests() {
       fi
     fi
 
-    latency_command=" $latency_envs vllm bench latency \
+    bench_command=" $bench_envs vllm bench $test_type \
       --output-json $RESULTS_FOLDER/${test_name}.json \
-      $latency_args"
+      $bench_args"
 
     echo "Running test case $test_name"
-    echo "Latency command: $latency_command"
+    echo "${test_type^} command: $bench_command"
 
-    # recoding benchmarking command ang GPU command
+    # recording benchmarking command and GPU command
     jq_output=$(jq -n \
-      --arg latency "$latency_command" \
+      --arg command "$bench_command" \
       --arg gpu "$gpu_type" \
+      --arg test_type "$test_type" \
       '{
-        latency_command: $latency,
+        ($test_type + "_command"): $command,
         gpu_type: $gpu
       }')
     echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
-    eval "$latency_command"
+    eval "$bench_command"
 
     kill_gpu_processes
 
   done
 }
 
+run_latency_tests() {
+  run_benchmark_tests "latency" "$1"
+}
+
+run_startup_tests() {
+  run_benchmark_tests "startup" "$1"
+}
+
 run_throughput_tests() {
-  # run throughput tests using `vllm bench throughput`
-  # $1: a json file specifying throughput test cases
-
-  local throughput_test_file
-  throughput_test_file=$1
-
-  # Iterate over throughput tests
-  jq -c '.[]' "$throughput_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^throughput_ ]]; then
-      echo "In throughput-test.json, test_name must start with \"throughput_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # get arguments
-    throughput_params=$(echo "$params" | jq -r '.parameters')
-    throughput_args=$(json2args "$throughput_params")
-    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    throughput_envs=$(json2envs "$throughput_environment_variables")
-
-    # check if there is enough GPU to run the test
-    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
-        continue
-      fi
-    else
-      if [[ $gpu_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-        continue
-      fi
-    fi
-
-    throughput_command=" $throughput_envs vllm bench throughput \
-      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $throughput_args"
-
-    echo "Running test case $test_name"
-    echo "Throughput command: $throughput_command"
-    # recoding benchmarking command ang GPU command
-    jq_output=$(jq -n \
-      --arg command "$throughput_command" \
-      --arg gpu "$gpu_type" \
-      '{
-        throughput_command: $command,
-        gpu_type: $gpu
-      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
-
-    # run the benchmark
-    eval "$throughput_command"
-
-    kill_gpu_processes
-
-  done
+  run_benchmark_tests "throughput" "$1"
 }
 
 run_serving_tests() {
@@ -534,6 +480,7 @@ main() {
   # benchmarking
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
+  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
 
   # postprocess benchmarking results