[TPU][Test] Add script to run benchmark on TPU for buildkite (#19039)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
This commit is contained in:
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
@@ -0,0 +1,102 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ ! -f "$1" ]; then
|
||||
echo "Error: The env file '$1' does not exist."
|
||||
exit 1 # Exit the script with a non-zero status to indicate an error
|
||||
fi
|
||||
|
||||
ENV_FILE=$1
|
||||
|
||||
# For testing on local vm, use `set -a` to export all variables
|
||||
source /etc/environment
|
||||
source $ENV_FILE
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
docker rm -f $CONTAINER_NAME || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build docker image.
|
||||
# TODO: build the image outside the script and share the image with other
|
||||
# tpu test if building time is too long.
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg GIT_REPO_CHECK=0 \
|
||||
--tag vllm/vllm-tpu-bm \
|
||||
--progress plain -f docker/Dockerfile.tpu .
|
||||
|
||||
LOG_ROOT=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $LOG_ROOT"
|
||||
|
||||
if [ -z "$HF_TOKEN" ]; then
|
||||
echo "Error: HF_TOKEN is not set or is empty."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Make sure mounted disk or dir exists
|
||||
if [ ! -d "$DOWNLOAD_DIR" ]; then
|
||||
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Run model $MODEL"
|
||||
echo
|
||||
|
||||
echo "starting docker...$CONTAINER_NAME"
|
||||
echo
|
||||
docker run \
|
||||
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
||||
--env-file $ENV_FILE \
|
||||
-e HF_TOKEN="$HF_TOKEN" \
|
||||
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
||||
-e MODEL=$MODEL \
|
||||
-e WORKSPACE=/workspace \
|
||||
--name $CONTAINER_NAME \
|
||||
-d \
|
||||
--privileged \
|
||||
--network host \
|
||||
-v /dev/shm:/dev/shm \
|
||||
vllm/vllm-tpu-bm tail -f /dev/null
|
||||
|
||||
echo "run script..."
|
||||
echo
|
||||
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
|
||||
|
||||
echo "copy result back..."
|
||||
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
|
||||
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
|
||||
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
|
||||
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
|
||||
|
||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
|
||||
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
|
||||
|
||||
if [ "$BUILDKITE" = "true" ]; then
|
||||
echo "Running inside Buildkite"
|
||||
buildkite-agent artifact upload "$VLLM_LOG"
|
||||
buildkite-agent artifact upload "$BM_LOG"
|
||||
else
|
||||
echo "Not running inside Buildkite"
|
||||
fi
|
||||
|
||||
#
|
||||
# compare the throughput with EXPECTED_THROUGHPUT
|
||||
# and assert meeting the expectation
|
||||
#
|
||||
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
|
||||
echo "Failed to get the throughput"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
|
||||
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
|
||||
exit 1
|
||||
fi
|
||||
Reference in New Issue
Block a user