Compare commits
1 commit: v0.14.0rc0...v0.9.2rc2 (a5dd03c1eb)
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
-# Note that we have 800 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 
 
 def print_top_10_largest_files(zip_file):
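For context, the env-var pattern in the hunk above makes the wheel-size limit overridable at build time. A minimal sketch of how such a check typically wires together, assuming only what the hunk shows (the `check_wheel` helper and the command-line wiring are illustrative, not from the diff):

```python
import os
import sys
import zipfile

# Same override pattern as the hunk above: env var wins, else the default.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))

def check_wheel(wheel_path: str) -> int:
    size_mb = os.path.getsize(wheel_path) / (1 << 20)
    if size_mb > VLLM_MAX_SIZE_MB:
        print(f"{wheel_path} is {size_mb:.1f} MiB, over the {VLLM_MAX_SIZE_MB} MiB limit")
        # A wheel is a zip archive, so dumping the largest members helps debugging.
        with zipfile.ZipFile(wheel_path) as z:
            top = sorted(z.infolist(), key=lambda i: i.file_size, reverse=True)[:10]
            for info in top:
                print(f"  {info.filename}: {info.file_size / (1 << 20):.1f} MiB")
        return 1
    return 0

if __name__ == "__main__":
    sys.exit(check_wheel(sys.argv[1]))
```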
@@ -1,24 +0,0 @@
-name: vllm_ci
-job_dirs:
-  - ".buildkite/test_areas"
-  - ".buildkite/image_build"
-run_all_patterns:
-  - "docker/Dockerfile"
-  - "CMakeLists.txt"
-  - "requirements/common.txt"
-  - "requirements/cuda.txt"
-  - "requirements/build.txt"
-  - "requirements/test.txt"
-  - "setup.py"
-  - "csrc/"
-  - "cmake/"
-run_all_exclude_patterns:
-  - "docker/Dockerfile."
-  - "csrc/cpu/"
-  - "csrc/rocm/"
-  - "cmake/hipify.py"
-  - "cmake/cpu_extension.cmake"
-registries: public.ecr.aws/q9t5s3a7
-repositories:
-  main: "vllm-ci-postmerge-repo"
-  premerge: "vllm-ci-test-repo"
.buildkite/generate_index.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+        <h1>Links for vLLM</h1/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+    )
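To make the CloudFront escaping in the new file concrete, here is what the `+` to `%2B` replacement does for a typical local-version wheel name (the file name below is made up for illustration):

```python
# Hypothetical wheel name; only the "+" handling is the point here.
filename = "vllm-0.9.2+cu128-cp38-abi3-manylinux1_x86_64.whl"
print(filename.replace("+", "%2B"))
# vllm-0.9.2%2Bcu128-cp38-abi3-manylinux1_x86_64.whl
```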
@@ -1,56 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 8 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-BRANCH=$4
-VLLM_USE_PRECOMPILED=$5
-VLLM_MERGE_BASE_COMMIT=$6
-CACHE_FROM=$7
-CACHE_TO=$8
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
-
-# docker buildx
-docker buildx create --name vllm-builder --driver docker-container --use
-docker buildx inspect --bootstrap
-docker buildx ls
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
-  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
-else
-  merge_base_commit_build_args=""
-fi
-
-# build
-docker buildx build --file docker/Dockerfile \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
-  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
-  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
-  ${merge_base_commit_build_args} \
-  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
-  --cache-to type=registry,ref=${CACHE_TO},mode=max \
-  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
-  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
-  --push \
-  --target test \
-  --progress plain .
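The "skip build if image already exists" guard in the deleted script relies on `docker manifest inspect` printing a manifest only when the tag exists. A rough Python sketch of that emptiness test, assuming the same behavior (the image reference below is hypothetical):

```python
import subprocess

def image_exists(ref: str) -> bool:
    # Mirrors the bash `-z $(docker manifest inspect ...)` test: the command
    # prints the manifest JSON when the tag exists, and nothing otherwise.
    result = subprocess.run(
        ["docker", "manifest", "inspect", ref],
        capture_output=True,
        text=True,
    )
    return bool(result.stdout.strip())

if image_exists("public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:deadbeef"):
    print("Image found")  # the script exits 0 here instead of rebuilding
else:
    print("Image not found, proceeding with build...")
```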
@@ -1,57 +0,0 @@
-group: Abuild
-steps:
-  - label: ":docker: Build image"
-    key: image-build
-    depends_on: []
-    commands:
-      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
-    retry:
-      automatic:
-        - exit_status: -1 # Agent was lost
-          limit: 2
-        - exit_status: -10 # Agent was lost
-          limit: 2
-
-  - label: ":docker: Build CPU image"
-    key: image-build-cpu
-    depends_on: []
-    commands:
-      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1 # Agent was lost
-          limit: 2
-        - exit_status: -10 # Agent was lost
-          limit: 2
-
-  - label: ":docker: Build HPU image"
-    soft_fail: true
-    depends_on: []
-    key: image-build-hpu
-    commands:
-      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1 # Agent was lost
-          limit: 2
-        - exit_status: -10 # Agent was lost
-          limit: 2
-
-  - label: ":docker: Build CPU arm64 image"
-    key: cpu-arm64-image-build
-    depends_on: []
-    optional: true
-    commands:
-      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1 # Agent was lost
-          limit: 2
-        - exit_status: -10 # Agent was lost
-          limit: 2
@@ -1,36 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit>"
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build --file docker/Dockerfile.cpu \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg VLLM_CPU_AVX512BF16=true \
-  --build-arg VLLM_CPU_AVX512VNNI=true \
-  --build-arg VLLM_CPU_AMXBF16=true \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
-  --target vllm-test \
-  --progress plain .
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
@@ -1,33 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit>"
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build --file docker/Dockerfile.cpu \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
-  --target vllm-test \
-  --progress plain .
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
@@ -1,34 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit>"
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build \
-  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
-  --progress plain \
-  https://github.com/vllm-project/vllm-gaudi.git
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
@@ -1,12 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-backend: "vllm-vlm"
-tasks:
-  - name: "chartqa"
-    metrics:
-      - name: "relaxed_accuracy,none"
-        # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
-        value: 0.80
-limit: 100
-num_fewshot: 0
@@ -1,11 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-tasks:
-  - name: "mmlu_pro"
-    metrics:
-      - name: "exact_match,custom-extract"
-        value: 0.80
-limit: 250 # will run on 250 * 14 subjects = 3500 samples
-num_fewshot: 5
-rtol: 0.05
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
+model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+tasks:
+  - name: "gsm8k"
+    metrics:
+      - name: "exact_match,strict-match"
+        value: 0.595
+      - name: "exact_match,flexible-extract"
+        value: 0.582
+limit: 1000
+num_fewshot: 5
@@ -1,5 +1,4 @@
-# For vllm script, with -t option (tensor parallel size)
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
   - name: "gsm8k"
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
-
-model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
-backend: "vllm-vlm"
-tasks:
-  - name: "chartqa"
-    metrics:
-      - name: "relaxed_accuracy,none"
-        value: 0.855
-limit: 2500
-num_fewshot: 0
@@ -1,14 +0,0 @@
-model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
-tasks:
-  - name: "mmlu_pro"
-    metrics:
-      - name: "exact_match,custom-extract"
-        value: 0.82
-limit: 250 # will run on 250 * 14 subjects = 3500 samples
-num_fewshot: 5
-enforce_eager: false # we use false to speed up the eval process
-kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
-max_model_len: 40960
-apply_chat_template: true
-fewshot_as_multiturn: true
-gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
@@ -1 +0,0 @@
-Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+Meta-Llama-3-8B-QQQ.yaml
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -1 +0,0 @@
-Qwen2.5-VL-7B-Instruct.yaml
@@ -1,44 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on chartqa for vllm.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.9
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on ChartQA using multimodal vllm."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our correctness tests in vllm's CI."
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -l    - limit number of samples to run"
-    echo "  -t    - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:l:t:" OPT; do
-  case ${OPT} in
-    m )
-        MODEL="$OPTARG"
-        ;;
-    l )
-        LIMIT="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model vllm-vlm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
-  --tasks chartqa \
-  --batch_size auto \
-  --apply_chat_template \
-  --limit $LIMIT
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (2 lines changed; executable file → normal file)
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4
 
 usage() {
     echo``
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4
 
 usage() {
     echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"
@@ -1,50 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
-# We use this for fp8, which HF does not support.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our automated nm-test-accuracy workflow"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -l    - limit number of samples to run"
-    echo "  -f    - number of fewshot samples to use"
-    echo "  -t    - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:b:l:f:t:" OPT; do
-  case ${OPT} in
-    m )
-        MODEL="$OPTARG"
-        ;;
-    b )
-        BATCH_SIZE="$OPTARG"
-        ;;
-    l )
-        LIMIT="$OPTARG"
-        ;;
-    f )
-        FEWSHOT="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
-  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
-  --batch_size auto
@@ -9,78 +9,30 @@ pytest -s -v test_lm_eval_correctness.py \
 --tp-size=1
 """
 
-import os
-from contextlib import contextmanager
-
 import lm_eval
 import numpy as np
 import yaml
 
-DEFAULT_RTOL = 0.08
+RTOL = 0.08
 
 
-@contextmanager
-def scoped_env_vars(new_env: dict[str, str]):
-    if not new_env:
-        # Fast path: nothing to do
-        yield
-        return
-
-    old_values = {}
-    new_keys = []
-
-    try:
-        for key, value in new_env.items():
-            if key in os.environ:
-                old_values[key] = os.environ[key]
-            else:
-                new_keys.append(key)
-            os.environ[key] = str(value)
-        yield
-    finally:
-        # Restore / clean up
-        for key, value in old_values.items():
-            os.environ[key] = value
-        for key in new_keys:
-            os.environ.pop(key, None)
-
-
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
-    max_model_len = eval_config.get("max_model_len", 4096)
-    batch_size = eval_config.get("batch_size", "auto")
-    backend = eval_config.get("backend", "vllm")
-    enforce_eager = eval_config.get("enforce_eager", "true")
-    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
     model_args = (
         f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager={enforce_eager},"
-        f"kv_cache_dtype={kv_cache_dtype},"
+        f"enforce_eager=true,"
         f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
+        f"trust_remote_code={trust_remote_code}"
     )
-
-    env_vars = eval_config.get("env_vars", None)
-    with scoped_env_vars(env_vars):
-        results = lm_eval.simple_evaluate(
-            model=backend,
-            model_args=model_args,
-            tasks=[task["name"] for task in eval_config["tasks"]],
-            num_fewshot=eval_config["num_fewshot"],
-            limit=eval_config["limit"],
-            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-            # text models. however, this is regressing measured strict-match for
-            # existing text models in CI, so only apply it for mm, or explicitly set
-            apply_chat_template=eval_config.get(
-                "apply_chat_template", backend == "vllm-vlm"
-            ),
-            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-            gen_kwargs=eval_config.get("gen_kwargs"),
-            batch_size=batch_size,
-        )
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        batch_size="auto",
+    )
     return results
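The `scoped_env_vars` context manager removed above follows a common set-then-restore pattern: apply per-test environment variables, then undo them on exit even if the body raises. A short usage sketch, assuming the definition from the deleted lines (the variable names are illustrative):

```python
import os

# Assuming scoped_env_vars as defined in the removed hunk above.
os.environ["EXISTING"] = "old"
with scoped_env_vars({"EXISTING": "new", "ADDED_ONLY": "1"}):
    assert os.environ["EXISTING"] == "new"
    assert os.environ["ADDED_ONLY"] == "1"
# On exit, pre-existing keys are restored and newly added keys are removed.
assert os.environ["EXISTING"] == "old"
assert "ADDED_ONLY" not in os.environ
```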
@@ -89,8 +41,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 
     results = launch_lm_eval(eval_config, tp_size)
 
-    rtol = eval_config.get("rtol", DEFAULT_RTOL)
-
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
@@ -98,9 +48,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(
                 f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth:.3f} | "
-                f"measured={measured_value:.3f} | rtol={rtol}"
+                f"ground_truth={ground_truth} | measured={measured_value}"
             )
-            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
 
     assert success
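The pass/fail check above is a relative-tolerance comparison; with the 0.08 default, a measured score a few percent off the ground truth still passes. A worked example of the same call:

```python
import numpy as np

RTOL = 0.08
ground_truth, measured_value = 0.595, 0.570
# np.isclose(a, b, rtol=r) tests |a - b| <= atol + r * |b| (atol defaults to 1e-8),
# so the tolerance here scales with the measured value.
print(np.isclose(ground_truth, measured_value, rtol=RTOL))  # True  (~4% apart)
print(np.isclose(ground_truth, 0.50, rtol=RTOL))            # False (~19% apart)
```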
.buildkite/nightly-benchmarks/README.md (new file, 181 lines)
@@ -0,0 +1,181 @@
+# vLLM benchmark suite
+
+## Introduction
+
+This directory contains two sets of benchmark for vllm.
+
+- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
+- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
+
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+
+## Performance benchmark quick overview
+
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+
+**Benchmarking Duration**: about 1hr.
+
+**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
+
+## Nightly benchmark quick overview
+
+**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
+
+**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
+
+**Benchmarking Duration**: about 3.5hrs.
+
+## Trigger the benchmark
+
+Performance benchmark will be triggered when:
+- A PR being merged into vllm.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
+
+Manually Trigger the benchmark
+
+```bash
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+Runtime environment variables:
+- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
+- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
+- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
+- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
+- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
+
+Nightly benchmark will be triggered when:
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
+
+## Performance benchmark details
+
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+
+> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+
+### Latency test
+
+Here is an example of one test inside `latency-tests.json`:
+
+```json
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+]
+```
+
+In this example:
+
+- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+
+Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
+
+### Throughput test
+
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+
+The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
+
+### Serving test
+
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+
+```json
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+]
+```
+
+Inside this example:
+
+- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
+- The `server-parameters` includes the command line arguments for vLLM server.
+- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+
+The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+
+### Visualizing the results
+
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait till the benchmark finish running.
+The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
+
+The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
+When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+
+Here is an example using the script to compare result_a and result_b without detail test name.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
+
+|   | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
+|----|----------------------------------------|----------------------------------------|----------|
+| 0 | 142.633982 | 156.526018 | 1.097396 |
+| 1 | 241.620334 | 294.018783 | 1.216863 |
+| 2 | 218.298905 | 262.664916 | 1.203235 |
+| 3 | 242.743860 | 299.816190 | 1.235113 |
+
+Here is an example using the script to compare result_a and result_b with detail test name.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
+
+|   | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
+|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
+| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
+| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
+| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
+| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
+| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
+
+## Nightly test details
+
+See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
+
+### Workflow
+
+- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
+- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
+- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
+- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+
+### Nightly tests
+
+In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
+
+### Docker containers
+
+The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
+
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
+
+WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
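As a side note on the parameter convention the README above describes: the JSON keys use underscores, and the runner converts them to dashed CLI flags. A sketch of that conversion (the helper name is made up; the actual runner does this in shell, not Python):

```python
def params_to_cli(params: dict) -> str:
    # "tensor_parallel_size": 1 -> "--tensor-parallel-size 1", and so on.
    parts = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        # Keys with empty-string values (e.g. "disable_log_stats": "")
        # become bare flags with no argument.
        parts.append(flag if value == "" else f"{flag} {value}")
    return " ".join(parts)

print(params_to_cli({
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy
# --num-iters-warmup 5 --num-iters 15
```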
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml (new file, 184 lines)
@@ -0,0 +1,184 @@
+steps:
+  - label: "Wait for container to be ready"
+    key: wait-for-container-image
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              - image: badouralix/curl-jq
+                command:
+                  - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
+  - label: "Cleanup H100"
+    agents:
+      queue: H100
+    depends_on: ~
+    command: docker system prune -a --volumes --force
+
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+      - kubernetes:
+          podSpec:
+            priorityClassName: perf-benchmark
+            containers:
+              - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+                command:
+                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+                resources:
+                  limits:
+                    nvidia.com/gpu: 8
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+            nodeSelector:
+              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+      - docker#v5.12.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+          command:
+            - bash
+            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+          mount-buildkite-agent: true
+          propagate-environment: true
+          ipc: host
+          gpus: 4,5,6,7
+          volumes:
+            - /data/benchmark-hf-cache:/root/.cache/huggingface
+          environment:
+            - VLLM_USAGE_SOURCE
+            - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+  #  key: block-h100
+  #  depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+      - docker#v5.12.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+          command:
+            - bash
+            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+          mount-buildkite-agent: true
+          propagate-environment: true
+          ipc: host
+          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+          volumes:
+            - /data/benchmark-hf-cache:/root/.cache/huggingface
+          environment:
+            - VLLM_USAGE_SOURCE
+            - HF_TOKEN
+
+  # Premerge benchmark
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+      - kubernetes:
+          podSpec:
+            priorityClassName: perf-benchmark
+            containers:
+              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                command:
+                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+                resources:
+                  limits:
+                    nvidia.com/gpu: 8
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+            nodeSelector:
+              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+      - docker#v5.12.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          command:
+            - bash
+            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+          mount-buildkite-agent: true
+          propagate-environment: true
+          ipc: host
+          gpus: 4,5,6,7
+          volumes:
+            - /data/benchmark-hf-cache:/root/.cache/huggingface
+          environment:
+            - VLLM_USAGE_SOURCE
+            - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+  #  key: block-h100
+  #  depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+      - docker#v5.12.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          command:
+            - bash
+            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+          mount-buildkite-agent: true
+          propagate-environment: true
+          ipc: host
+          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+          volumes:
+            - /data/benchmark-hf-cache:/root/.cache/huggingface
+          environment:
+            - VLLM_USAGE_SOURCE
+            - HF_TOKEN
.buildkite/nightly-benchmarks/nightly-annotation.md (new file, 27 lines)
@@ -0,0 +1,27 @@
+
+## Description
+
+This file contains the downloading link for benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+## Results reproduction
+
+- Find the docker we use in `benchmarking pipeline`
+- Deploy the docker, and inside the docker:
+  - Download `nightly-benchmarks.zip`.
+  - In the same folder, run the following code:
+
+```bash
+export HF_TOKEN=<your HF token>
+apt update
+apt install -y git
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```
+
+And the results will be inside `./benchmarks/results`.
.buildkite/nightly-benchmarks/nightly-descriptions.md (new file, 39 lines)
@@ -0,0 +1,39 @@
+
+# Nightly benchmark
+
+This benchmark aims to:
+
+- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
+## Setup
+
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+## Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
.buildkite/nightly-benchmarks/nightly-pipeline.yaml (new file, 196 lines)
@@ -0,0 +1,196 @@
+common_pod_spec: &common_pod_spec
+  priorityClassName: perf-benchmark
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  volumes:
+    - name: devshm
+      emptyDir:
+        medium: Memory
+    - name: hf-cache
+      hostPath:
+        path: /root/.cache/huggingface
+        type: Directory
+
+common_container_settings: &common_container_settings
+  command:
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  resources:
+    limits:
+      nvidia.com/gpu: 8
+  volumeMounts:
+    - name: devshm
+      mountPath: /dev/shm
+    - name: hf-cache
+      mountPath: /root/.cache/huggingface
+  env:
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+
+steps:
+  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+
+  - label: "A100 vllm step 10"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: vllm/vllm-openai:v0.6.2
+                <<: *common_container_settings
+
+  - label: "A100 sglang benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: lmsysorg/sglang:v0.3.2-cu121
+                <<: *common_container_settings
+
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: openmmlab/lmdeploy:v0.6.1-cu12
+                <<: *common_container_settings
+
+  - label: "A100 trt llama-8B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama8B"
+
+  - label: "A100 trt llama-70B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama70B"
+
+  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+  #               <<: *common_container_settings
+
+  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+  #               <<: *common_container_settings
+
+  - wait
+
+  - label: "Collect the results"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: vllm/vllm-openai:v0.5.0.post1
+                command:
+                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+                resources:
+                  limits:
+                    nvidia.com/gpu: 8
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+
+  - block: ":rocket: check the results!"
@@ -1,11 +1,10 @@
-# Performance benchmarks descriptions
 
 ## Latency tests
 
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
@@ -16,7 +15,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
@@ -28,7 +27,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
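The fixed-seed Poisson arrival process mentioned in the serving description above can be sketched as follows (assumed to mirror what the benchmark's request-rate option does; this is not taken from the diff):

```python
import numpy as np

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
    # A Poisson process has exponentially distributed inter-arrival gaps
    # with mean 1/qps; a fixed seed keeps the schedule reproducible.
    rng = np.random.default_rng(seed)
    gaps = rng.exponential(1.0 / qps, size=num_requests)
    return np.cumsum(gaps)

times = poisson_arrival_times(num_requests=200, qps=4.0)
print(times[:3], times[-1])  # ~200 requests spread over roughly 50 seconds
```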
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+
+import pandas as pd
+
+
+def compare_data_columns(
+    files, name_column, data_column, drop_column, ignore_test_name=False
+):
+    print("\ncompare_data_column: " + data_column)
+    frames = []
+    compare_frames = []
+    for file in files:
+        data_df = pd.read_json(file)
+        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
+        if ignore_test_name is False:
+            serving_df = serving_df.rename(columns={name_column: file + "_name"})
+            frames.append(serving_df[file + "_name"])
+        serving_df = serving_df.rename(columns={data_column: file})
+        frames.append(serving_df[file])
+        compare_frames.append(serving_df[file])
+        if len(compare_frames) >= 2:
+            # Compare numbers among two files
+            ratio_df = compare_frames[1] / compare_frames[0]
+            frames.append(ratio_df)
+            compare_frames.pop(1)
+
+    concat_df = pd.concat(frames, axis=1)
+    return concat_df
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-f", "--file", action="append", type=str, help="input file name"
+    )
+    parser.add_argument(
+        "--ignore_test_name", action="store_true", help="ignore_test_name or not"
+    )
+    args = parser.parse_args()
+    files = args.file
+    print("comparing : " + ", ".join(files))
+
+    drop_column = "P99"
+    name_column = "Test name"
+    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+    html_msgs_for_data_cols = [
+        "Compare Output Tokens /n",
+        "Median TTFT /n",
+        "Median TPOT /n",
+    ]
+    ignore_test_name = args.ignore_test_name
+    with open("perf_comparison.html", "w") as text_file:
+        for i in range(len(data_cols_to_compare)):
+            output_df = compare_data_columns(
+                files,
+                name_column,
+                data_cols_to_compare[i],
+                drop_column,
+                ignore_test_name=ignore_test_name,
+            )
+            print(output_df)
+            html = output_df.to_html()
+            text_file.write(html_msgs_for_data_cols[i])
+            text_file.write(html)
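The heart of the script above is the ratio column: for each metric, the second file's values are divided elementwise by the first file's, so a value above 1 means the second run did better on a throughput-style metric. A toy illustration of the same pandas pattern (file names are hypothetical):

import pandas as pd

a = pd.Series([100.0, 200.0], name="run_a.json")  # e.g. Output Tput (tok/s) from run A
b = pd.Series([110.0, 190.0], name="run_b.json")  # the same metric from run B
ratio = (b / a).rename("ratio")                   # 1.10 and 0.95 here
print(pd.concat([a, b, ratio], axis=1))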
@@ -1,19 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import argparse
 import json
 import os
-import shlex
 from importlib import util
 from pathlib import Path
-from typing import Any
 
 import pandas as pd
 import psutil
-import regex as re
 from tabulate import tabulate
 
+results_folder = Path("results/")
 
 # latency results and the keys that will be printed into markdown
 latency_results = []
 latency_column_mapping = {
@@ -44,30 +42,19 @@ throughput_results_column_mapping = {
 serving_results = []
 serving_column_mapping = {
     "test_name": "Test name",
-    "model_id": "Model",
-    "dataset_name": "Dataset Name",
-    "input_len": "Input Len",
-    "output_len": "Output Len",
-    "tp_size": "TP Size",
-    "pp_size": "PP Size",
-    "dtype": "dtype",
     "gpu_type": "GPU",
     "completed": "# of req.",
-    "qps": "qps",
-    "max_concurrency": "# of max concurrency.",
     "request_throughput": "Tput (req/s)",
     "total_token_throughput": "Total Token Tput (tok/s)",
     "output_throughput": "Output Tput (tok/s)",
-    # "total_input_tokens": "Total input tokens",
-    # "total_output_tokens": "Total output tokens",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
     "mean_ttft_ms": "Mean TTFT (ms)",
     "median_ttft_ms": "Median TTFT (ms)",
     "p99_ttft_ms": "P99 TTFT (ms)",
-    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
     "median_tpot_ms": "Median",
     "p99_tpot_ms": "P99",
-    "std_tpot_ms": "STD TPOT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median ITL (ms)",
     "p99_itl_ms": "P99 ITL (ms)",
@@ -106,111 +93,15 @@ def get_size_with_unit(bytes, suffix="B"):
|
|||||||
bytes /= factor
|
bytes /= factor
|
||||||
|
|
||||||
|
|
||||||
def _coerce(val: str) -> Any:
|
|
||||||
"""Best-effort type coercion from string to Python types."""
|
|
||||||
low = val.lower()
|
|
||||||
if low == "null":
|
|
||||||
return None
|
|
||||||
if low == "true":
|
|
||||||
return True
|
|
||||||
if low == "false":
|
|
||||||
return False
|
|
||||||
# integers
|
|
||||||
if re.fullmatch(r"[+-]?\d+", val):
|
|
||||||
try:
|
|
||||||
return int(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
# floats (keep 'inf'/'-inf'/'nan' as strings)
|
|
||||||
if re.fullmatch(r"[+-]?\d*\.\d+", val):
|
|
||||||
try:
|
|
||||||
return float(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return val
|
|
||||||
|
|
||||||
|
|
||||||
def parse_client_command(cmd: str) -> dict[str, Any]:
|
|
||||||
"""Parse the client_command shell string into {executable, script, args}."""
|
|
||||||
toks = shlex.split(cmd)
|
|
||||||
if len(toks) < 2:
|
|
||||||
raise ValueError("client_command must include an executable and a script")
|
|
||||||
executable, script = toks[0], toks[1]
|
|
||||||
args: dict[str, Any] = {}
|
|
||||||
|
|
||||||
i = 2
|
|
||||||
while i < len(toks):
|
|
||||||
t = toks[i]
|
|
||||||
if t.startswith("--"):
|
|
||||||
# --key=value or --key (value) or boolean flag
|
|
||||||
if "=" in t:
|
|
||||||
key, val = t.split("=", 1)
|
|
||||||
if key == "--metadata":
|
|
||||||
md = {}
|
|
||||||
if val:
|
|
||||||
if "=" in val:
|
|
||||||
k, v = val.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[val] = True
|
|
||||||
args[key] = md
|
|
||||||
else:
|
|
||||||
args[key] = _coerce(val)
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
key = t
|
|
||||||
|
|
||||||
# Special: consume metadata k=v pairs until next --flag
|
|
||||||
if key == "--metadata":
|
|
||||||
i += 1
|
|
||||||
md = {}
|
|
||||||
while i < len(toks) and not toks[i].startswith("--"):
|
|
||||||
pair = toks[i]
|
|
||||||
if "=" in pair:
|
|
||||||
k, v = pair.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[pair] = True
|
|
||||||
i += 1
|
|
||||||
args[key] = md
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Standard: check if next token is a value (not a flag)
|
|
||||||
if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
|
|
||||||
args[key] = _coerce(toks[i + 1])
|
|
||||||
i += 2
|
|
||||||
else:
|
|
||||||
# lone flag -> True
|
|
||||||
args[key] = True
|
|
||||||
i += 1
|
|
||||||
else:
|
|
||||||
# unexpected positional; skip
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
return {"executable": executable, "script": script, "args": args}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"-r",
|
|
||||||
"--result",
|
|
||||||
type=str,
|
|
||||||
default="results",
|
|
||||||
help="Folder name for benchmark output results.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
results_folder = Path(args.result)
|
|
||||||
if not results_folder.exists():
|
|
||||||
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
|
|
||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
with open(test_file) as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
if "serving" in str(test_file):
|
if "serving" in str(test_file):
|
||||||
# this result is generated via `vllm bench serve` command
|
# this result is generated via `benchmark_serving.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
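For orientation, the removed parse_client_command helper turns a recorded shell command back into structured data, coercing values with _coerce and folding --metadata key=value pairs into a nested dict. Roughly, for an input like the following (the command text here is an illustration):

cmd = "vllm bench serve --request-rate 4 --save-result --metadata tensor_parallel_size=8"
# parse_client_command(cmd) would return approximately:
# {
#     "executable": "vllm",
#     "script": "bench",
#     "args": {
#         "--request-rate": 4,          # coerced to int by _coerce
#         "--save-result": True,        # lone flag -> True
#         "--metadata": {"tensor_parallel_size": 8},
#     },
# }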
@@ -218,50 +109,18 @@ if __name__ == "__main__":
|
|||||||
except OSError as e:
|
except OSError as e:
|
||||||
print(e)
|
print(e)
|
||||||
continue
|
continue
|
||||||
# Parse Server Command Arg
|
|
||||||
out: dict[str, Any] = {
|
|
||||||
"server_command": parse_client_command(command["server_command"])
|
|
||||||
}
|
|
||||||
parse_args = [
|
|
||||||
"--tensor-parallel-size",
|
|
||||||
"--pipeline-parallel-size",
|
|
||||||
"--dtype",
|
|
||||||
]
|
|
||||||
col_mapping = ["tp_size", "pp_size", "dtype"]
|
|
||||||
for index, arg in enumerate(parse_args):
|
|
||||||
if arg in out["server_command"]["args"]:
|
|
||||||
raw_result.update(
|
|
||||||
{col_mapping[index]: out["server_command"]["args"][arg]}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Parse Client Command Arg
|
|
||||||
out: dict[str, Any] = {
|
|
||||||
"client_command": parse_client_command(command["client_command"])
|
|
||||||
}
|
|
||||||
parse_args = [
|
|
||||||
"--dataset-name",
|
|
||||||
"--random-input-len",
|
|
||||||
"--random-output-len",
|
|
||||||
"--request-rate",
|
|
||||||
]
|
|
||||||
col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
|
|
||||||
|
|
||||||
for index, arg in enumerate(parse_args):
|
|
||||||
if arg in out["client_command"]["args"]:
|
|
||||||
raw_result.update(
|
|
||||||
{col_mapping[index]: out["client_command"]["args"][arg]}
|
|
||||||
)
|
|
||||||
# Add Server, Client command
|
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
# update the test name of this result
|
# update the test name of this result
|
||||||
raw_result.update({"test_name": test_file.stem})
|
raw_result.update({"test_name": test_file.stem})
|
||||||
|
|
||||||
# add the result to raw_result
|
# add the result to raw_result
|
||||||
serving_results.append(raw_result)
|
serving_results.append(raw_result)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif "latency" in f.name:
|
elif "latency" in f.name:
|
||||||
# this result is generated via `vllm bench latency` command
|
# this result is generated via `benchmark_latency.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
@@ -289,7 +148,7 @@ if __name__ == "__main__":
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
elif "throughput" in f.name:
|
elif "throughput" in f.name:
|
||||||
# this result is generated via `vllm bench throughput` command
|
# this result is generated via `benchmark_throughput.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
@@ -345,10 +204,7 @@ if __name__ == "__main__":
|
|||||||
columns=latency_column_mapping
|
columns=latency_column_mapping
|
||||||
)
|
)
|
||||||
if not serving_results.empty:
|
if not serving_results.empty:
|
||||||
valid_columns = [
|
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||||
col for col in serving_column_mapping if col in serving_results.columns
|
|
||||||
]
|
|
||||||
serving_results = serving_results[valid_columns].rename(
|
|
||||||
columns=serving_column_mapping
|
columns=serving_column_mapping
|
||||||
)
|
)
|
||||||
if not throughput_results.empty:
|
if not throughput_results.empty:
|
||||||
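The two selection strategies in this hunk differ in how they treat columns that are absent from the results: filtering the mapping first tolerates missing keys, while indexing with every key raises. A small sketch of the difference, on toy data:

import pandas as pd

df = pd.DataFrame({"test_name": ["t1"], "gpu_type": ["H100"]})
mapping = {"test_name": "Test name", "gpu_type": "GPU", "qps": "qps"}

# Defensive variant: keep only mapped columns that actually exist.
valid = [col for col in mapping if col in df.columns]
print(df[valid].rename(columns=mapping))

# Strict variant: df[list(mapping.keys())] raises KeyError here,
# because "qps" is not a column of df.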
@@ -370,7 +226,7 @@
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
+        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
    )
 
     # get markdown tables
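Both lambdas implement the normalization spelled out in the comment above; on a sample value:

x = "H100\nH100\nH100\nH100"
print("{}x{}".format(len(x.split("\n")), x.split("\n")[0]))  # -> 4xH100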
@@ -388,11 +244,9 @@
         )
 
     # document the result
-    md_file = "benchmark_results.md"
-    json_file = "benchmark_results.json"
-    with open(results_folder / md_file, "w") as f:
+    with open(results_folder / "benchmark_results.md", "w") as f:
         results = read_markdown(
-            "../.buildkite/performance-benchmarks/"
+            "../.buildkite/nightly-benchmarks/"
             + "performance-benchmarks-descriptions.md"
         )
         results = results.format(
@@ -405,7 +259,7 @@
         f.write(results)
 
     # document benchmarking results in json
-    with open(results_folder / json_file, "w") as f:
+    with open(results_folder / "benchmark_results.json", "w") as f:
         results = (
             latency_results.to_dict(orient="records")
             + throughput_results.to_dict(orient="records")
26 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py Normal file
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+
+from transformers import AutoTokenizer
+
+
+def main(model, cachedir):
+    # Load the tokenizer and save it to the specified directory
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    tokenizer.save_pretrained(cachedir)
+    print(f"Tokenizer saved to {cachedir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Download and save Hugging Face tokenizer"
+    )
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+    parser.add_argument(
+        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+    )
+
+    args = parser.parse_args()
+    main(args.model, args.cachedir)
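The point of caching the tokenizer is that save_pretrained writes a directory that from_pretrained can consume directly, so benchmark clients can point --tokenizer at the cache path instead of hitting the Hub. A round-trip sketch (model name and cache path are illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
tok.save_pretrained("/tokenizer_cache")
reloaded = AutoTokenizer.from_pretrained("/tokenizer_cache")  # what --tokenizer consumes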
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Parse command line arguments for summary-nightly-results script."
+    )
+    parser.add_argument(
+        "--results-folder",
+        type=str,
+        required=True,
+        help="The folder where the results are stored.",
+    )
+    parser.add_argument(
+        "--description", type=str, required=True, help="Description of the results."
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+def get_perf(df, method, model, metric):
+    means = []
+
+    for qps in [2, 4, 8, 16, "inf"]:
+        target = df["Test name"].str.contains(model)
+        target = target & df["Engine"].str.contains(method)
+        target = target & df["Test name"].str.contains("qps_" + str(qps))
+        filtered_df = df[target]
+
+        if filtered_df.empty:
+            means.append(0.0)
+        else:
+            means.append(filtered_df[metric].values[0])
+
+    return np.array(means)
+
+
+def get_perf_w_std(df, method, model, metric):
+    if metric in ["TTFT", "ITL"]:
+        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
+        mean = mean.tolist()
+        std = get_perf(df, method, model, "Std " + metric + " (ms)")
+        if std.mean() == 0:
+            std = None
+        success = get_perf(df, method, model, "Successful req.")
+        if std is not None:
+            std = std / np.sqrt(success)
+            std = std.tolist()
+
+    else:
+        assert metric == "Tput"
+        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
+            df, method, model, "Output Tput (tok/s)"
+        )
+        mean = mean.tolist()
+        std = None
+
+    return mean, std
+
+
+def main(args):
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file) as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
+
+    with open(args.description) as f:
+        description = f.read()
+
+    description = description.format(nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    main(args)
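In get_perf_w_std above, the per-request standard deviation is scaled down by the square root of the number of successful requests, which turns it into a standard error of the reported mean. The arithmetic, on made-up numbers:

import numpy as np

std = np.array([12.0, 9.0])       # Std TTFT (ms) at two QPS points
success = np.array([400, 100])    # Successful req. at the same points
stderr = std / np.sqrt(success)   # -> array([0.6, 0.9])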
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from lmdeploy.serve.openai.api_client import APIClient
+
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
@@ -181,14 +181,18 @@ launch_vllm_server() {
     if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
         echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
         model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-        server_command="vllm serve $model \
+        server_command="python3 \
+            -m vllm.entrypoints.openai.api_server \
             -tp $tp \
+            --model $model \
             --port $port \
             $server_args"
     else
         echo "Key 'fp8' does not exist in common params."
-        server_command="vllm serve $model \
+        server_command="python3 \
+            -m vllm.entrypoints.openai.api_server \
             -tp $tp \
+            --model $model \
             --port $port \
             $server_args"
     fi
78 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh Normal file
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+
+main() {
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+    (which zip) || (apt-get install -y zip)
+
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip plotting the results."
+        exit 0
+    fi
+
+    # initial annotation
+    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+
+    # download results
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    mkdir -p results/
+    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
+    ls
+    ls results/
+
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"
+
+    # upload benchmarking scripts
+    cd "$VLLM_SOURCE_CODE_LOC/"
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+
+
+
+    # The figures should be generated by a separate process outside the CI/CD pipeline
+
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    #     --description $description \
+    #     --results-folder results/
+
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sharegpt
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_2048_128
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_128_2048
+
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+}
+
+main "$@"
462 .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh Normal file
@@ -0,0 +1,462 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+    # check the number of GPUs and GPU type.
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+    if [[ $gpu_count -gt 0 ]]; then
+        echo "GPU found."
+    else
+        echo "Need at least 1 GPU to run benchmarking."
+        exit 1
+    fi
+    declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
+    echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+    # check if HF_TOKEN is available and valid
+    if [[ -z "$HF_TOKEN" ]]; then
+        echo "Error: HF_TOKEN is not set."
+        exit 1
+    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+        echo "Error: HF_TOKEN does not start with 'hf_'."
+        exit 1
+    else
+        echo "HF_TOKEN is set and valid."
+    fi
+}
+
+
+upload_to_buildkite() {
+    # upload the benchmarking results to buildkite
+
+    # if the agent binary is not found, skip uploading the results, exit 0
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip uploading the results."
+        return 0
+    fi
+    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+get_current_llm_serving_engine() {
+
+    if which lmdeploy >/dev/null; then
+        echo "Container: lmdeploy"
+        export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+        return
+    fi
+
+    if [ -e /tgi-entrypoint.sh ]; then
+        echo "Container: tgi"
+        export CURRENT_LLM_SERVING_ENGINE=tgi
+        return
+    fi
+
+    if which trtllm-build >/dev/null; then
+        echo "Container: tensorrt-llm"
+        export CURRENT_LLM_SERVING_ENGINE=trt
+        return
+    fi
+
+    if [ -e /sgl-workspace ]; then
+        echo "Container: sglang"
+        export CURRENT_LLM_SERVING_ENGINE=sglang
+        return
+    fi
+
+    if [ -e /vllm-workspace ]; then
+        echo "Container: vllm"
+        # move to a completely irrelevant directory, to avoid import vllm from current folder
+        export CURRENT_LLM_SERVING_ENGINE=vllm
+
+        return
+    fi
+}
+
+json2args() {
+    # transforms the JSON string to command line args, and '_' is replaced to '-'
+    # example:
+    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+    local json_string=$1
+    local args=$(
+        echo "$json_string" | jq -r '
+            to_entries |
+            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+            join(" ")
+        '
+    )
+    echo "$args"
+}
+
+kill_gpu_processes() {
+    pkill -f python
+    pkill -f python3
+    pkill -f tritonserver
+    pkill -f pt_main_thread
+    pkill -f text-generation
+    pkill -f lmdeploy
+
+    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+        sleep 1
+    done
+}
+
+wait_for_server() {
+    # wait for vllm server to start
+    # return 1 if vllm server crashes
+    timeout 1200 bash -c '
+        until curl -s localhost:8000/v1/completions > /dev/null; do
+            sleep 1
+        done' && return 0 || return 1
+}
+
+ensure_installed() {
+    # Ensure that the given command is installed by apt-get
+    local cmd=$1
+    if ! which "$cmd" >/dev/null; then
+        apt-get update && apt-get install -y "$cmd"
+    fi
+}
+
+run_serving_tests() {
+    # run serving tests using `benchmark_serving.py`
+    # $1: a json file specifying serving test cases
+
+    local serving_test_file
+    serving_test_file=$1
+
+    # Iterate over serving tests
+    jq -c '.[]' "$serving_test_file" | while read -r params; do
+        # get the test name, and append the GPU type back to it.
+        test_name=$(echo "$params" | jq -r '.test_name')
+
+        # if TEST_SELECTOR is set, only run the test cases that match the selector
+        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+            echo "Skip test case $test_name."
+            continue
+        fi
+
+        # prepend the current serving engine to the test name
+        test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+        # get common parameters
+        common_params=$(echo "$params" | jq -r '.common_parameters')
+        model=$(echo "$common_params" | jq -r '.model')
+        tp=$(echo "$common_params" | jq -r '.tp')
+        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+        port=$(echo "$common_params" | jq -r '.port')
+        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+        reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+        # get client and server arguments
+        server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+        client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
+        client_args=$(json2args "$client_params")
+        qps_list=$(echo "$params" | jq -r '.qps_list')
+        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+        echo "Running over qps list $qps_list"
+
+        # check if there is enough GPU to run the test
+        if [[ $gpu_count -lt $tp ]]; then
+            echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+            continue
+        fi
+
+        if [[ $reuse_server == "true" ]]; then
+            echo "Reuse previous server for test case $test_name"
+        else
+            kill_gpu_processes
+            bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+                "$server_params" "$common_params"
+        fi
+
+        if wait_for_server; then
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+        else
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+            break
+        fi
+
+        # prepare tokenizer
+        # this is required for lmdeploy.
+        cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+        rm -rf /tokenizer_cache
+        mkdir /tokenizer_cache
+        python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+            --model "$model" \
+            --cachedir /tokenizer_cache
+        cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+
+
+        # change model name for lmdeploy (it will not follow standard hf name)
+        if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
+            model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+        fi
+
+        # iterate over different QPS
+        for qps in $qps_list; do
+            # remove the surrounding single quote from qps
+            if [[ "$qps" == *"inf"* ]]; then
+                echo "qps was $qps"
+                qps="inf"
+                echo "now qps is $qps"
+            fi
+
+            new_test_name=$test_name"_qps_"$qps
+
+            backend=$CURRENT_LLM_SERVING_ENGINE
+
+            if [[ $backend = "trt" ]]; then
+                backend="tensorrt-llm"
+            fi
+
+            if [[ "$backend" == *"vllm"* ]]; then
+                backend="vllm"
+            fi
+
+            if [[ "$dataset_name" = "sharegpt" ]]; then
+
+                client_command="python3 benchmark_serving.py \
+                    --backend $backend \
+                    --tokenizer /tokenizer_cache \
+                    --model $model \
+                    --dataset-name $dataset_name \
+                    --dataset-path $dataset_path \
+                    --num-prompts $num_prompts \
+                    --port $port \
+                    --save-result \
+                    --result-dir $RESULTS_FOLDER \
+                    --result-filename ${new_test_name}.json \
+                    --request-rate $qps \
+                    --ignore-eos \
+                    $client_args"
+
+            elif [[ "$dataset_name" = "sonnet" ]]; then
+
+                sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
+                sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
+                sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
+
+                client_command="python3 benchmark_serving.py \
+                    --backend $backend \
+                    --tokenizer /tokenizer_cache \
+                    --model $model \
+                    --dataset-name $dataset_name \
+                    --dataset-path $dataset_path \
+                    --num-prompts $num_prompts \
+                    --sonnet-input-len $sonnet_input_len \
+                    --sonnet-output-len $sonnet_output_len \
+                    --sonnet-prefix-len $sonnet_prefix_len \
+                    --port $port \
+                    --save-result \
+                    --result-dir $RESULTS_FOLDER \
+                    --result-filename ${new_test_name}.json \
+                    --request-rate $qps \
+                    --ignore-eos \
+                    $client_args"
+
+            else
+
+                echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
+                exit 1
+
+            fi
+
+
+            echo "Running test case $test_name with qps $qps"
+            echo "Client command: $client_command"
+
+            eval "$client_command"
+
+            server_command="None"
+
+            # record the benchmarking commands
+            jq_output=$(jq -n \
+                --arg server "$server_command" \
+                --arg client "$client_command" \
+                --arg gpu "$gpu_type" \
+                --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
+                '{
+                    server_command: $server,
+                    client_command: $client,
+                    gpu_type: $gpu,
+                    engine: $engine
+                }')
+            echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+        done
+
+    done
+
+    kill_gpu_processes
+}
+
+run_genai_perf_tests() {
+    # run genai-perf tests
+
+    # $1: a json file specifying genai-perf test cases
+    local genai_perf_test_file
+    genai_perf_test_file=$1
+
+    # Iterate over genai-perf tests
+    jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+        # get the test name, and append the GPU type back to it.
+        test_name=$(echo "$params" | jq -r '.test_name')
+
+        # if TEST_SELECTOR is set, only run the test cases that match the selector
+        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+            echo "Skip test case $test_name."
+            continue
+        fi
+
+        # prepend the current serving engine to the test name
+        test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+        # get common parameters
+        common_params=$(echo "$params" | jq -r '.common_parameters')
+        model=$(echo "$common_params" | jq -r '.model')
+        tp=$(echo "$common_params" | jq -r '.tp')
+        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+        port=$(echo "$common_params" | jq -r '.port')
+        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+        reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+        # get client and server arguments
+        server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+        qps_list=$(echo "$params" | jq -r '.qps_list')
+        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+        echo "Running over qps list $qps_list"
+
+        # check if there is enough GPU to run the test
+        if [[ $gpu_count -lt $tp ]]; then
+            echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+            continue
+        fi
+
+        if [[ $reuse_server == "true" ]]; then
+            echo "Reuse previous server for test case $test_name"
+        else
+            kill_gpu_processes
+            bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+                "$server_params" "$common_params"
+        fi
+
+        if wait_for_server; then
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+        else
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+            break
+        fi
+
+        # iterate over different QPS
+        for qps in $qps_list; do
+            # remove the surrounding single quote from qps
+            if [[ "$qps" == *"inf"* ]]; then
+                echo "qps was $qps"
+                qps=$num_prompts
+                echo "now qps is $qps"
+            fi
+
+            new_test_name=$test_name"_qps_"$qps
+            backend=$CURRENT_LLM_SERVING_ENGINE
+
+            if [[ "$backend" == *"vllm"* ]]; then
+                backend="vllm"
+            fi
+            #TODO: add output dir.
+            client_command="genai-perf profile \
+                -m $model \
+                --service-kind openai \
+                --backend vllm \
+                --endpoint-type chat \
+                --streaming \
+                --url localhost:$port \
+                --request-rate $qps \
+                --num-prompts $num_prompts \
+                "
+
+            echo "Client command: $client_command"
+
+            eval "$client_command"
+
+            #TODO: process/record outputs
+        done
+    done
+
+    kill_gpu_processes
+
+}
+
+prepare_dataset() {
+
+    # download sharegpt dataset
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+    # duplicate sonnet by 4x, to allow benchmarking with input length 2048
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    echo "" > sonnet_4x.txt
+    for _ in {1..4}
+    do
+        cat sonnet.txt >> sonnet_4x.txt
+    done
+
+}
+
+main() {
+
+    # check if the environment variable is successfully injected from yaml
+
+    check_gpus
+    check_hf_token
+    get_current_llm_serving_engine
+
+    pip install -U transformers
+
+    pip install -r requirements/dev.txt
+    which genai-perf
+
+    # check storage
+    df -h
+
+    ensure_installed wget
+    ensure_installed curl
+    ensure_installed jq
+    # genai-perf dependency
+    ensure_installed libb64-0d
+
+    prepare_dataset
+
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    declare -g RESULTS_FOLDER=results/
+    mkdir -p $RESULTS_FOLDER
+    BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+
+    # run the test
+    run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
+
+    # run genai-perf tests
+    run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+    mv artifacts/ $RESULTS_FOLDER/
+
+    # upload benchmark results to buildkite
+    python3 -m pip install tabulate pandas
+    python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
+    upload_to_buildkite
+
+}
+
+main "$@"
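The json2args helper is the glue between the JSON test specs and the CLI flags of each benchmark command. The same transform in Python, for illustration:

def json2args(params: dict) -> str:
    # '_' in keys becomes '-' in flag names, values are stringified.
    return " ".join(f"--{key.replace('_', '-')} {value}" for key, value in params.items())

print(json2args({"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}))
# -> --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1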
@@ -15,8 +15,6 @@ check_gpus() {
         declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
     elif command -v amd-smi; then
         declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-    elif command -v hl-smi; then
-        declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
     fi
 
     if [[ $gpu_count -gt 0 ]]; then
@@ -25,23 +23,17 @@ check_gpus() {
         echo "Need at least 1 GPU to run benchmarking."
         exit 1
     fi
 
-    declare -g arch_suffix=''
-
     if command -v nvidia-smi; then
         declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
     elif command -v amd-smi; then
         declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-    elif command -v hl-smi; then
-        declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
-        arch_suffix='-hpu'
     fi
     echo "GPU type is $gpu_type"
 }
 
 check_cpus() {
     # check the number of CPUs and NUMA Node and GPU type.
-    declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
+    declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
     if [[ $numa_count -gt 0 ]]; then
         echo "NUMA found."
         echo $numa_count
@@ -110,8 +102,7 @@ json2envs() {
 wait_for_server() {
     # wait for vllm server to start
     # return 1 if vllm server crashes
-    local timeout_val="1200"
-    timeout "$timeout_val" bash -c '
+    timeout 1200 bash -c '
         until curl -X POST localhost:8000/v1/completions; do
             sleep 1
         done' && return 0 || return 1
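Note that the curl loop treats any HTTP response, even an error status, as proof that the server is listening; only connection failures keep it waiting. A Python sketch of the same readiness probe (URL and timeout mirror the script above, but this is an illustration, not part of the diff):

import time
import urllib.error
import urllib.request

def wait_for_server(url: str = "http://localhost:8000/v1/completions", timeout_s: int = 1200) -> bool:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            urllib.request.urlopen(url, data=b"{}", timeout=5)  # data makes it a POST
            return True
        except urllib.error.HTTPError:
            return True    # server answered (even with 4xx/5xx), so it is up
        except OSError:
            time.sleep(1)  # connection refused/reset, keep polling
    return False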
@@ -135,8 +126,7 @@ kill_gpu_processes() {
     ps -aux
     lsof -t -i:8000 | xargs -r kill -9
     pgrep python3 | xargs -r kill -9
-    # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-    pgrep VLLM | xargs -r kill -9
 
     # wait until GPU memory usage smaller than 1GB
     if command -v nvidia-smi; then
@@ -147,10 +137,6 @@ kill_gpu_processes() {
         while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
             sleep 1
         done
-    elif command -v hl-smi; then
-        while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
-            sleep 1
-        done
     fi
 
     # remove vllm config file
@@ -178,7 +164,7 @@ upload_to_buildkite() {
 }
 
 run_latency_tests() {
-    # run latency tests using `vllm bench latency` command
+    # run latency tests using `benchmark_latency.py`
     # $1: a json file specifying latency test cases
 
     local latency_test_file
@@ -207,11 +193,9 @@ run_latency_tests() {
 
         # check if there is enough GPU to run the test
         tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-        if [ "$ON_CPU" == "1" ]; then
-            pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
-            world_size=$(($tp*$pp))
-            if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
-                echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        if [ "$ON_CPU" == "1" ];then
+            if [[ $numa_count -lt $tp ]]; then
+                echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
                 continue
             fi
         else
@@ -221,7 +205,7 @@ run_latency_tests() {
             fi
         fi
 
-        latency_command=" $latency_envs vllm bench latency \
+        latency_command=" $latency_envs python3 benchmark_latency.py \
             --output-json $RESULTS_FOLDER/${test_name}.json \
             $latency_args"
 
@@ -247,7 +231,7 @@ run_latency_tests() {
 }
 
 run_throughput_tests() {
-    # run throughput tests using `vllm bench throughput`
+    # run throughput tests using `benchmark_throughput.py`
     # $1: a json file specifying throughput test cases
 
     local throughput_test_file
@@ -276,11 +260,9 @@ run_throughput_tests() {
 
         # check if there is enough GPU to run the test
         tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-        if [ "$ON_CPU" == "1" ]; then
-            pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
-            world_size=$(($tp*$pp))
-            if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
-                echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        if [ "$ON_CPU" == "1" ];then
+            if [[ $numa_count -lt $tp ]]; then
+                echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
                 continue
             fi
         else
@@ -290,7 +272,7 @@ run_throughput_tests() {
             fi
         fi
 
-        throughput_command=" $throughput_envs vllm bench throughput \
+        throughput_command=" $throughput_envs python3 benchmark_throughput.py \
             --output-json $RESULTS_FOLDER/${test_name}.json \
             $throughput_args"
 
@@ -315,46 +297,14 @@ run_throughput_tests() {
 }
 
 run_serving_tests() {
-    # run serving tests using `vllm bench serve` command
+    # run serving tests using `benchmark_serving.py`
     # $1: a json file specifying serving test cases
-    #
-    # Supported JSON formats:
-    # 1) Plain format: top-level array
-    #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-    #
-    # 2) Default parameters field + plain format tests
-    #    {
-    #      "defaults": { ... },
-    #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-    #    }
 
     local serving_test_file
     serving_test_file=$1
 
     # Iterate over serving tests
-    jq -c '
-        if type == "array" then
-            # Plain format: test cases array
-            .[]
-        elif (type == "object" and has("tests")) then
-            # merge the default parameters into each test cases
-            . as $root
-            | ($root.defaults // {}) as $d
-            | ($root.tests // [])[]
-            # default qps / max_concurrency from defaults if missing
-            | .qps_list = (.qps_list // $d.qps_list)
-            | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
-            # merge envs / params: test overrides defaults
-            | .server_environment_variables =
-                (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
-            | .server_parameters =
-                (($d.server_parameters // {}) + (.server_parameters // {}))
-            | .client_parameters =
-                (($d.client_parameters // {}) + (.client_parameters // {}))
-        else
-            error("Unsupported serving test file format: must be array or object with .tests")
-        end
-    ' "$serving_test_file" | while read -r params; do
+    jq -c '.[]' "$serving_test_file" | while read -r params; do
         # get the test name, and append the GPU type back to it.
         test_name=$(echo "$params" | jq -r '.test_name')
         if [[ ! "$test_name" =~ ^serving_ ]]; then
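The jq program removed above implements a shallow merge of per-test parameters over file-level defaults. The same semantics in Python, with illustrative values (the test name and parameter below are made up):

defaults = {"qps_list": [1, 4], "server_parameters": {"max_num_seqs": 256}}
test = {"test_name": "serving_example_tp1", "server_parameters": {"max_num_seqs": 512}}

merged = {
    **test,
    "qps_list": test.get("qps_list", defaults.get("qps_list")),
    "server_parameters": {
        **defaults.get("server_parameters", {}),
        **test.get("server_parameters", {}),  # test overrides defaults
    },
}
assert merged["server_parameters"]["max_num_seqs"] == 512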
@@ -368,36 +318,22 @@ run_serving_tests() {
             continue
         fi
 
-        # get client and server arguments (after merged the default parameters)
+        # get client and server arguments
         server_params=$(echo "$params" | jq -r '.server_parameters')
         server_envs=$(echo "$params" | jq -r '.server_environment_variables')
         client_params=$(echo "$params" | jq -r '.client_parameters')
 
         server_args=$(json2args "$server_params")
         server_envs=$(json2envs "$server_envs")
         client_args=$(json2args "$client_params")
 
-        # qps_list
         qps_list=$(echo "$params" | jq -r '.qps_list')
         qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
         echo "Running over qps list $qps_list"
 
-        # max_concurrency_list (fallback to num_prompts if missing)
-        max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
-        if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-            num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-            max_concurrency_list="[$num_prompts]"
-        fi
-        max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
-        echo "Running over max concurrency list $max_concurrency_list"
-
         # check if there is enough resources to run the test
         tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-        if [ "$ON_CPU" == "1" ]; then
-            pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
-            world_size=$(($tp*$pp))
-            if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
-                echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        if [ "$ON_CPU" == "1" ];then
+            if [[ $numa_count -lt $tp ]]; then
+                echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
                 continue
             fi
         else
@@ -415,7 +351,8 @@ run_serving_tests() {
             continue
         fi
 
-        server_command="$server_envs vllm serve \
+        server_command="$server_envs python3 \
+            -m vllm.entrypoints.openai.api_server \
             $server_args"
 
         # run the server
@@ -452,39 +389,35 @@ run_serving_tests() {
                 echo "now qps is $qps"
             fi
 
-            # iterate over different max_concurrency
-            for max_concurrency in $max_concurrency_list; do
-                new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
-                echo " new test name $new_test_name"
-                # pass the tensor parallel size to the client so that it can be displayed
-                # on the benchmark dashboard
-                client_command="vllm bench serve \
-                    --save-result \
-                    --result-dir $RESULTS_FOLDER \
-                    --result-filename ${new_test_name}.json \
-                    --request-rate $qps \
-                    --max-concurrency $max_concurrency \
-                    --metadata "tensor_parallel_size=$tp" \
-                    $client_args $client_remote_args "
-
-                echo "Running test case $test_name with qps $qps"
-                echo "Client command: $client_command"
-
-                bash -c "$client_command"
-
-                # record the benchmarking commands
-                jq_output=$(jq -n \
-                    --arg server "$server_command" \
-                    --arg client "$client_command" \
-                    --arg gpu "$gpu_type" \
-                    '{
-                    server_command: $server,
-                    client_command: $client,
-                    gpu_type: $gpu
-                    }')
-                echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-            done
+            new_test_name=$test_name"_qps_"$qps
+
+            # pass the tensor parallel size to the client so that it can be displayed
+            # on the benchmark dashboard
+            client_command="python3 benchmark_serving.py \
+                --save-result \
+                --result-dir $RESULTS_FOLDER \
+                --result-filename ${new_test_name}.json \
+                --request-rate $qps \
+                --metadata "tensor_parallel_size=$tp" \
+                $client_args $client_remote_args "
+
+            echo "Running test case $test_name with qps $qps"
+            echo "Client command: $client_command"
+
+            bash -c "$client_command"
+
+            # record the benchmarking commands
+            jq_output=$(jq -n \
+                --arg server "$server_command" \
+                --arg client "$client_command" \
+                --arg gpu "$gpu_type" \
+                '{
+                server_command: $server,
+                client_command: $client,
+                gpu_type: $gpu
+                }')
+            echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
         done
 
         # clean up
@@ -501,16 +434,20 @@ main() {
     ARCH='-cpu'
   else
     check_gpus
-    ARCH="$arch_suffix"
   fi
   check_hf_token

+  # Set to v1 to run v1 benchmark
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
   (which lsof) || (apt-get update && apt-get install -y lsof)

-  # get the current IP address, required by `vllm bench serve` command
+  # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn of the reporting of the status of each request, to clean up the terminal output
   export VLLM_LOGGING_LEVEL="WARNING"
@@ -520,12 +457,7 @@ main() {
   ensure_sharegpt_downloaded
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
-
-  # dump vllm info via vllm collect-env
-  env_output=$(vllm collect-env)
-
-  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
+  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

   # benchmarking
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import datetime
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "Successful req.",
+    "request_throughput": "Tput (req/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "std_ttft_ms": "Std TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "std_itl_ms": "Std ITL (ms)",
+    "median_itl_ms": "Median ITL (ms)",
+    "mean_tpot_ms": "Mean TPOT (ms)",
+    "std_tpot_ms": "Std TPOT (ms)",
+    "median_tpot_ms": "Median TPOT (ms)",
+    "total_token_throughput": "Total Token Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
+    "engine": "Engine",
+}
+
+if __name__ == "__main__":
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+        with open(test_file) as f:
+            raw_result = json.loads(f.read())
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands")) as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        serving_results.append(raw_result)
+        continue
+
+    serving_results = pd.DataFrame.from_dict(serving_results)
+
+    if not serving_results.empty:
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
+
+    serving_md_table_with_headers = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    # remove the first line of header
+    serving_md_table_lines = serving_md_table_with_headers.split("\n")
+    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
+
+    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+
+    # document benchmarking results in markdown
+    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+        # document results with header.
+        # for those who wants to reproduce our benchmark.
+        f.write(serving_md_table_with_headers)
+        f.write("\n")
+
+    # document benchmarking results in json
+    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+        results = serving_results.to_dict(orient="records")
+        f.write(json.dumps(results))
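This collector script joins each result file with the `.commands` sidecar written by the `jq` snippet earlier in this diff. A sketch of the layout it expects under `results/` (the test name is taken from the serving configs later in this diff; the exact file set depends on the qps list):

```bash
ls results/
# serving_llama8B_tp1_sharegpt_qps_1.json      <- benchmark result written by the client
# serving_llama8B_tp1_sharegpt_qps_1.commands  <- {server_command, client_command, gpu_type} sidecar
```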
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh (new file, 23 lines)
@@ -0,0 +1,23 @@
+#!/bin/sh
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+else
+  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+fi
+
+TIMEOUT_SECONDS=10
+
+retries=0
+while [ $retries -lt 1000 ]; do
+  if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
+    exit 0
+  fi
+
+  echo "Waiting for image to be available..."
+
+  retries=$((retries + 1))
+  sleep 5
+done
+
+exit 1
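The script exits 0 once the image manifest for the current commit returns HTTP 200, so it can gate a pull. A hypothetical invocation (the commit SHA is a placeholder; the registry name comes from the CI config shown elsewhere in this compare). Note that the file declares `#!/bin/sh` but uses the bash-only `[[ ]]` test, so it is invoked with bash here:

```bash
BUILDKITE_BRANCH=main BUILDKITE_COMMIT=abc123 \
    bash .buildkite/nightly-benchmarks/scripts/wait-for-image.sh \
  && docker pull "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:abc123"
```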
@@ -11,7 +11,9 @@
     },
     "vllm_server_parameters": {
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "gpu_memory_utilization": 0.9,
+        "num_scheduler_steps": 10,
         "max_num_seqs": 512,
         "dtype": "bfloat16"
     },
.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json (new file, 30 lines)
@@ -0,0 +1,30 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_llama8B_tp4",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
@@ -35,7 +35,9 @@
     },
     "vllm_server_parameters": {
         "disable_log_stats": "",
+        "disable_log_requests": "",
        "gpu_memory_utilization": 0.9,
+        "num_scheduler_steps": 10,
         "max_num_seqs": 512,
         "dtype": "bfloat16"
     },
@@ -88,7 +90,9 @@
     },
     "vllm_server_parameters": {
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "gpu_memory_utilization": 0.9,
+        "num_scheduler_steps": 10,
         "max_num_seqs": 512,
         "dtype": "bfloat16"
     },
@@ -141,7 +145,9 @@
     },
     "vllm_server_parameters": {
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "gpu_memory_utilization": 0.9,
+        "num_scheduler_steps": 10,
         "max_num_seqs": 512,
         "dtype": "bfloat16"
     },
@@ -191,7 +197,9 @@
     },
     "vllm_server_parameters": {
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "gpu_memory_utilization": 0.9,
+        "num_scheduler_steps": 10,
         "max_num_seqs": 512,
         "dtype": "bfloat16"
     },
@@ -243,7 +251,9 @@
     },
     "vllm_server_parameters": {
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "gpu_memory_utilization": 0.9,
+        "num_scheduler_steps": 10,
         "max_num_seqs": 512,
         "dtype": "bfloat16"
     },
@@ -295,7 +305,9 @@
     },
     "vllm_server_parameters": {
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "gpu_memory_utilization": 0.9,
+        "num_scheduler_steps": 10,
         "max_num_seqs": 512,
         "dtype": "bfloat16"
     },
.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json (new file, 158 lines)
@@ -0,0 +1,158 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "max_concurrency": 60,
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "max_concurrency": 60,
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "max_concurrency": 60,
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_random_1024_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 1024,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "max_concurrency": 100,
+            "num_prompts": 100
+        }
+    },
+    {
+        "test_name": "serving_llama8B_pp6_random_1024_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "pipeline_parallel_size": 6,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 1024,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "max_concurrency": 100,
+            "num_prompts": 100
+        }
+    }
+]
@@ -7,6 +7,7 @@
         "tensor_parallel_size": 1,
         "swap_space": 16,
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "load_format": "dummy"
     },
     "client_parameters": {
@@ -25,6 +26,7 @@
         "tensor_parallel_size": 4,
         "swap_space": 16,
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "load_format": "dummy"
     },
     "client_parameters": {
@@ -43,6 +45,7 @@
         "tensor_parallel_size": 2,
         "swap_space": 16,
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "load_format": "dummy"
     },
     "client_parameters": {
@@ -58,6 +61,7 @@
     "qps_list": [2],
     "server_parameters": {
         "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "disable_log_requests": "",
         "tensor_parallel_size": 4,
         "swap_space": 16,
         "speculative_config": {
@@ -0,0 +1,32 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_llama8B_tp4",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
@@ -1,193 +0,0 @@
-# vLLM benchmark suite
-
-## Introduction
-
-This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vLLM's performance.
-vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under the PyTorch CI HUD.
-
-## Performance benchmark quick overview
-
-**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
-
-**Benchmarking Duration**: about 1 hr.
-
-**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
-
-## Trigger the benchmark
-
-The benchmark needs to be triggered manually:
-
-```bash
-bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
-```
-
-Runtime environment variables (an example invocation follows this list):
-
-- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
-- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
-- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
-- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
-- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
-- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
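One possible invocation combining these variables on an Intel Xeon machine, pointing the serving stage at the CPU-specific test file (both values come from files in this compare; this is a sketch, not the only supported form):

```bash
ON_CPU=1 SERVING_JSON=serving-tests-cpu.json \
    bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```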
-
-## Performance benchmark details
-
-See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
-
-> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
-
-### Latency test
-
-Here is an example of one test inside `latency-tests.json`:
-
-```json
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
-```
-
-In this example:
-
-- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute controls the command line arguments used for `vllm bench latency`. Note: use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency` (see the sketch after this list). For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
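A minimal sketch of that underscore-to-dash conversion, assuming `jq` is available (the real logic lives in `run-performance-benchmarks.sh` and may differ in detail):

```bash
json='{"model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1, "num_iters": 15}'
args=""
# turn each "key": value pair into a --key value flag, replacing _ with -
while IFS='=' read -r key value; do
  args="$args --${key//_/-} $value"
done < <(echo "$json" | jq -r 'to_entries[] | "\(.key)=\(.value)"')
echo "vllm bench latency$args"
# -> vllm bench latency --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --num-iters 15
```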
-
-Note that the performance numbers are highly sensitive to the values of the parameters. Please make sure the parameters are set correctly.
-
-WARNING: The benchmarking script will save json results by itself, so please do not configure the `--output-json` parameter in the json file.
-
-### Throughput test
-
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed forward to `vllm bench throughput`.
-
-The number from this test is also stable -- because the metric is stable, even a slight change in this number can reflect a real shift in the performance numbers.
-
-### Serving test
-
-We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
-
-```json
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    }
-]
-```
-
-Inside this example:
-
-- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
-- The `server_parameters` attribute includes the command line arguments for the vLLM server.
-- The `client_parameters` attribute includes the command line arguments for `vllm bench serve`.
-- The `qps_list` attribute controls the list of QPS values for the test. It is used to configure the `--request-rate` parameter in `vllm bench serve`.
-
-The number from this test is less stable compared to the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still indicates a real difference in performance.
-
-WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
-
-#### Default Parameters Field
-
-We can specify default parameters in a JSON field with key `defaults`. Parameters defined in that field are applied globally to all serving tests, and can be overridden in the individual test case fields. Here is an example (a jq sketch of the merge follows it):
-
-<details>
-<summary>An example of the default parameters field</summary>
-
-```json
-{
-    "defaults": {
-        "qps_list": [
-            "inf"
-        ],
-        "server_environment_variables": {
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
-        },
-        "server_parameters": {
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "block_size": 128,
-            "disable_log_stats": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "num_prompts": 200,
-            "ignore-eos": ""
-        }
-    },
-    "tests": [
-        {
-            "test_name": "serving_llama3B_tp2_random_128_128",
-            "server_parameters": {
-                "model": "meta-llama/Llama-3.2-3B-Instruct",
-                "tensor_parallel_size": 2
-            },
-            "client_parameters": {
-                "model": "meta-llama/Llama-3.2-3B-Instruct"
-            }
-        },
-        {
-            "test_name": "serving_qwen3_tp4_random_128_128",
-            "server_parameters": {
-                "model": "Qwen/Qwen3-14B",
-                "tensor_parallel_size": 4
-            },
-            "client_parameters": {
-                "model": "Qwen/Qwen3-14B"
-            }
-        }
-    ]
-}
-```
-
-</details>
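The merge semantics described above can be illustrated with jq's recursive object merge, where the right-hand operand wins on conflicts (illustrative only; the suite's own merge logic may differ):

```bash
# overlay the first test's server_parameters on top of the "defaults" block
jq '.defaults.server_parameters * .tests[0].server_parameters' tests/serving-tests.json
# tensor_parallel_size comes out as 2 (the per-test override), while dtype,
# block_size, load_format, etc. fall through from "defaults".
```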
-
-### Visualizing the results
-
-The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with the real benchmarking results.
-You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
-If you do not see the table, please wait until the benchmark finishes running.
-The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
-The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job.
-
-The `compare-json-results.py` script compares benchmark-results JSON files converted using `convert-results-json-to-markdown.py`.
-When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and reports performance ratios, e.g. for Output Tput, Median TTFT and Median TPOT.
-If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations inside that file.
-
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps:
-
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-
-|    | Model                                 | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
-|----|---------------------------------------|--------------|-----------|------------|----------------------|-----|----------------------------------|----------------------------------|------------|
-| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random       | 128       | 128        | 1000                 | 1   | 142.633982                       | 156.526018                       | 1.097396   |
-| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random       | 128       | 128        | 1000                 | inf | 241.620334                       | 294.018783                       | 1.216863   |
-
-A comparison diagram will be generated below the table.
-Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3:
-
-<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
@@ -1,456 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import json
-import os
-from importlib import util
-
-import pandas as pd
-
-pd.options.display.float_format = "{:.2f}".format
-plotly_found = util.find_spec("plotly.express") is not None
-
-
-def compare_data_columns(
-    files, name_column, data_column, info_cols, drop_column, debug=False
-):
-    """
-    Align concatenation by keys derived from info_cols instead of row order.
-    - Pick one canonical key list: subset of info_cols present in ALL files.
-    - For each file: set index to those keys, aggregate duplicates
-    - (mean for metric, first for names).
-    - Concat along axis=1 (indexes align), then reset_index so callers can
-    - group by columns.
-    - If --debug, add a <file_label>_name column per file.
-    """
-    print("\ncompare_data_column:", data_column)
-
-    frames = []
-    raw_data_cols = []
-    compare_frames = []
-
-    # 1) choose a canonical key list from info_cols that exists in ALL files
-    cols_per_file = []
-    for f in files:
-        try:
-            df_tmp = pd.read_json(f, orient="records")
-        except Exception as err:
-            raise ValueError(f"Failed to read {f}") from err
-        cols_per_file.append(set(df_tmp.columns))
-
-    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
-    if not key_cols:
-        # soft fallback: use any info_cols present in the first file
-        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
-        if not key_cols:
-            raise ValueError(
-                "No common key columns found from info_cols across the input files."
-            )
-
-    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
-    meta_added = False
-
-    for file in files:
-        df = pd.read_json(file, orient="records")
-
-        # Keep rows that actually have the compared metric (same as original behavior)
-        if drop_column in df.columns:
-            df = df.dropna(subset=[drop_column], ignore_index=True)
-
-        # Stabilize numeric key columns (harmless if missing)
-        for c in (
-            "Input Len",
-            "Output Len",
-            "TP Size",
-            "PP Size",
-            "# of max concurrency.",
-            "qps",
-        ):
-            if c in df.columns:
-                df[c] = pd.to_numeric(df[c], errors="coerce")
-
-        # Ensure all key columns exist
-        for c in key_cols:
-            if c not in df.columns:
-                df[c] = pd.NA
-
-        # Set index = key_cols and aggregate duplicates → unique MultiIndex
-        df_idx = df.set_index(key_cols, drop=False)
-
-        # meta (key columns), unique per key
-        meta = df_idx[key_cols]
-        if not meta.index.is_unique:
-            meta = meta.groupby(level=key_cols, dropna=False).first()
-
-        # metric series for this file, aggregated to one row per key
-        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
-        if not s.index.is_unique:
-            s = s.groupby(level=key_cols, dropna=False).mean()
-        s.name = file_label  # column label like original
-
-        # add meta once (from first file) so keys are the leftmost columns
-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
-
-        # (NEW) debug: aligned test-name column per file
-        if debug and name_column in df_idx.columns:
-            name_s = df_idx[name_column]
-            if not name_s.index.is_unique:
-                name_s = name_s.groupby(level=key_cols, dropna=False).first()
-            name_s.name = f"{file_label}_name"
-            frames.append(name_s)
-
-        frames.append(s)
-        raw_data_cols.append(file_label)
-        compare_frames.append(s)
-
-        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
-        if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            if "P99" in data_column or "Median" in data_column:
-                ratio = base / current  # for latency
-            else:
-                ratio = current / base
-            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
-            frames.append(ratio)
-
-    # 4) concat on columns with aligned MultiIndex;
-    # then reset_index to return keys as columns
-    concat_df = pd.concat(frames, axis=1)
-    concat_df = concat_df.reset_index(drop=True).reset_index()
-    if "index" in concat_df.columns:
-        concat_df = concat_df.drop(columns=["index"])
-
-    # Ensure key/info columns appear first (in your info_cols order)
-    front = [c for c in info_cols if c in concat_df.columns]
-    rest = [c for c in concat_df.columns if c not in front]
-    concat_df = concat_df[front + rest]
-
-    print(raw_data_cols)
-    return concat_df, raw_data_cols
-
-
-def split_json_by_tp_pp(
-    input_file: str = "benchmark_results.json", output_root: str = "."
-) -> list[str]:
-    """
-    Split a benchmark JSON into separate folders by (TP Size, PP Size).
-
-    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
-    Returns: list of file paths written.
-    """
-    # Load JSON data into DataFrame
-    with open(input_file, encoding="utf-8") as f:
-        data = json.load(f)
-
-    # If the JSON is a dict with a list under common keys, use that list
-    if isinstance(data, dict):
-        for key in ("results", "serving_results", "benchmarks", "data"):
-            if isinstance(data.get(key), list):
-                data = data[key]
-                break
-
-    df = pd.DataFrame(data)
-
-    # Keep only "serving" tests
-    name_col = next(
-        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
-    )
-    if name_col:
-        df = df[
-            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
-        ].copy()
-
-    # Handle alias column names
-    rename_map = {
-        "tp_size": "TP Size",
-        "tensor_parallel_size": "TP Size",
-        "pp_size": "PP Size",
-        "pipeline_parallel_size": "PP Size",
-    }
-    df.rename(
-        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
-    )
-
-    # Ensure TP/PP columns exist (default to 1 if missing)
-    if "TP Size" not in df.columns:
-        df["TP Size"] = 1
-    if "PP Size" not in df.columns:
-        df["PP Size"] = 1
-
-    # make sure TP/PP are numeric ints with no NaN
-    df["TP Size"] = (
-        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-    df["PP Size"] = (
-        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-
-    # Split into separate folders
-    saved_paths: list[str] = []
-    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
-        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
-        os.makedirs(folder_name, exist_ok=True)
-        filepath = os.path.join(folder_name, "benchmark_results.json")
-        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
-        print(f"Saved: {filepath}")
-        saved_paths.append(filepath)
-
-    return saved_paths
-
-
-def _add_limit_line(fig, y_value, label):
-    # Visible dashed line + annotation
-    fig.add_hline(
-        y=y_value,
-        line_dash="dash",
-        line_color="red" if "ttft" in label.lower() else "blue",
-        annotation_text=f"{label}: {y_value} ms",
-        annotation_position="top left",
-    )
-    # Optional: add a legend item (as a transparent helper trace)
-    if plot and plotly_found:
-        import plotly.graph_objects as go
-
-        fig.add_trace(
-            go.Scatter(
-                x=[None],
-                y=[None],
-                mode="lines",
-                line=dict(
-                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
-                ),
-                name=f"{label}",
-            )
-        )
-
-
-def _find_concurrency_col(df: pd.DataFrame) -> str:
-    for c in [
-        "# of max concurrency.",
-        "# of max concurrency",
-        "Max Concurrency",
-        "max_concurrency",
-        "Concurrency",
-    ]:
-        if c in df.columns:
-            return c
-    # Fallback: guess an integer-like column (harmless if unused)
-    for c in df.columns:
-        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
-            return c
-    return "# of max concurrency."
-
-
-def _highlight_threshold(
-    df: pd.DataFrame, threshold: float
-) -> "pd.io.formats.style.Styler":
-    """Highlight numeric per-configuration columns with value <= threshold."""
-    conc_col = _find_concurrency_col(df)
-    key_cols = [
-        c
-        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
-        if c in df.columns
-    ]
-    conf_cols = [
-        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
-    ]
-    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
-    return df.style.map(
-        lambda v: "background-color:#e6ffe6;font-weight:bold;"
-        if pd.notna(v) and v <= threshold
-        else "",
-        subset=conf_cols,
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-f", "--file", action="append", type=str, help="input file name"
-    )
-    parser.add_argument(
-        "--debug", action="store_true", help="show all information for debugging"
-    )
-    parser.add_argument(
-        "--plot",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help="plot perf diagrams or not --no-plot --plot",
-    )
-    parser.add_argument(
-        "-x",
-        "--xaxis",
-        type=str,
-        default="# of max concurrency.",
-        help="column name to use as X Axis in comparison graph",
-    )
-    parser.add_argument(
-        "-l",
-        "--latency",
-        type=str,
-        default="p99",
-        help="take median|p99 for latency like TTFT/TPOT",
-    )
-    parser.add_argument(
-        "--ttft-max-ms",
-        type=float,
-        default=3000.0,
-        help="Reference limit for TTFT plots (ms)",
-    )
-    parser.add_argument(
-        "--tpot-max-ms",
-        type=float,
-        default=100.0,
-        help="Reference limit for TPOT plots (ms)",
-    )
-
-    args = parser.parse_args()
-
-    drop_column = "P99"
-    name_column = "Test name"
-    info_cols = [
-        "Model",
-        "Dataset Name",
-        "Input Len",
-        "Output Len",
-        "TP Size",
-        "PP Size",
-        "# of max concurrency.",
-        "qps",
-    ]
-
-    if "median" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "Median TTFT /n",
-            "Median TPOT /n",
-        ]
-        drop_column = "P99"
-    elif "p99" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "P99 TTFT /n",
-            "P99 TPOT /n",
-        ]
-
-    if len(args.file) == 1:
-        files = split_json_by_tp_pp(args.file[0], output_root="splits")
-        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
-    else:
-        files = args.file
-    print("comparing : " + ", ".join(files))
-    debug = args.debug
-    plot = args.plot
-    # For Plot feature, assign y axis from one of info_cols
-    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
-    with open("perf_comparison.html", "w") as text_file:
-        for i in range(len(data_cols_to_compare)):
-            output_df, raw_data_cols = compare_data_columns(
-                files,
-                name_column,
-                data_cols_to_compare[i],
-                info_cols,
-                drop_column,
-                debug=debug,
-            )
-
-            # For Plot feature, insert y axis from one of info_cols
-            raw_data_cols.insert(0, info_cols[y_axis_index])
-
-            filtered_info_cols = info_cols[:-2]
-            existing_group_cols = [
-                c for c in filtered_info_cols if c in output_df.columns
-            ]
-            if not existing_group_cols:
-                raise ValueError(
-                    f"No valid group-by columns "
-                    f"Expected subset: {filtered_info_cols}, "
-                    f"but DataFrame has: {list(output_df.columns)}"
-                )
-            # output_df_sorted = output_df.sort_values(by=existing_group_cols)
-            output_df_sorted = output_df.sort_values(by=args.xaxis)
-            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
-            for name, group in output_groups:
-                group_name = (
-                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
-                )
-                group_html_name = "perf_comparison_" + group_name + ".html"
-
-                metric_name = str(data_cols_to_compare[i]).lower()
-                if "tok/s" in metric_name:
-                    html = group.to_html()
-                elif "ttft" in metric_name:
-                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
-                    )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
-                    )
-                elif (
-                    "tpot" in metric_name
-                    or "median" in metric_name
-                    or "p99" in metric_name
-                ):
-                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
-                    )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
-                    )
-
-                text_file.write(html_msgs_for_data_cols[i])
-                text_file.write(html)
-                with open(group_html_name, "a+") as sub_text_file:
-                    sub_text_file.write(html_msgs_for_data_cols[i])
-                    sub_text_file.write(html)
-
-                    if plot and plotly_found:
-                        import plotly.express as px
-
-                        df = group[raw_data_cols]
-                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                        # Melt DataFrame for plotting
-                        df_melted = df_sorted.melt(
-                            id_vars=info_cols[y_axis_index],
-                            var_name="Configuration",
-                            value_name=data_cols_to_compare[i],
-                        )
-                        title = (
-                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                        )
-                        # Create Plotly line chart
-                        fig = px.line(
-                            df_melted,
-                            x=info_cols[y_axis_index],
-                            y=data_cols_to_compare[i],
-                            color="Configuration",
-                            title=title,
-                            markers=True,
-                        )
-
-                        # ---- Add threshold lines based on metric name ----
-                        if "ttft" in metric_name:
-                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
-                        elif (
-                            "tpot" in metric_name
-                            or "median" in metric_name
-                            or "p99" in metric_name
-                        ):
-                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
-
-                        # Export to HTML
-                        text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
-                        sub_text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
@@ -1,26 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp2",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
@@ -1,55 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15,
-            "max-model-len": 256,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "latency_llama70B_tp4",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15,
-            "max-model-len": 256,
-            "async-scheduling": ""
-        }
-    },
-    {
-        "test_name": "latency_mixtral8x7B_tp2",
-        "environment_variables": {
-            "PT_HPU_LAZY_MODE": 1,
-            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
-            "VLLM_CONTIGUOUS_PA": 1,
-            "VLLM_DEFRAG": 1
-        },
-        "parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15,
-            "max-model-len": 256,
-            "async-scheduling": ""
-        }
-    }
-]
@@ -1,246 +0,0 @@
-{
-    "defaults": {
-        "qps_list": [
-            "inf"
-        ],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "ignore-eos": "",
-            "num_prompts": 200
-        }
-    },
-    "tests": [
-        {
-            "test_name": "serving_llama8B_tp1_sharegpt",
-            "server_parameters": {
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "dataset_name": "sharegpt",
-                "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp2_sharegpt",
-            "server_parameters": {
-                "tensor_parallel_size": 2
-            },
-            "client_parameters": {
-                "dataset_name": "sharegpt",
-                "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp1_random_128_128",
-            "server_parameters": {
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp2_random_128_128",
-            "server_parameters": {
-                "tensor_parallel_size": 2
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp4_random_128_128",
-            "server_parameters": {
-                "tensor_parallel_size": 4
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp1_random_128_2048",
-            "server_parameters": {
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 2048
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp2_random_128_2048",
-            "server_parameters": {
-                "tensor_parallel_size": 2
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 2048
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp4_random_128_2048",
-            "server_parameters": {
-                "tensor_parallel_size": 4
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 2048
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp1_random_2048_128",
-            "server_parameters": {
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 2048,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp2_random_2048_128",
-            "server_parameters": {
-                "tensor_parallel_size": 2
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 2048,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_llama8B_tp4_random_2048_128",
-            "server_parameters": {
-                "tensor_parallel_size": 4
-            },
-            "client_parameters": {
-                "dataset_name": "random",
-                "random-input-len": 2048,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_llama3B_tp1_random_128_128",
-            "server_parameters": {
-                "model": "meta-llama/Llama-3.2-3B-Instruct",
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "model": "meta-llama/Llama-3.2-3B-Instruct",
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_granite2B_tp1_random_128_128",
-            "server_parameters": {
-                "model": "ibm-granite/granite-3.2-2b-instruct",
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "model": "ibm-granite/granite-3.2-2b-instruct",
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_qwen1.7B_tp1_random_128_128",
-            "server_parameters": {
-                "model": "Qwen/Qwen3-1.7B",
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "model": "Qwen/Qwen3-1.7B",
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_qwen4B_tp1_random_128_128",
-            "server_parameters": {
-                "model": "Qwen/Qwen3-4B",
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "model": "Qwen/Qwen3-4B",
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_qwen8B_tp1_random_128_128",
-            "server_parameters": {
-                "model": "Qwen/Qwen3-8B",
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "model": "Qwen/Qwen3-8B",
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_glm9B_tp1_random_128_128",
-            "server_parameters": {
-                "model": "zai-org/glm-4-9b-hf",
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "model": "zai-org/glm-4-9b-hf",
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        },
-        {
-            "test_name": "serving_gemma7B_tp1_random_128_128",
-            "server_parameters": {
-                "model": "google/gemma-7b",
-                "tensor_parallel_size": 1
-            },
-            "client_parameters": {
-                "model": "google/gemma-7b",
-                "dataset_name": "random",
-                "random-input-len": 128,
-                "random-output-len": 128
-            }
-        }
-    ]
-}
@@ -1,82 +0,0 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
@@ -1,27 +0,0 @@
[
  {
    "test_name": "throughput_llama8B_tp2",
    "environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
@@ -1,61 +0,0 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "throughput_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "throughput_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  }
]
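A note on how these benchmark JSON entries are typically consumed (a sketch under assumptions, not the actual harness): each key under a "parameters"/"server_parameters" block is expanded into a CLI flag, with an empty-string value acting as a bare boolean switch — which is why entries such as "disable_log_stats": "" and "async-scheduling": "" carry no value, and why hyphenated and underscored spellings both appear. A minimal jq-based illustration:

```bash
# Sketch only: expand a "parameters" object into CLI flags. The command name
# ("vllm bench throughput") and the underscore-to-hyphen normalization are
# assumptions, not taken from the deleted files.
params='{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "disable_log_stats": ""}'
args=""
while IFS=$'\t' read -r key value; do
  key="${key//_/-}"        # normalize underscores to hyphens
  if [ -z "$value" ]; then
    args="$args --$key"    # empty string => boolean switch
  else
    args="$args --$key $value"
  fi
done < <(echo "$params" | jq -r 'to_entries[] | [.key, (.value | tostring)] | @tsv')
echo "vllm bench throughput$args"
```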
46  .buildkite/pyproject.toml  Normal file
@@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true
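Since ruff resolves its configuration from the nearest pyproject.toml above each file, placing this file in .buildkite/ makes the overrides apply automatically whenever that directory is linted or formatted; no extra flags should be needed (the invocation below is illustrative):

```bash
# ruff discovers .buildkite/pyproject.toml on its own for these paths
ruff check .buildkite/
ruff format .buildkite/
```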
@@ -1,136 +1,90 @@
 steps:
-  # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.9"
-    depends_on: ~
-    id: build-wheel-arm64-cuda-12-9
+  - label: "Build wheel - CUDA 12.8"
+    id: build-wheel-cuda-12-8
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: cpu_queue_postmerge
     commands:
-      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build arm64 wheel - CUDA 13.0"
-    depends_on: ~
-    id: build-wheel-arm64-cuda-13-0
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # aarch64 build
-  - label: "Build arm64 CPU wheel"
-    depends_on: ~
-    id: build-wheel-arm64-cpu
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
-    depends_on: ~
-    id: build-wheel-cuda-12-9
+  - label: "Build wheel - CUDA 12.6"
+    id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
+      - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 13.0"
-    depends_on: ~
-    id: build-wheel-cuda-13-0
+  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
+  # However, this block can be uncommented to save some compute hours.
+  # - block: "Build CUDA 11.8 wheel"
+  #   key: block-build-cu118-wheel
+
+  - label: "Build wheel - CUDA 11.8"
+    # depends_on: block-build-cu118-wheel
+    id: build-wheel-cuda-11-8
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  # x86 CPU wheel build
-  - label: "Build x86 CPU wheel"
+  - block: "Build release image"
     depends_on: ~
-    id: build-wheel-x86-cpu
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
-    env:
-      DOCKER_BUILDKIT: "1"
+    key: block-release-image-build
 
-  # Build release images (12.9)
-  - label: "Build release image (x86)"
-    depends_on: ~
-    id: build-release-image-x86
+  - label: "Build release image"
+    depends_on: block-release-image-build
+    id: build-release-image
    agents:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-      # re-tag to default image tag and push, just in case arm64 build fails
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  - label: "Build release image (arm64)"
-    depends_on: ~
-    id: build-release-image-arm64
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-
-  # Add job to create multi-arch manifest
-  - label: "Create multi-arch manifest"
-    depends_on:
-      - build-release-image-x86
-      - build-release-image-arm64
-    id: create-multi-arch-manifest
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
-      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
   - label: "Annotate release workflow"
     depends_on:
-      - create-multi-arch-manifest
+      - build-release-image
+      - build-wheel-cuda-12-8
+      - build-wheel-cuda-12-6
+      - build-wheel-cuda-11-8
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"
 
+  - label: "Build and publish TPU release image"
+    depends_on: ~
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: tpu_queue_postmerge
+    commands:
+      - "yes | docker system prune -a"
+      - "git fetch --all"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
+      - "docker push vllm/vllm-tpu:nightly"
+      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - input: "Provide Release version here"
     id: input-release-version
     fields:
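For reference, the per-architecture release jobs deleted above follow the standard Docker multi-arch pattern: each builder pushes an image under an arch-suffixed tag, and a follow-up job stitches the tags into a single manifest. A condensed sketch of that flow, with repo and tag names taken from the deleted commands:

```bash
REPO=public.ecr.aws/q9t5s3a7/vllm-release-repo
# on each builder (x86_64 and aarch64), tag with the machine architecture
docker build -t "$REPO:$BUILDKITE_COMMIT-$(uname -m)" -f docker/Dockerfile .
docker push "$REPO:$BUILDKITE_COMMIT-$(uname -m)"
# once both arch images exist, publish one multi-arch tag
docker manifest create "$REPO:$BUILDKITE_COMMIT" \
  "$REPO:$BUILDKITE_COMMIT-x86_64" "$REPO:$BUILDKITE_COMMIT-aarch64" --amend
docker manifest push "$REPO:$BUILDKITE_COMMIT"
```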
@@ -147,52 +101,24 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build arm64 CPU release image"
-    key: block-arm64-cpu-release-image-build
+  - block: "Build Neuron release image"
+    key: block-neuron-release-image-build
     depends_on: ~
 
-  - label: "Build and publish arm64 CPU release image"
-    depends_on: block-arm64-cpu-release-image-build
+  - label: "Build and publish Neuron release image"
+    depends_on: block-neuron-release-image-build
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: neuron-postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
-
-  - label: "Build and publish nightly multi-arch image to DockerHub"
-    depends_on:
-      - create-multi-arch-manifest
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      # Clean up old nightly builds (keep only last 14)
-      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-      DOCKERHUB_USERNAME: "vllmbot"
@@ -2,53 +2,30 @@
 
 set -ex
 
-# Get release version, default to 1.0.0.dev for nightly/per-commit builds
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
-if [ -z "${RELEASE_VERSION}" ]; then
-    RELEASE_VERSION="1.0.0.dev"
+# Get release version and strip leading 'v' if present
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+
+if [ -z "$RELEASE_VERSION" ]; then
+    echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
+    exit 1
 fi
 
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel (by commit):
-\`\`\`
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
-
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-\`\`\`
-
-To download the wheel (by version):
+To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
 
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
-docker push vllm/vllm-openai:latest-x86_64
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker push vllm/vllm-openai:latest-aarch64
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-
-docker manifest rm vllm/vllm-openai:latest
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker manifest push vllm/vllm-openai:latest
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
+docker push vllm/vllm-openai:latest
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF
@@ -1,120 +0,0 @@
#!/bin/bash

set -ex

# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix

# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
fi

if [ -z "$DOCKERHUB_USERNAME" ]; then
    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
    exit 1
fi

# Get DockerHub bearer token
echo "Getting DockerHub bearer token..."
set +x
BEARER_TOKEN=$(curl -s -X POST \
    -H "Content-Type: application/json" \
    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
    "https://hub.docker.com/v2/users/login" | jq -r '.token')
set -x

if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
    echo "Error: Failed to get DockerHub bearer token"
    exit 1
fi

# Function to get all tags from DockerHub
get_all_tags() {
    local page=1
    local all_tags=""

    while true; do
        set +x
        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
        set -x

        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')

        if [ -z "$tags" ]; then
            break
        fi

        all_tags="$all_tags$tags"$'\n'
        page=$((page + 1))
    done

    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
}

delete_tag() {
    local tag_name="$1"
    echo "Deleting tag: $tag_name"

    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
    set +x
    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
    set -x

    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
    else
        echo "Successfully deleted tag: $tag_name"
    fi
}

# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
echo "Fetching all tags from DockerHub..."
all_tags=$(get_all_tags)

if [ -z "$all_tags" ]; then
    echo "No tags found to clean up"
    exit 0
fi

# Count total tags
total_tags=$(echo "$all_tags" | wc -l)
echo "Found $total_tags tags"

# Keep only the last 14 builds (including the current one)
tags_to_keep=14
tags_to_delete=$((total_tags - tags_to_keep))

if [ $tags_to_delete -le 0 ]; then
    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
    exit 0
fi

echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"

# Get tags to delete (skip the first $tags_to_keep tags)
tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))

if [ -z "$tags_to_delete_list" ]; then
    echo "No tags to delete"
    exit 0
fi

# Delete old tags
echo "Deleting old tags..."
while IFS= read -r tag; do
    if [ -n "$tag" ]; then
        delete_tag "$tag"
        # Add a small delay to avoid rate limiting
        sleep 1
    fi
done <<< "$tags_to_delete_list"

echo "Cleanup completed successfully"
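The deleted cleanup script appears to have been driven by the nightly DockerHub publish job shown earlier; a minimal manual invocation would presumably look like this (credential values are placeholders):

```bash
export DOCKERHUB_USERNAME="vllmbot"
export DOCKERHUB_TOKEN="<access-token>"   # placeholder; never commit real tokens
bash .buildkite/scripts/cleanup-nightly-builds.sh
```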
@@ -1,400 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# do not complain about line length (for docstring)
# ruff: noqa: E501

import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote

import regex as re

if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""


@dataclass
class WheelFileInfo:
    package_name: str
    version: str
    build_tag: str | None
    python_tag: str
    abi_tag: str
    platform_tag: str
    variant: str | None
    filename: str


def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Parse wheel file name to extract metadata.

    The format of wheel names:
    {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
    Example:
        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
    """
    wheel_file_re = re.compile(
        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
    )
    match = wheel_file_re.match(file)
    if not match:
        raise ValueError(f"Invalid wheel file name: {file}")

    package_name = match.group("package_name")
    version = match.group("version")
    build_tag = match.group("build_tag")
    python_tag = match.group("python_tag")
    abi_tag = match.group("abi_tag")
    platform_tag = match.group("platform_tag")

    # extract variant from version
    variant = None
    if "dev" in version:
        ver_after_dev = version.split("dev")[-1]
        if "." in ver_after_dev:
            variant = ver_after_dev.split(".")[-1]
            version = version.removesuffix("." + variant)
    else:
        if "+" in version:
            version, variant = version.split("+")

    return WheelFileInfo(
        package_name=package_name,
        version=version,
        build_tag=build_tag,
        python_tag=python_tag,
        abi_tag=abi_tag,
        platform_tag=platform_tag,
        variant=variant,
        filename=file,
    )


def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Generate project list HTML content linking to each project & variant sub-directory.
    """
    href_tags = []
    for name in sorted(subdir_names):
        name = name.strip("/").strip(".")
        href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)


def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Generate package index HTML content for a specific package, linking to actual wheel files.
    """
    href_tags = []
    metadata = []
    for file in sorted(wheel_files, key=lambda x: x.filename):
        relative_path = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
        )
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
        href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
        file_meta = asdict(file)
        file_meta["path"] = file_path_quoted
        metadata.append(file_meta)
    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
    metadata_str = json.dumps(metadata, indent=2)
    return index_str, metadata_str


def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
):
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str | None): Optional comment to include in the generated HTML files.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
        index_base_dir/  (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html      # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html      # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json   # machine-readable metadata for all wheels in this package
            cpu/            # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/          # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/          # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
    [
        {
            "package_name": "vllm",
            "version": "0.10.2rc2",
            "build_tag": null,
            "python_tag": "cp38",
            "abi_tag": "abi3",
            "platform_tag": "manylinux2014_aarch64",
            "variant": "cu129",
            "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
            "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # to be concatenated with the directory URL and URL-encoded
        },
        ...
    ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant = file.variant or "default"
        if variant not in variant_to_files:
            variant_to_files[variant] = []
        variant_to_files[variant].append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate comment in HTML header
    comment_str = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"

    # Generate index for each variant
    subdir_names = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant
        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names = subdir_names.union(packages)
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)

        for package in packages:
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir, comment
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)


if __name__ == "__main__":
    """
    Arguments:
        --version <version> : version string for the current build (e.g., commit hash)
        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
        --output-dir <output_directory> : directory to store generated index files
        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
        --comment <comment_string> : (optional) comment string to include in generated HTML files
    """

    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    parser.add_argument(
        "--version",
        type=str,
        required=True,
        help="Version string for the current build (e.g., commit hash)",
    )
    parser.add_argument(
        "--current-objects",
        type=str,
        required=True,
        help="Path to JSON file containing current S3 objects listing in this version directory",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to store generated index files",
    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )
    parser.add_argument(
        "--comment",
        type=str,
        default="",
        help="Optional comment string to include in generated HTML files",
    )

    args = parser.parse_args()

    version = args.version
    if "/" in version or "\\" in version:
        raise ValueError("Version string must not contain slashes.")
    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    # Read current objects JSON
    with open(current_objects_path) as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)

    # current_objects looks like from list_objects_v2 S3 API:
    """
    "Contents": [
        {
            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
            "LastModified": "2025-11-28T14:00:32+00:00",
            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
            "ChecksumAlgorithm": [
                "CRC64NVME"
            ],
            "ChecksumType": "FULL_OBJECT",
            "Size": 435649349,
            "StorageClass": "STANDARD"
        },
        ...
    ]
    """

    # Extract wheel file keys
    wheel_files = []
    for item in current_objects.get("Contents", []):
        key: str = item["Key"]
        if key.endswith(".whl"):
            wheel_files.append(key.split("/")[-1])  # only the filename is used

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # keep only "official" files for a non-nightly version (specified by cli args)
    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
    if PY_VERSION_RE.match(version):
        # upload-wheels.sh ensures no "dev" is in args.version
        wheel_files = list(
            filter(lambda x: version in x and "dev" not in x, wheel_files)
        )
        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
    else:
        print("Nightly version detected, keeping all wheel files.")

    # Generate index and metadata, assuming wheels and indices are stored as:
    #   s3://vllm-wheels/{version}/<wheel files>
    #   s3://vllm-wheels/<anything>/<index files>
    wheel_base_dir = Path(output_dir).parent / version
    index_base_dir = Path(output_dir)

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
        comment=args.comment.strip(),
    )
    print(f"Successfully generated index and metadata in {output_dir}")
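Putting the documented arguments together, a plausible end-to-end invocation of the deleted index generator (the script path and the S3 listing step are assumptions inferred from the docstrings above):

```bash
# List the wheel objects for one version directory, then build the indices.
aws s3api list-objects-v2 --bucket vllm-wheels --prefix "${BUILDKITE_COMMIT}/" > objects.json
python3 .buildkite/generate_nightly_index.py \
  --version "${BUILDKITE_COMMIT}" \
  --current-objects objects.json \
  --output-dir ./index \
  --alias-to-default cu129 \
  --comment "nightly index"
```

Consumers would then point pip at a hosted variant directory, e.g. `pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cu129/`, following the hosting layout described in the docstring.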
@@ -78,13 +78,21 @@ HF_MOUNT="/root/.cache/huggingface"
|
|||||||
commands=$@
|
commands=$@
|
||||||
echo "Commands:$commands"
|
echo "Commands:$commands"
|
||||||
|
|
||||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
|
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
|
||||||
|
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
|
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||||
|
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||||
|
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
||||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||||
@@ -100,6 +108,7 @@ fi
 if [[ $commands == *" kernels/attention"* ]]; then
     commands="${commands} \
     --ignore=kernels/attention/test_attention_selector.py \
+    --ignore=kernels/attention/test_blocksparse_attention.py \
     --ignore=kernels/attention/test_encoder_decoder_attn.py \
     --ignore=kernels/attention/test_flash_attn.py \
     --ignore=kernels/attention/test_flashinfer.py \
@@ -113,6 +122,7 @@ fi
 if [[ $commands == *" kernels/quantization"* ]]; then
     commands="${commands} \
     --ignore=kernels/quantization/test_int8_quant.py \
+    --ignore=kernels/quantization/test_aqlm.py \
     --ignore=kernels/quantization/test_machete_mm.py \
     --ignore=kernels/quantization/test_block_fp8.py \
     --ignore=kernels/quantization/test_block_int8.py \
@@ -141,6 +151,7 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
     --ignore=entrypoints/openai/test_audio.py \
     --ignore=entrypoints/openai/test_shutdown.py \
     --ignore=entrypoints/openai/test_completion.py \
+    --ignore=entrypoints/openai/test_sleep.py \
     --ignore=entrypoints/openai/test_models.py \
     --ignore=entrypoints/openai/test_lora_adapters.py \
     --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
@@ -155,9 +166,16 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
     --ignore=entrypoints/llm/test_chat.py \
     --ignore=entrypoints/llm/test_accuracy.py \
     --ignore=entrypoints/llm/test_init.py \
+    --ignore=entrypoints/llm/test_generate_multiple_loras.py \
     --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 
+#Obsolete currently
+##ignore certain Entrypoints/llm tests
+#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+#    commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+#fi
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -168,28 +186,19 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 
-# Test that we're launching on the machine that has
-# proper access to GPUs
-render_gid=$(getent group render | cut -d: -f3)
-if [[ -z "$render_gid" ]]; then
-    echo "Error: 'render' group not found. This is required for GPU access." >&2
-    exit 1
-fi
 
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
     # assign job count as the number of shards used
-    commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
+    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
     for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
         # assign shard-id for each shard
-        commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
+        commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
         echo "Shard ${GPU} commands:$commands_gpu"
         echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
         docker run \
             --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
             --network=host \
             --shm-size=16gb \
-            --group-add "$render_gid" \
             --rm \
             -e HIP_VISIBLE_DEVICES="${GPU}" \
             -e HF_TOKEN \
@@ -221,8 +230,8 @@ else
         --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
         --network=host \
         --shm-size=16gb \
-        --group-add "$render_gid" \
         --rm \
+        -e HIP_VISIBLE_DEVICES=0 \
         -e HF_TOKEN \
         -e AWS_ACCESS_KEY_ID \
         -e AWS_SECRET_ACCESS_KEY \
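The newer side of this hunk switches to sed so the shard flags are rewritten even with stray blanks around the equals sign. A small self-contained sketch of that substitution (the sample command string is invented for illustration):

    #!/bin/bash
    PARALLEL_JOB_COUNT=8
    GPU=3
    commands='pytest -v -s models --shard-id= --num-shards= '
    # Replace "--num-shards=<digits>" with the real job count,
    # allowing optional blanks around the equals sign.
    commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g")
    # Same idea for the per-GPU shard id.
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g")
    echo "$commands_gpu"
    # -> pytest -v -s models --shard-id=3  --num-shards=8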
@@ -1,68 +0,0 @@
-#!/bin/bash
-
-# This script build the CPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# allow to bind to different cores
-CORE_RANGE=${CORE_RANGE:-0-16}
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
-
-export CMAKE_BUILD_PARALLEL_LEVEL=16
-
-# Setup cleanup
-remove_docker_container() {
-    set -e;
-    docker rm -f cpu-test || true;
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
-
-# Run the image
-docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
-
-function cpu_tests() {
-    set -e
-
-    docker exec cpu-test bash -c "
-        set -e
-        pip list"
-
-    # offline inference
-    docker exec cpu-test bash -c "
-        set -e
-        python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-    # Run model tests
-    docker exec cpu-test bash -c "
-        set -e
-        pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
-
-    # Run kernel tests
-    docker exec cpu-test bash -c "
-        set -e
-        pytest -x -v -s tests/kernels/test_onednn.py
-        pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-        pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
-
-    # basic online serving
-    docker exec cpu-test bash -c '
-        set -e
-        VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
-        server_pid=$!
-        timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-        vllm bench serve \
-            --backend vllm \
-            --dataset-name random \
-            --model Qwen/Qwen3-0.6B \
-            --num-prompts 20 \
-            --endpoint /v1/completions
-        kill -s SIGTERM $server_pid &'
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-export -f cpu_tests
-timeout 2h bash -c cpu_tests
@@ -25,30 +25,25 @@ function cpu_tests() {
 
     # offline inference
     podman exec -it "$container_id" bash -c "
-        export TORCH_COMPILE_DISABLE=1
-        set -xve
-        python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
+        set -e
+        python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
     # Run basic model test
     podman exec -it "$container_id" bash -c "
-        export TORCH_COMPILE_DISABLE=1
-        set -evx
+        set -e
         pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-        pip install sentence-transformers datamodel_code_generator tblib
-        # Note: disable Bart until supports V1
-        # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-        pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
-        pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
-        pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
+        pip install sentence-transformers datamodel_code_generator
+        pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+        pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
+        pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
+        pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
         pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-        # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
-        # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
+        pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 
 export container_id
 export -f cpu_tests
-timeout 120m bash -c cpu_tests
+timeout 40m bash -c cpu_tests
@@ -6,7 +6,6 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-# used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
@@ -21,12 +20,12 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
     set -e
@@ -46,75 +45,58 @@ function cpu_tests() {
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
-    # Run kernel tests
-    docker exec cpu-test-"$NUMA_NODE" bash -c "
-        set -e
-        pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-        pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-        pytest -x -v -s tests/kernels/test_onednn.py"
-
     # Run basic model test
     docker exec cpu-test-"$NUMA_NODE" bash -c "
         set -e
-        # Note: disable until supports V1
-        # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
-        # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-        pytest -x -v -s tests/models/language/generation -m cpu_model
-        VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
-        pytest -x -v -s tests/models/language/pooling -m cpu_model
-        pytest -x -v -s tests/models/multimodal/generation \
+        pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+        pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+        pytest -v -s tests/models/language/generation -m cpu_model
+        VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+        pytest -v -s tests/models/language/pooling -m cpu_model
+        pytest -v -s tests/models/multimodal/generation \
+        --ignore=tests/models/multimodal/generation/test_mllama.py \
         --ignore=tests/models/multimodal/generation/test_pixtral.py \
         -m cpu_model"
 
     # Run compressed-tensor test
     docker exec cpu-test-"$NUMA_NODE" bash -c "
         set -e
-        pytest -x -s -v \
-        tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
+        pytest -s -v \
+        tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+        tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
-    # Run AWQ/GPTQ test
+    # Run AWQ test
     docker exec cpu-test-"$NUMA_NODE" bash -c "
         set -e
-        pytest -x -s -v \
-        tests/quantization/test_cpu_wna16.py"
+        VLLM_USE_V1=0 pytest -s -v \
+        tests/quantization/test_ipex_quant.py"
 
+    # Run chunked-prefill and prefix-cache test
+    docker exec cpu-test-"$NUMA_NODE" bash -c "
+        set -e
+        pytest -s -v -k cpu_model \
+        tests/basic_correctness/test_chunked_prefill.py"
+
+    # online serving
+    docker exec cpu-test-"$NUMA_NODE" bash -c "
+        set -e
+        python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
+        timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+        VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
+        --backend vllm \
+        --dataset-name random \
+        --model facebook/opt-125m \
+        --num-prompts 20 \
+        --endpoint /v1/completions \
+        --tokenizer facebook/opt-125m"
+
     # Run multi-lora tests
     docker exec cpu-test-"$NUMA_NODE" bash -c "
         set -e
-        pytest -x -s -v \
+        pytest -s -v \
         tests/lora/test_qwen2vl.py"
-
-    # online serving: tp+pp
-    docker exec cpu-test-"$NUMA_NODE" bash -c '
-        set -e
-        VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-        server_pid=$!
-        timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-        vllm bench serve \
-            --backend vllm \
-            --dataset-name random \
-            --model meta-llama/Llama-3.2-3B-Instruct \
-            --num-prompts 20 \
-            --endpoint /v1/completions
-        kill -s SIGTERM $server_pid &'
-
-    # online serving: tp+dp
-    docker exec cpu-test-"$NUMA_NODE" bash -c '
-        set -e
-        VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-        server_pid=$!
-        timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-        vllm bench serve \
-            --backend vllm \
-            --dataset-name random \
-            --model meta-llama/Llama-3.2-3B-Instruct \
-            --num-prompts 20 \
-            --endpoint /v1/completions
-        kill -s SIGTERM $server_pid &'
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@@ -16,7 +16,8 @@ DOCKER_BUILDKIT=1 docker build . \
     --build-arg max_jobs=66 \
     --build-arg nvcc_threads=2 \
     --build-arg RUN_WHEEL_CHECK=false \
-    --build-arg torch_cuda_arch_list="9.0+PTX"
+    --build-arg torch_cuda_arch_list="9.0+PTX" \
+    --build-arg vllm_fa_cmake_gpu_arches="90-real"
 
 # Setup cleanup
 remove_docker_container() { docker rm -f gh200-test || true; }
@@ -6,17 +6,19 @@ set -exuo pipefail
 
 # Try building the docker image
 cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
-FROM gaudi-base-image:latest
+FROM 1.22-413-pt2.7.1:latest
 
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
+RUN pip install -v -r requirements/hpu.txt
+RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
+
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
 
-RUN VLLM_TARGET_DEVICE=empty pip install .
-RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
 
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
64 .buildkite/scripts/hardware_ci/run-neuron-test.sh Normal file
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script build the Neuron docker image and run the API server inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
+
+# Try building the docker image
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
+
+# prune old image and containers to save disk space, and only once a day
+# by using a timestamp file in tmp.
+if [ -f /tmp/neuron-docker-build-timestamp ]; then
+    last_build=$(cat /tmp/neuron-docker-build-timestamp)
+    current_time=$(date +%s)
+    if [ $((current_time - last_build)) -gt 86400 ]; then
+        # Remove dangling images (those that are not tagged and not used by any container)
+        docker image prune -f
+        # Remove unused volumes / force the system prune for old images as well.
+        docker volume prune -f && docker system prune -f
+        echo "$current_time" > /tmp/neuron-docker-build-timestamp
+    fi
+else
+    date "+%s" > /tmp/neuron-docker-build-timestamp
+fi
+
+docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
+trap remove_docker_container EXIT
+
+# Run the image
+docker run --rm -it --device=/dev/neuron0 --network bridge \
+    -v "${HF_CACHE}:${HF_MOUNT}" \
+    -e "HF_HOME=${HF_MOUNT}" \
+    -e "HF_TOKEN=${HF_TOKEN}" \
+    -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+    -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+    --name "${container_name}" \
+    ${image_name} \
+    /bin/bash -c "
+        set -e; # Exit on first error
+        python3 /workspace/vllm/examples/offline_inference/neuron.py;
+        python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
+        for f in /workspace/vllm/tests/neuron/2_core/*.py; do
+            echo \"Running test file: \$f\";
+            python3 -m pytest \$f -v --capture=tee-sys;
+        done
+    "
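The once-a-day prune in the new Neuron script is a simple timestamp-file gate. A standalone sketch of the same pattern, with a made-up marker path and guarded action:

    #!/bin/bash
    MARKER=/tmp/daily-cleanup-timestamp   # hypothetical marker file
    now=$(date +%s)
    if [ -f "$MARKER" ] && [ $((now - $(cat "$MARKER"))) -le 86400 ]; then
        echo "Cleanup already ran within the last 24h; skipping."
    else
        docker image prune -f          # the guarded action
        echo "$now" > "$MARKER"        # record when it last ran
    fi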
@@ -1,192 +0,0 @@
-#!/bin/bash
-
-# This script build the Ascend NPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Base ubuntu image with basic ascend development libraries and python installed
-VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
-CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
-TEST_RUN_CONFIG_FILE="vllm_test.cfg"
-VLLM_ASCEND_TMP_DIR=
-# Get the test run configuration file from the vllm-ascend repository
-fetch_vllm_test_cfg() {
-    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
-    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
-    cleanup() {
-        rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    }
-    trap cleanup EXIT
-
-    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
-    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
-        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-
-    # If the file already exists locally, just overwrite it
-    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
-    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
-
-    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
-    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
-    rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    trap - EXIT
-}
-
-# Downloads test run configuration file from a remote URL.
-# Loads the configuration into the current script environment.
-get_config() {
-    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
-        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-    source "${TEST_RUN_CONFIG_FILE}"
-    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
-    return 0
-}
-
-# get test running configuration.
-fetch_vllm_test_cfg
-get_config
-# Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
-    exit 1
-fi
-
-image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
-container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
-agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
-echo "agent_idx: ${agent_idx}"
-builder_name="cachebuilder${agent_idx}"
-builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
-
-# Try building the docker image
-cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-    --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
-FROM ${BASE_IMAGE_NAME}
-
-# Define environments
-ENV DEBIAN_FRONTEND=noninteractive
-ENV SOC_VERSION="ascend910b1"
-
-RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
-    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-    apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install for pytest to make the docker build cache layer always valid
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install pytest>=6.0 modelscope
-
-WORKDIR /workspace/vllm
-
-# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements/common.txt
-
-COPY . .
-
-# Install vLLM
-RUN --mount=type=cache,target=/root/.cache/pip \
-    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
-    python3 -m pip uninstall -y triton
-
-# Install vllm-ascend
-WORKDIR /workspace
-ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
-ARG VLLM_ASCEND_TAG=main
-RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
-    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
-
-# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r /workspace/vllm-ascend/requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
-    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-ENV VLLM_USE_MODELSCOPE=True
-
-WORKDIR /workspace/vllm-ascend
-
-CMD ["/bin/bash"]
-
-EOF
-
-# Setup cleanup
-remove_docker_container() {
-    docker rm -f "${container_name}" || true;
-    docker image rm -f "${image_name}" || true;
-    docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
-# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
-# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-# returns --device /dev/davinci0 --device /dev/davinci1
-parse_and_gen_devices() {
-    local input="$1"
-    local index cards_num
-    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
-        index="${BASH_REMATCH[1]}"
-        cards_num="${BASH_REMATCH[2]}"
-    else
-        echo "parse error" >&2
-        return 1
-    fi
-
-    local devices=""
-    local i=0
-    while (( i < cards_num )); do
-        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
-        ((i++))
-    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
-}
-
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
-
-# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
-# This test checks whether the OOT platform interface is functioning properly in conjunction with
-# the hardware plugin vllm-ascend.
-model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
-docker run \
-    ${devices} \
-    --device /dev/davinci_manager \
-    --device /dev/devmm_svm \
-    --device /dev/hisi_hdc \
-    -v /usr/local/dcmi:/usr/local/dcmi \
-    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
-    --entrypoint="" \
-    --name "${container_name}" \
-    "${image_name}" \
-    bash -c '
-        set -e
-        pytest -v -s tests/e2e/vllm_interface/
-    '
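The deleted `parse_and_gen_devices` helper extracts the agent index and card count from the agent name with a bash regex and `BASH_REMATCH`. A minimal sketch of the extraction step alone (the sample agent name is invented):

    #!/bin/bash
    input="atlas-a2-001-1-2cards"   # hypothetical BUILDKITE_AGENT_NAME
    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
        index="${BASH_REMATCH[1]}"      # -> 1 (agent index, 1-based)
        cards_num="${BASH_REMATCH[2]}"  # -> 2 (NPU cards on this agent)
        first=$(((index - 1) * cards_num))
        last=$((first + cards_num - 1))
        echo "agent ${index} drives /dev/davinci${first} .. /dev/davinci${last}"
    fi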
@@ -1,166 +0,0 @@
-#!/bin/bash
-
-set -xu
-
-remove_docker_container() {
-    docker rm -f tpu-test || true;
-}
-
-trap remove_docker_container EXIT
-
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-cleanup_docker() {
-    # Get Docker's root directory
-    docker_root=$(docker info -f '{{.DockerRootDir}}')
-    if [ -z "$docker_root" ]; then
-        echo "Failed to determine Docker root directory."
-        exit 1
-    fi
-    echo "Docker root directory: $docker_root"
-    # Check disk usage of the filesystem where Docker's root directory is located
-    disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-    # Define the threshold
-    threshold=70
-    if [ "$disk_usage" -gt "$threshold" ]; then
-        echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-        # Remove dangling images (those that are not tagged and not used by any container)
-        docker image prune -f
-        # Remove unused volumes / force the system prune for old images as well.
-        docker volume prune -f && docker system prune --force --filter "until=72h" --all
-        echo "Docker images and volumes cleanup completed."
-    else
-        echo "Disk usage is below $threshold%. No cleanup needed."
-    fi
-}
-cleanup_docker
-
-# For HF_TOKEN.
-source /etc/environment
-
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c '
-    set -e # Exit immediately if a command exits with a non-zero status.
-    set -u # Treat unset variables as an error.
-
-    echo "--- Starting script inside Docker container ---"
-
-    # Create results directory
-    RESULTS_DIR=$(mktemp -d)
-    # If mktemp fails, set -e will cause the script to exit.
-    echo "Results will be stored in: $RESULTS_DIR"
-
-    # Install dependencies
-    echo "--- Installing Python dependencies ---"
-    python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
-        && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-        && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-        && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
-    echo "--- Python dependencies installed ---"
-
-    export VLLM_XLA_CHECK_RECOMPILATION=1
-    export VLLM_XLA_CACHE_PATH=
-
-    echo "--- Hardware Information ---"
-    # tpu-info
-    echo "--- Starting Tests ---"
-    set +e
-    overall_script_exit_code=0
-
-    # --- Test Definitions ---
-    # If a test fails, this function will print logs and will not cause the main script to exit.
-    run_test() {
-        local test_num=$1
-        local test_name=$2
-        local test_command=$3
-        local log_file="$RESULTS_DIR/test_${test_num}.log"
-        local actual_exit_code
-
-        echo "--- TEST_$test_num: Running $test_name ---"
-
-        # Execute the test command.
-        eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
-        actual_exit_code=$?
-
-        echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
-        echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
-
-        if [ "$actual_exit_code" -ne 0 ]; then
-            echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
-            echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
-            if [ -f "$log_file" ]; then
-                cat "$log_file" >&2
-            else
-                echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
-            fi
-            echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
-            return "$actual_exit_code" # Return the failure code
-        else
-            echo "TEST_$test_num ($test_name) PASSED."
-            return 0 # Return success
-        fi
-    }
-
-    # Helper function to call run_test and update the overall script exit code
-    run_and_track_test() {
-        local test_num_arg="$1"
-        local test_name_arg="$2"
-        local test_command_arg="$3"
-
-        # Run the test
-        run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
-        local test_specific_exit_code=$?
-
-        # If the test failed, set the overall script exit code to 1
-        if [ "$test_specific_exit_code" -ne 0 ]; then
-            # No need for extra echo here, run_test already logged the failure.
-            overall_script_exit_code=1
-        fi
-    }
-
-    # --- Actual Test Execution ---
-    run_and_track_test 1 "test_struct_output_generate.py" \
-        "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-    run_and_track_test 2 "test_moe_pallas.py" \
-        "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-    run_and_track_test 3 "test_lora.py" \
-        "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-    run_and_track_test 4 "test_tpu_qkv_linear.py" \
-        "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-    run_and_track_test 5 "test_spmd_model_weight_loading.py" \
-        "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-    run_and_track_test 6 "test_kv_cache_update_kernel.py" \
-        "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
-    run_and_track_test 7 "test_tpu_int8.py" \
-        "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
-
-    # After all tests have been attempted, exit with the overall status.
-    if [ "$overall_script_exit_code" -ne 0 ]; then
-        echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
-    else
-        echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
-    fi
-    exit "$overall_script_exit_code"
-' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
-
-# Capture the exit code of the docker run command
-DOCKER_RUN_EXIT_CODE=$?
-
-# The trap will run for cleanup.
-# Exit the main script with the Docker run command's exit code.
-if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
-    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
-    exit "$DOCKER_RUN_EXIT_CODE"
-else
-    echo "Docker run command completed successfully."
-    exit 0
-fi
-# TODO: This test fails because it uses RANDOM_SEED sampling
-# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
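The deleted `run_test` helper duplicates a command's stdout and stderr into a per-test log while still capturing the command's own exit code, by using tee through process substitution instead of a pipeline. A stripped-down sketch of that pattern (the command and log path are illustrative):

    #!/bin/bash
    log_file=/tmp/example_test.log
    # tee via process substitution keeps the command's exit status in $?;
    # a plain "cmd | tee" pipeline would report tee's status instead.
    eval "false" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
    status=$?
    echo "exit code: $status"   # -> exit code: 1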
@@ -5,6 +5,7 @@ set -xu
 remove_docker_container() {
     docker rm -f tpu-test || true;
+    docker rm -f vllm-tpu || true;
 }
 
 trap remove_docker_container EXIT
@@ -61,15 +62,15 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
 echo "--- Python dependencies installed ---"
+export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
-# tpu-info
+tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
@@ -148,6 +149,18 @@ run_and_track_test 9 "test_multimodal.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 run_and_track_test 10 "test_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
+run_and_track_test 11 "test_struct_output_generate.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 12 "test_moe_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 13 "test_lora.py" \
+    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 16 "test_kv_cache_update_kernel.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
@@ -20,33 +20,13 @@ trap remove_docker_container EXIT
 
 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri:/dev/dri \
-    --net=host \
-    --ipc=host \
-    --privileged \
+    --device /dev/dri \
     -v /dev/dri/by-path:/dev/dri/by-path \
     --entrypoint="" \
-    -e "HF_TOKEN=${HF_TOKEN}" \
-    -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
     --name "${container_name}" \
     "${image_name}" \
-    bash -c '
-    set -e
-    echo $ZE_AFFINITY_MASK
-    pip install tblib==3.1.0
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    cd tests
-    pytest -v -s v1/core
-    pytest -v -s v1/engine
-    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
-    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
-    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
-    pytest -v -s v1/test_serial_utils.py
-    '
+    sh -c '
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    '
@@ -11,20 +11,20 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
 # run python-based benchmarks and upload the result to buildkite
-vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-vllm bench serve \
+python3 benchmarks/benchmark_serving.py \
     --backend vllm \
     --dataset-name sharegpt \
     --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
@@ -1,64 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Setup script for Prime-RL integration tests
-# This script prepares the environment for running Prime-RL tests with nightly vLLM
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
-PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
-PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
-
-if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
-    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
-    exit 0
-fi
-
-echo "Setting up Prime-RL integration test environment..."
-
-# Clean up any existing Prime-RL directory
-if [ -d "${PRIME_RL_DIR}" ]; then
-    echo "Removing existing Prime-RL directory..."
-    rm -rf "${PRIME_RL_DIR}"
-fi
-
-# Install UV if not available
-if ! command -v uv &> /dev/null; then
-    echo "Installing UV package manager..."
-    curl -LsSf https://astral.sh/uv/install.sh | sh
-    source $HOME/.local/bin/env
-fi
-
-# Clone Prime-RL repository at specific branch for reproducible tests
-PRIME_RL_BRANCH="integ-vllm-main"
-echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
-git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
-cd "${PRIME_RL_DIR}"
-
-echo "Setting up UV project environment..."
-export UV_PROJECT_ENVIRONMENT=/usr/local
-ln -s /usr/bin/python3 /usr/local/bin/python
-
-# Remove vllm pin from pyproject.toml
-echo "Removing vllm pin from pyproject.toml..."
-sed -i '/vllm==/d' pyproject.toml
-
-# Sync Prime-RL dependencies
-echo "Installing Prime-RL dependencies..."
-uv sync --inexact && uv sync --inexact --all-extras
-
-# Verify installation
-echo "Verifying installations..."
-uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
-uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
-
-echo "Prime-RL integration test environment setup complete!"
-
-echo "Running Prime-RL integration tests..."
-export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
-uv run pytest -vs tests/integration/test_rl.py -m gpu
-
-echo "Prime-RL integration tests completed!"
@@ -1,72 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.25}
-NUM_Q=${2:-1319}
-PORT=${3:-8010}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
-    local port=$1
-    timeout 600 bash -c '
-        until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
-            sleep 1
-        done'
-}
-
-MODEL="deepseek-ai/DeepSeek-V2-lite"
-
-# Set BACKENDS based on platform
-if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
-    # ROCm platform
-    BACKENDS=("allgather_reducescatter")
-    # Disable MOE padding for ROCm since it is causing eplb to fail
-    export VLLM_ROCM_MOE_PADDING=0
-else
-    # Non-ROCm platform (CUDA/other)
-    BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-fi
-
-cleanup() {
-    if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
-        kill "${SERVER_PID}" 2>/dev/null || true
-        for _ in {1..20}; do
-            kill -0 "${SERVER_PID}" 2>/dev/null || break
-            sleep 0.5
-        done
-        kill -9 "${SERVER_PID}" 2>/dev/null || true
-    fi
-}
-trap cleanup EXIT
-
-for BACK in "${BACKENDS[@]}"; do
-    VLLM_DEEP_GEMM_WARMUP=skip \
-    VLLM_ALL2ALL_BACKEND=$BACK \
-    vllm serve "$MODEL" \
-        --enforce-eager \
-        --tensor-parallel-size 2 \
-        --data-parallel-size 2 \
-        --enable-expert-parallel \
-        --enable-eplb \
-        --trust-remote-code \
-        --max-model-len 2048 \
-        --port $PORT &
-    SERVER_PID=$!
-    wait_for_server $PORT
-
-    TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
-    OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-    python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
-    python3 - <<PY
-import json; acc=json.load(open('${OUT}'))['accuracy']
-print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
-assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
-PY
-
-    cleanup
-    SERVER_PID=
-    sleep 1
-    PORT=$((PORT+1))
-done
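All three deleted eval scripts share the same `wait_for_server` polling idiom: `timeout` bounds the overall wait while `curl -sf` probes the health endpoint once per second. A standalone sketch of the pattern (port and endpoint are illustrative):

    #!/bin/bash
    wait_for_server() {
        local port=$1
        # Abort after 600s; until then, curl -sf exits non-zero
        # until the endpoint answers with a 2xx response.
        timeout 600 bash -c "until curl -sf http://127.0.0.1:${port}/health > /dev/null; do sleep 1; done"
    }
    wait_for_server 8010 && echo "server on :8010 is up"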
@@ -1,74 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
-THRESHOLD=${1:-0.8}
-NUM_Q=${2:-1319}
-PORT=${3:-8020}
-DATA_PARALLEL_SIZE=${4:-2}
-TENSOR_PARALLEL_SIZE=${5:-2}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
-    local port=$1
-    timeout 600 bash -c '
-        until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
-            sleep 1
-        done'
-}
-
-MODEL="QWen/Qwen3-30B-A3B-FP8"
-# Set BACKENDS based on platform
-if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
-    # ROCm platform
-    BACKENDS=("allgather_reducescatter")
-    # Disable MOE padding for ROCm since it is causing eplb to fail
-    export VLLM_ROCM_MOE_PADDING=0
-else
-    # Non-ROCm platform (CUDA/other)
-    BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-fi
-
-cleanup() {
-    if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
-        kill "${SERVER_PID}" 2>/dev/null || true
-        for _ in {1..20}; do
-            kill -0 "${SERVER_PID}" 2>/dev/null || break
-            sleep 0.5
-        done
-        kill -9 "${SERVER_PID}" 2>/dev/null || true
-    fi
-}
-trap cleanup EXIT
-
-for BACK in "${BACKENDS[@]}"; do
-    VLLM_DEEP_GEMM_WARMUP=skip \
-    vllm serve "$MODEL" \
-        --enforce-eager \
-        --enable-eplb \
-        --all2all-backend $BACK \
-        --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
-        --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
-        --data-parallel-size ${DATA_PARALLEL_SIZE} \
-        --enable-expert-parallel \
-        --trust-remote-code \
-        --max-model-len 2048 \
-        --port $PORT &
-    SERVER_PID=$!
-    wait_for_server $PORT
-
-    TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
-    OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-    python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
-    python3 - <<PY
-import json; acc=json.load(open('${OUT}'))['accuracy']
-print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
-assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
-PY
-
-    cleanup
-    SERVER_PID=
-    sleep 1
-    PORT=$((PORT+1))
-done
@@ -1,74 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.25}
-NUM_Q=${2:-1319}
-PORT=${3:-8040}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
-    local port=$1
-    timeout 600 bash -c '
-        until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
-            sleep 1
-        done'
-}
-
-MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
-
-# Set BACKENDS based on platform
-if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
-    # ROCm platform
-    BACKENDS=("allgather_reducescatter")
-    # Disable MOE padding for ROCm since it is causing eplb to fail
-    export VLLM_ROCM_MOE_PADDING=0
-else
-    # Non-ROCm platform (CUDA/other)
-    BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-fi
-
-cleanup() {
-    if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
-        kill "${SERVER_PID}" 2>/dev/null || true
-        for _ in {1..20}; do
-            kill -0 "${SERVER_PID}" 2>/dev/null || break
-            sleep 0.5
-        done
-        kill -9 "${SERVER_PID}" 2>/dev/null || true
-    fi
-}
-trap cleanup EXIT
-
-for BACK in "${BACKENDS[@]}"; do
-    VLLM_DEEP_GEMM_WARMUP=skip \
-    vllm serve "$MODEL" \
-        --enforce-eager \
-        --tensor-parallel-size 4 \
-        --enable-expert-parallel \
-        --enable-eplb \
-        --all2all-backend $BACK \
-        --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
-        --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
-        --trust-remote-code \
-        --max-model-len 2048 \
-        --gpu-memory-utilization 0.9 \
-        --port $PORT &
-    SERVER_PID=$!
-    wait_for_server $PORT
-
-    TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
-    OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-    python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
-    python3 - <<PY
-import json; acc=json.load(open('${OUT}'))['accuracy']
-print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
-assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
-PY
-
-    cleanup
-    SERVER_PID=
-    sleep 1
-    PORT=$((PORT+1))
-done
@@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
   # Remove dangling images (those that are not tagged and not used by any container)
   docker image prune -f
   # Remove unused volumes / force the system prune for old images as well.
-  docker volume prune -f && docker system prune --force --filter "until=24h" --all
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
   echo "Docker images and volumes cleanup completed."
 else
   echo "Disk usage is below $threshold%. No cleanup needed."
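For context, this hunk sits inside a disk-usage gate that the diff does not show in full. A minimal sketch of such a gate, where the 70% threshold and the `df /` invocation are assumptions rather than repo code:

# Sketch of the disk-usage gate implied by the hunk above; the 70%
# threshold and the `df /` target are assumptions, not from the diff.
threshold=70
disk_usage=$(df / --output=pcent | tail -1 | tr -dc '0-9')
if [ "$disk_usage" -gt "$threshold" ]; then
  docker image prune -f
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi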
@@ -1,6 +1,6 @@
 # Environment config
 TEST_NAME=llama8b
-CONTAINER_NAME=tpu-test
+CONTAINER_NAME=vllm-tpu
 
 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
@@ -12,6 +12,8 @@ source /etc/environment
 source $ENV_FILE
 
 remove_docker_container() {
+  docker rm -f tpu-test || true;
+  docker rm -f vllm-tpu || true;
   docker rm -f $CONTAINER_NAME || true;
 }
 
@@ -20,6 +22,16 @@ trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 
+# Build docker image.
+# TODO: build the image outside the script and share the image with other
+# tpu test if building time is too long.
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg max_jobs=16 \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg GIT_REPO_CHECK=0 \
+  --tag vllm/vllm-tpu-bm \
+  --progress plain -f docker/Dockerfile.tpu .
+
 LOG_ROOT=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $LOG_ROOT"
@@ -1,6 +1,6 @@
 # Environment config
 TEST_NAME=llama8bw8a8
-CONTAINER_NAME=tpu-test
+CONTAINER_NAME=vllm-tpu
 
 # vllm config
 MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=8.7
+EXPECTED_THROUGHPUT=10.0
 INPUT_LEN=1800
 OUTPUT_LEN=128
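EXPECTED_THROUGHPUT in these env files is consumed by the benchmark wrapper as a pass/fail floor. A minimal sketch of that comparison with a stand-in measured value (the wrapper's actual log parsing is not shown in this diff):

# Sketch of the EXPECTED_THROUGHPUT floor check; `actual` stands in for
# the value the wrapper would parse out of the benchmark log.
EXPECTED_THROUGHPUT=10.0
actual=10.4
if ! python3 -c "exit(0 if ${actual} >= ${EXPECTED_THROUGHPUT} else 1)"; then
  echo "throughput ${actual} below expected ${EXPECTED_THROUGHPUT}" >&2
  exit 1
fi
echo "throughput OK: ${actual} >= ${EXPECTED_THROUGHPUT}"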
@@ -42,8 +42,9 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
 
-vllm serve $MODEL \
+VLLM_USE_V1=1 vllm serve $MODEL \
   --seed 42 \
+  --disable-log-requests \
   --max-num-seqs $MAX_NUM_SEQS \
   --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
   --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
@@ -76,7 +77,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-vllm bench serve \
+python benchmarks/benchmark_serving.py \
   --backend vllm \
   --model $MODEL \
   --dataset-name sonnet \
@@ -2,28 +2,6 @@
 
 set -ex
 
-# ======== part 0: setup ========
-
-BUCKET="vllm-wheels"
-INDICES_OUTPUT_DIR="indices"
-DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
-PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
-SUBPATH=$BUILDKITE_COMMIT
-S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
-
-# detect if python3.10+ is available
-has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
-if [[ "$has_new_python" -eq 0 ]]; then
-    # use new python from docker
-    docker pull python:3-slim
-    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
-fi
-
-echo "Using python interpreter: $PYTHON"
-echo "Python version: $($PYTHON --version)"
-
-# ========= part 1: collect, rename & upload the wheel ==========
-
 # Assume wheels are in artifacts/dist/*.whl
 wheel_files=(artifacts/dist/*.whl)
 
@@ -32,76 +10,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
   echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
   exit 1
 fi
 
+# Get the single wheel file
 wheel="${wheel_files[0]}"
 
-# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
-# we also accept params as manylinux tag
-# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="${1:-manylinux_2_31}"
-
-# Rename 'linux' to the appropriate manylinux version in the wheel filename
-if [[ "$wheel" != *"linux"* ]]; then
-    echo "Error: Wheel filename does not contain 'linux': $wheel"
-    exit 1
-fi
-new_wheel="${wheel/linux/$manylinux_version}"
+# Rename 'linux' to 'manylinux1' in the wheel filename
+new_wheel="${wheel/linux/manylinux1}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
-echo "Renamed wheel to: $wheel"
 
 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version in wheel: $version"
-pure_version="${version%%+*}"
-echo "Pure version (without variant): $pure_version"
+echo "Version: $version"
 
-# copy wheel to its own bucket
-aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+normal_wheel="$wheel" # Save the original wheel filename
 
-# ========= part 2: generate and upload indices ==========
-# generate indices for all existing wheels in the commit directory
-# this script might be run multiple times if there are multiple variants being built
-# so we need to guarantee there is little chance for "TOCTOU" issues
-# i.e., one process is generating indices while another is uploading a new wheel
-# so we need to ensure no time-consuming operations happen below
+# If the version contains "dev", rename it to v1.0.0.dev for consistency
+if [[ $version == *dev* ]]; then
+    suffix="${version##*.}"
+    if [[ $suffix == cu* ]]; then
+        new_version="1.0.0.dev+${suffix}"
+    else
+        new_version="1.0.0.dev"
+    fi
+    new_wheel="${wheel/$version/$new_version}"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
+    wheel="$new_wheel"
+    version="$new_version"
+fi
 
-# list all wheels in the commit directory
-echo "Existing wheels on S3:"
-aws s3 ls "$S3_COMMIT_PREFIX"
-obj_json="objects.json"
-aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
-mkdir -p "$INDICES_OUTPUT_DIR"
+# Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 
-# call script to generate indicies for all existing wheels
-# this indices have relative paths that could work as long as it is next to the wheel directory in s3
-# i.e., the wheels are always in s3://vllm-wheels/<commit>/
-# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
-if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
-    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+# generate index for this commit
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
 else
-    alias_arg=""
+    # only upload index.html for cu128 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
 
-# HACK: we do not need regex module here, but it is required by pre-commit hook
-# To avoid any external dependency, we simply replace it back to the stdlib re module
-sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
+# generate index for nightly
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 
-# copy indices to /<commit>/ unconditionally
-echo "Uploading indices to $S3_COMMIT_PREFIX"
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
-
-# copy to /nightly/ only if it is on the main branch and not a PR
-if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-    echo "Uploading indices to overwrite /nightly/"
-    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
+else
+    # only upload index.html for cu128 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
 
-# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
-if [[ "$version" != *"dev"* ]]; then
-    echo "Re-generating indices for /$pure_version/"
-    rm -rf "$INDICES_OUTPUT_DIR/*"
-    mkdir -p "$INDICES_OUTPUT_DIR"
-    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
-    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
-fi
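The dev-version rewrite on the new side relies on one parameter expansion plus a prefix test; a standalone sketch with a made-up version string:

# Standalone sketch of the rename above; the version string is made up.
version="0.9.2.dev1+g1234abcd.cu118"
suffix="${version##*.}"            # text after the last '.', here "cu118"
if [[ $suffix == cu* ]]; then
  new_version="1.0.0.dev+${suffix}"
else
  new_version="1.0.0.dev"
fi
echo "$version -> $new_version"    # prints: ... -> 1.0.0.dev+cu118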
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,21 +0,0 @@
group: Attention
depends_on:
- image-build
steps:
- label: V1 attention (H100)
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - pytest -v -s v1/attention

- label: V1 attention (B200)
  timeout_in_minutes: 30
  gpu: b200
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
@@ -1,16 +0,0 @@
group: Basic Correctness
depends_on:
- image-build
steps:
- label: Basic Correctness
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
@@ -1,19 +0,0 @@
group: Benchmarks
depends_on:
- image-build
steps:
- label: Benchmarks
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh

- label: Benchmarks CLI Test
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
@@ -1,57 +0,0 @@
group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
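The quoted -k expression above filters parametrized test IDs by substring: '+quant_fp8' and '+rms_norm' name custom-op variants, so 'True and not +quant_fp8 and not +rms_norm' keeps only IDs containing 'True' and neither marker. A throwaway demonstration of the same filtering (the test file is illustrative, not repo code; '+' inside -k identifiers needs a reasonably recent pytest):

# Throwaway demo of the -k expression used above; not repo code.
cat > /tmp/test_k_demo.py <<'PY'
import pytest

@pytest.mark.parametrize("case", ["True-plain", "True-+quant_fp8", "False-+rms_norm"])
def test_case(case):
    assert True
PY
# Selects only test_case[True-plain]; the other IDs are deselected.
pytest -q /tmp/test_k_demo.py -k 'True and not +quant_fp8 and not +rms_norm'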
@@ -1,22 +0,0 @@
group: CUDA
depends_on:
- image-build
steps:
- label: Platform Tests (CUDA)
  timeout_in_minutes: 15
  source_file_dependencies:
  - vllm/
  - tests/cuda
  commands:
  - pytest -v -s cuda/test_cuda_context.py

- label: Cudagraph
  timeout_in_minutes: 20
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  commands:
  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
@@ -1,199 +0,0 @@
group: Distributed
depends_on:
- image-build
steps:
- label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py

- label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/distributed
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=4 and dp=1
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2, pp=2 and dp=1
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=1 and dp=4 with ep
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2 and dp=2 with ep
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py

- label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

- label: Distributed Tests (4 GPUs)(A100)
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Distributed Tests (2 GPUs)(H200)
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
  - pytest -v -s tests/v1/distributed/test_dbo.py

- label: Distributed Tests (2 GPUs)(B200)
  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
  - pytest -v -s tests/v1/distributed/test_dbo.py

- label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
  - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"

- label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh

- label: Pipeline + Context Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
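Several steps above steer a single test file through TP_SIZE/DP_SIZE environment variables instead of CLI flags. A minimal sketch of that pattern with a stand-in test file (not the repo's tests):

# Stand-in for the TP_SIZE/DP_SIZE env-var pattern above; not repo code.
cat > /tmp/test_env_param_demo.py <<'PY'
import os

def test_parallel_config():
    tp = int(os.environ.get("TP_SIZE", "1"))
    dp = int(os.environ.get("DP_SIZE", "1"))
    # A real test would launch tp * dp workers; here we only sanity-check.
    assert tp >= 1 and dp >= 1
PY
TP_SIZE=2 DP_SIZE=2 pytest -q /tmp/test_env_param_demo.py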
@@ -1,42 +0,0 @@
group: E2E Integration
depends_on:
- image-build
steps:
- label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
  gpu: b200
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

- label: Prime-RL Integration (2 GPUs)
  timeout_in_minutes: 30
  optional: true
  soft_fail: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
  - bash .buildkite/scripts/run-prime-rl-test.sh
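The positional arguments in these commands follow the signature of the deleted integration scripts shown earlier; annotated for the B200 step:

# Argument mapping for the B200 step, per the script signature
# [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]:
#   $1 = 0.8   minimum GSM8K accuracy
#   $2 = 200   questions per eval run
#   $3 = 8020  first server port (incremented per backend)
#   $4 = 2     data-parallel size
#   $5 = 1     tensor-parallel size
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1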
@@ -1,26 +0,0 @@
group: Engine
depends_on:
- image-build
steps:
- label: Engine
  timeout_in_minutes: 15
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

- label: V1 e2e + engine
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # TODO: accuracy does not match, whether setting
  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
  - pytest -v -s v1/e2e
  - pytest -v -s v1/engine
@@ -1,83 +0,0 @@
group: Entrypoints
depends_on:
- image-build
steps:
- label: Entrypoints Unit Tests
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling

- label: Entrypoints Integration (LLM)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Integration (API Server 1)
  timeout_in_minutes: 130
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

- label: Entrypoints Integration (API Server 2)
  timeout_in_minutes: 130
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  - tests/entrypoints/sleep
  - tests/entrypoints/instrumentator
  - tests/entrypoints/rpc
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
  - pytest -v -s entrypoints/instrumentator
  - pytest -v -s entrypoints/sleep
  - pytest -v -s tool_use

- label: Entrypoints Integration (Pooling)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling

- label: Entrypoints V1
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - pytest -v -s v1/entrypoints

- label: OpenAI API Correctness
  timeout_in_minutes: 30
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/
@@ -1,23 +0,0 @@
group: Expert Parallelism
depends_on:
- image-build
steps:
- label: EPLB Algorithm
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

- label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py
@@ -1,117 +0,0 @@
group: Kernels
depends_on:
- image-build
steps:
- label: Kernels Core Operation Test
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  commands:
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py

- label: Kernels Attention Test %N
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Quantization Test %N
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels MoE Test %N
  timeout_in_minutes: 60
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
  - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Mamba Test
  timeout_in_minutes: 45
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
  - pytest -v -s kernels/mamba

- label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
  gpu: h100
  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization/test_block_fp8.py
  - tests/kernels/moe/test_deepgemm.py
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
  - pytest -v -s kernels/moe/test_deepgemm.py
  - pytest -v -s kernels/moe/test_batched_deepgemm.py
  - pytest -v -s kernels/attention/test_deepgemm_attention.py

- label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
  - vllm/platforms/cuda.py
  - vllm/attention/selector.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
  - pytest -v -s tests/kernels/attention/test_attention_selector.py
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
  - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
  # Quantization
  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
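The %N steps split one pytest selection across Buildkite parallel jobs, which export BUILDKITE_PARALLEL_JOB and BUILDKITE_PARALLEL_JOB_COUNT; the doubled '$$' in the YAML escapes '$' for Buildkite. To reproduce one shard locally (assumes the pytest-shard plugin, which provides --shard-id/--num-shards, and a vLLM tests/ checkout):

# Reproduce one CI shard locally (sketch; requires `pip install pytest-shard`).
# In CI these two variables are injected by Buildkite per parallel job.
export BUILDKITE_PARALLEL_JOB=0        # this shard's index
export BUILDKITE_PARALLEL_JOB_COUNT=2  # total shards, matching `parallelism: 2`
pytest -v -s kernels/quantization \
  --shard-id=$BUILDKITE_PARALLEL_JOB \
  --num-shards=$BUILDKITE_PARALLEL_JOB_COUNT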
@@ -1,46 +0,0 @@
group: LM Eval
depends_on:
- image-build
steps:
- label: LM Eval Small Models
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

- label: LM Eval Large Models (4 GPUs)(A100)
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

- label: LM Eval Large Models (4 GPUs)(H100)
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

- label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
  gpu: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
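--config-list-file points these evals at a plain-text manifest of per-model config files, one per line. A sketch of the shape this takes; the file names are illustrative and the final invocation assumes vLLM's tests/evals layout:

# Sketch of the --config-list-file pattern; file names are illustrative.
mkdir -p /tmp/gsm8k-configs
cat > /tmp/gsm8k-configs/models-small.txt <<'EOF'
Qwen2.5-1.5B-Instruct.yaml
Llama-3.2-1B-Instruct-FP8.yaml
EOF
# From vLLM's tests/ directory, the eval would then be invoked as:
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py \
  --config-list-file=/tmp/gsm8k-configs/models-small.txt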
@@ -1,33 +0,0 @@
group: LoRA
depends_on:
- image-build
steps:
- label: LoRA %N
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4

- label: LoRA TP (Distributed)
  timeout_in_minutes: 30
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # FIXIT: find out which code initializes CUDA before running the test;
  # before the fix, we need to use spawn to test it
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  # A lot of these tests are on the edge of OOMing
  - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
  # There is some Tensor Parallelism related processing logic in LoRA that
  # requires multi-GPU testing for validation.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
  - pytest -v -s -x lora/test_olmoe_tp.py
  - pytest -v -s -x lora/test_gptoss_tp.py
@@ -1,165 +0,0 @@
group: Miscellaneous
depends_on:
- image-build
steps:
- label: V1 Others
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  # split the test to avoid interference
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/worker
  - pytest -v -s v1/spec_decode
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Others (CPU)
  depends_on: ~
  source_file_dependencies:
  - vllm/
  - tests/v1
  no_gpu: true
  commands:
  # split the test to avoid interference
  - pytest -v -s -m 'cpu_test' v1/core
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'cpu_test' v1/metrics

- label: Regression
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Examples
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  commands:
  - pip install tensorizer # for tensorizer test
  - python3 offline_inference/basic/chat.py # for basic
  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
  - python3 offline_inference/basic/classify.py
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  # for multi-modal models
  - python3 offline_inference/audio_language.py --seed 0
  - python3 offline_inference/vision_language.py --seed 0
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  # for pooling models
  - python3 pooling/pooling/vision_language_pooling.py --seed 0
  # for features demo
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py
  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

- label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  - "pip install \
      'opentelemetry-sdk>=1.26.0' \
      'opentelemetry-api>=1.26.0' \
      'opentelemetry-exporter-otlp>=1.26.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s v1/tracing

- label: Python-only Installation
  depends_on: ~
  timeout_in_minutes: 20
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh

- label: Async Engine, Inputs, Utils, Worker
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
  depends_on: ~
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
  - pytest -v -s config

- label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

- label: Batch Invariance (H100)
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
  - vllm/v1/attention
  - vllm/model_executor/layers
  - tests/v1/determinism/
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pip install pytest-timeout pytest-forked
  - pytest -v -s v1/determinism/test_batch_invariance.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
Some files were not shown because too many files have changed in this diff