Compare commits
2 Commits
v0.16.0rc0
...
v0.2.1.pos
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3d40c834f0 | ||
|
|
d0fb047de3 |
@@ -1,53 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import zipfile
|
|
||||||
|
|
||||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB.
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
|
|
||||||
|
|
||||||
|
|
||||||
def print_top_10_largest_files(zip_file):
    """Print the ten largest members of a zip archive, largest first.

    Args:
        zip_file: Anything ``zipfile.ZipFile`` accepts for reading
            (typically the path of a wheel).

    Each line shows the member name and its uncompressed ``file_size``
    divided by 1024 * 1024, formatted with two decimals.
    """
    with zipfile.ZipFile(zip_file, "r") as z:
        # infolist() yields one record per stored member, so entries that
        # share a name keep their own size instead of all resolving to the
        # last one via getinfo(name).
        file_sizes = [(info.filename, info.file_size) for info in z.infolist()]

    # sort() is stable: members with equal sizes keep their archive order.
    file_sizes.sort(key=lambda x: x[1], reverse=True)
    for f, size in file_sizes[:10]:
        print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
|
|
||||||
|
|
||||||
|
|
||||||
def check_wheel_size(directory):
    """Check the size of every ``.whl`` file under *directory*.

    The tree is walked recursively with ``os.walk``. Each wheel's on-disk
    size (bytes / 1024 / 1024) is compared against ``VLLM_MAX_SIZE_MB``.

    Args:
        directory: Root directory to search for wheel files.

    Returns:
        1 as soon as one wheel exceeds the limit (after printing its ten
        largest members; remaining wheels are not inspected), otherwise 0.
        A directory containing no wheels also yields 0.
    """
    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".whl"):
                continue

            wheel_path = os.path.join(root, file_name)
            wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)

            if wheel_size_mb > VLLM_MAX_SIZE_MB:
                print(
                    f"Not allowed: Wheel {wheel_path} is larger "
                    f"({wheel_size_mb:.2f} MB) than the limit "
                    f"({VLLM_MAX_SIZE_MB} MB)."
                )
                # Show what is taking up the space to make the failure actionable.
                print_top_10_largest_files(wheel_path)
                return 1

            print(
                f"Wheel {wheel_path} is within the allowed size "
                f"({wheel_size_mb:.2f} MB)."
            )
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # The directory to scan is the first positional argument.
    if len(sys.argv) < 2:
        print("Usage: python check-wheel-size.py <directory>")
        sys.exit(1)

    directory = sys.argv[1]
    # The process exit status is whatever check_wheel_size returns.
    sys.exit(check_wheel_size(directory))
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
name: vllm_ci
|
|
||||||
job_dirs:
|
|
||||||
- ".buildkite/image_build"
|
|
||||||
- ".buildkite/test_areas"
|
|
||||||
- ".buildkite/hardware_tests"
|
|
||||||
run_all_patterns:
|
|
||||||
- "docker/Dockerfile"
|
|
||||||
- "CMakeLists.txt"
|
|
||||||
- "requirements/common.txt"
|
|
||||||
- "requirements/cuda.txt"
|
|
||||||
- "requirements/build.txt"
|
|
||||||
- "requirements/test.txt"
|
|
||||||
- "setup.py"
|
|
||||||
- "csrc/"
|
|
||||||
- "cmake/"
|
|
||||||
run_all_exclude_patterns:
|
|
||||||
- "docker/Dockerfile."
|
|
||||||
- "csrc/cpu/"
|
|
||||||
- "csrc/rocm/"
|
|
||||||
- "cmake/hipify.py"
|
|
||||||
- "cmake/cpu_extension.cmake"
|
|
||||||
registries: public.ecr.aws/q9t5s3a7
|
|
||||||
repositories:
|
|
||||||
main: "vllm-ci-postmerge-repo"
|
|
||||||
premerge: "vllm-ci-test-repo"
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
group: Hardware
|
|
||||||
steps:
|
|
||||||
- label: "AMD: :docker: build image"
|
|
||||||
depends_on: []
|
|
||||||
device: amd_cpu
|
|
||||||
no_plugin: true
|
|
||||||
commands:
|
|
||||||
- >
|
|
||||||
docker build
|
|
||||||
--build-arg max_jobs=16
|
|
||||||
--build-arg REMOTE_VLLM=1
|
|
||||||
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
|
|
||||||
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
|
||||||
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
|
||||||
-f docker/Dockerfile.rocm
|
|
||||||
--target test
|
|
||||||
--no-cache
|
|
||||||
--progress plain .
|
|
||||||
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 1
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 1
|
|
||||||
- exit_status: 1 # Machine occasionally fails
|
|
||||||
limit: 1
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
group: Hardware
|
|
||||||
steps:
|
|
||||||
- label: "Arm CPU Test"
|
|
||||||
soft_fail: true
|
|
||||||
device: arm_cpu
|
|
||||||
no_plugin: true
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
group: Hardware
|
|
||||||
depends_on: ~
|
|
||||||
steps:
|
|
||||||
- label: "Ascend NPU Test"
|
|
||||||
soft_fail: true
|
|
||||||
timeout_in_minutes: 20
|
|
||||||
no_plugin: true
|
|
||||||
device: ascend_npu
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
group: Hardware
|
|
||||||
steps:
|
|
||||||
- label: "GH200 Test"
|
|
||||||
soft_fail: true
|
|
||||||
device: gh200
|
|
||||||
no_plugin: true
|
|
||||||
optional: true
|
|
||||||
commands:
|
|
||||||
- nvidia-smi
|
|
||||||
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
group: Hardware
|
|
||||||
depends_on: ~
|
|
||||||
steps:
|
|
||||||
- label: "Intel CPU Test"
|
|
||||||
soft_fail: true
|
|
||||||
device: intel_cpu
|
|
||||||
no_plugin: true
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
|
|
||||||
|
|
||||||
- label: "Intel HPU Test"
|
|
||||||
soft_fail: true
|
|
||||||
device: intel_hpu
|
|
||||||
no_plugin: true
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
|
|
||||||
|
|
||||||
- label: "Intel GPU Test"
|
|
||||||
depends_on: []
|
|
||||||
soft_fail: true
|
|
||||||
device: intel_gpu
|
|
||||||
no_plugin: true
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
|
|
||||||
@@ -1,256 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Replace every character outside [a-zA-Z0-9._-] with "_" and truncate the
# result to 128 characters so it can be used as a Docker image tag.
#   $1 - raw text (e.g. a branch name)
# Prints the sanitized tag on stdout.
clean_docker_tag() {
    local input="$1"
    # printf instead of echo: echo would treat inputs such as "-n" or "-e"
    # as options and print nothing.
    printf '%s\n' "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
|
|
||||||
|
|
||||||
# Print the expected command line and terminate the script with status 1.
# The argument names mirror the positional assignments in the main script:
# $7 is the image tag and $8 an optional "latest" tag.
print_usage_and_exit() {
    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <image_tag> [image_tag_latest]" >&2
    exit 1
}
|
|
||||||
|
|
||||||
# Print best-effort debug information about the machine running the build:
# EC2 metadata (when the IMDSv2 endpoint answers) and whether the marker file
# /etc/vllm-ami-info is present. Never fails the build.
print_instance_info() {
    local imds="http://169.254.169.254/latest"
    local token ami_id instance_type instance_id az
    # -f: treat HTTP errors as failures instead of printing the error body.
    # Timeouts keep the script from stalling when the endpoint is unreachable.
    local -a curl_opts=(-sf --connect-timeout 2 --max-time 5)

    echo ""
    echo "=== Debug: Instance Information ==="
    # Get IMDSv2 token
    if token=$(curl "${curl_opts[@]}" -X PUT "${imds}/api/token" \
        -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
        ami_id=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
            "${imds}/meta-data/ami-id" 2>/dev/null || echo "unknown")
        instance_type=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
            "${imds}/meta-data/instance-type" 2>/dev/null || echo "unknown")
        instance_id=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
            "${imds}/meta-data/instance-id" 2>/dev/null || echo "unknown")
        az=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: ${token}" \
            "${imds}/meta-data/placement/availability-zone" 2>/dev/null || echo "unknown")
        echo "AMI ID: ${ami_id}"
        echo "Instance Type: ${instance_type}"
        echo "Instance ID: ${instance_id}"
        echo "AZ: ${az}"
    else
        echo "Not running on EC2 or IMDS not available"
    fi

    # Check for warm cache AMI (marker file baked into custom AMI)
    if [[ -f /etc/vllm-ami-info ]]; then
        echo "Cache: warm (custom vLLM AMI)"
        cat /etc/vllm-ami-info
    else
        echo "Cache: cold (standard AMI)"
    fi
    echo "==================================="
    echo ""
}
|
|
||||||
|
|
||||||
# Select (or create) the docker buildx builder used for the bake build.
# Reads the globals BUILDKIT_SOCKET and BUILDER_NAME. Order of preference:
#   1. a buildkitd unix socket at BUILDKIT_SOCKET -> "baked-vllm-builder"
#      using the remote driver;
#   2. an already existing builder named BUILDER_NAME;
#   3. a newly created docker-container builder named BUILDER_NAME.
setup_buildx_builder() {
    echo "--- :buildkite: Setting up buildx builder"

    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
        # Custom AMI with standalone buildkitd - use remote driver for warm cache
        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
        echo "Using remote driver to connect to buildkitd (warm cache available)"
        if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
            echo "Using existing baked-vllm-builder"
            docker buildx use baked-vllm-builder
        else
            echo "Creating baked-vllm-builder with remote driver"
            docker buildx create \
                --name baked-vllm-builder \
                --driver remote \
                --use \
                "unix://${BUILDKIT_SOCKET}"
        fi
        docker buildx inspect --bootstrap
    elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
        # Existing builder available
        echo "Using existing builder: ${BUILDER_NAME}"
        docker buildx use "${BUILDER_NAME}"
        docker buildx inspect --bootstrap
    else
        # No local buildkitd, no existing builder - create new docker-container builder
        echo "No local buildkitd found, using docker-container driver"
        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
        docker buildx inspect --bootstrap
    fi

    # builder info: show the header/selected rows, or the full list when the
    # grep matches nothing.
    echo "Active builder:"
    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
|
|
||||||
|
|
||||||
# If IMAGE_TAG is set and its manifest can be inspected, the image already
# exists: exit the whole script with status 0 so the build is skipped.
# Does nothing when IMAGE_TAG is empty or unset.
check_and_skip_if_image_exists() {
    if [[ -z "${IMAGE_TAG:-}" ]]; then
        return
    fi

    echo "--- :mag: Checking if image exists"
    if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
        echo "Image already exists: ${IMAGE_TAG}"
        echo "Skipping build"
        exit 0
    fi
    echo "Image not found, proceeding with build"
}
|
|
||||||
|
|
||||||
# Log docker in to two registries in us-east-1: the public ECR registry given
# by the global REGISTRY, and the private ECR registry that hosts the cache
# repositories referenced in prepare_cache_tags.
ecr_login() {
    aws ecr-public get-login-password --region us-east-1 \
        | docker login --username AWS --password-stdin "$REGISTRY"
    aws ecr get-login-password --region us-east-1 \
        | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}
|
|
||||||
|
|
||||||
# Resolve and export: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN.
#
# Reads BUILDKITE_PULL_REQUEST, BUILDKITE_BRANCH and (for PR builds)
# BUILDKITE_PULL_REQUEST_BASE_BRANCH from the environment.
#   - Non-PR build on main:      every cache ref is the postmerge ":latest" tag.
#   - Non-PR build on a branch:  every cache ref is the test cache tagged with
#                                the sanitized branch name.
#   - PR build:                  read/write "pr-<number>" in the test cache and
#                                additionally read from the base branch cache.
# CACHE_FROM_MAIN always points at the postmerge ":latest" tag.
prepare_cache_tags() {
    # Scratch values only needed while resolving the tags.
    local cache clean_branch clean_base

    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"

    if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
            cache="${MAIN_CACHE_ECR}:latest"
        else
            clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
            cache="${TEST_CACHE_ECR}:${clean_branch}"
        fi
        CACHE_TO="$cache"
        CACHE_FROM="$cache"
        CACHE_FROM_BASE_BRANCH="$cache"
    else
        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
        CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
            CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
        else
            clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
        fi
    fi

    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
|
|
||||||
|
|
||||||
# Make sure PARENT_COMMIT is populated when possible.
# A value already present in the environment is kept; otherwise HEAD~1 is
# resolved with git and exported. When git cannot resolve it, PARENT_COMMIT
# is left as the empty string and a message is printed.
resolve_parent_commit() {
    if [[ -n "${PARENT_COMMIT:-}" ]]; then
        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
        return
    fi

    PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
    if [[ -n "${PARENT_COMMIT}" ]]; then
        echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
        export PARENT_COMMIT
    else
        echo "Could not determine parent commit (may be first commit in repo)"
    fi
}
|
|
||||||
|
|
||||||
# Print the fully resolved bake configuration for TARGET, save it to
# bake-config-build-<build number or "local">.json and upload that file as a
# Buildkite artifact. Reads VLLM_BAKE_FILE_PATH, CI_HCL_PATH and TARGET.
print_bake_config() {
    echo "--- :page_facing_up: Resolved bake configuration"
    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
    # "|| true": a failure to render the config must not abort the build.
    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" \
        | tee "${BAKE_CONFIG_FILE}" || true
    echo "Saved bake config to ${BAKE_CONFIG_FILE}"

    echo "--- :arrow_down: Uploading bake config to Buildkite"
    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}
|
|
||||||
|
|
||||||
#################################
#         Main Script           #
#################################
print_instance_info

# Arguments 1-7 are required; the 8th (IMAGE_TAG_LATEST) is optional.
if [[ $# -lt 7 ]]; then
    print_usage_and_exit
fi

# input args
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional

# build config
TARGET="test-ci"
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"

prepare_cache_tags
ecr_login

# Environment info (for docs and human readers)
#   CI_HCL_URL          - URL the ci.hcl bake file is downloaded from
#                         (default: ci-infra main branch)
#   VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
#
# Build configuration (exported as environment variables for bake):
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT

# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"

# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"

echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"

# Exits 0 here when IMAGE_TAG already exists in the registry.
check_and_skip_if_image_exists

echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"

if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
    echo "Make sure you're running from the vLLM repository root"
    exit 1
fi

echo "--- :arrow_down: Downloading ci.hcl"
# -f makes curl fail on HTTP errors, which aborts the script under "set -e".
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
echo "Downloaded to ${CI_HCL_PATH}"

if [[ ! -f "${CI_HCL_PATH}" ]]; then
    echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
    exit 1
fi

setup_buildx_builder

resolve_parent_commit
export PARENT_COMMIT

print_bake_config

echo "--- :docker: Building ${TARGET}"
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"

echo "--- :white_check_mark: Build complete"
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
group: Abuild
|
|
||||||
steps:
|
|
||||||
- label: ":docker: Build image"
|
|
||||||
key: image-build
|
|
||||||
depends_on: []
|
|
||||||
commands:
|
|
||||||
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
|
|
||||||
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
|
|
||||||
- label: ":docker: Build CPU image"
|
|
||||||
key: image-build-cpu
|
|
||||||
depends_on: []
|
|
||||||
commands:
|
|
||||||
- .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
|
|
||||||
- label: ":docker: Build HPU image"
|
|
||||||
soft_fail: true
|
|
||||||
depends_on: []
|
|
||||||
key: image-build-hpu
|
|
||||||
commands:
|
|
||||||
- .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
|
|
||||||
- label: ":docker: Build CPU arm64 image"
|
|
||||||
key: cpu-arm64-image-build
|
|
||||||
depends_on: []
|
|
||||||
optional: true
|
|
||||||
commands:
|
|
||||||
- .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 2
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
#!/bin/bash
# Build the CPU test image from docker/Dockerfile.cpu (target vllm-test, with
# the AVX512BF16 / AVX512VNNI / AMXBF16 build args enabled) and push it as
# <registry>/<repo>:<commit>-cpu. The build is skipped when that tag already
# exists in the registry.
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 \
    | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# (decide on the exit status rather than on captured output, and keep the
# manifest JSON / "no such manifest" noise out of the log)
if docker manifest inspect "$IMAGE" >/dev/null 2>&1; then
    echo "Image found"
    exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
    --build-arg VLLM_CPU_AVX512BF16=true \
    --build-arg VLLM_CPU_AVX512VNNI=true \
    --build-arg VLLM_CPU_AMXBF16=true \
    --tag "$IMAGE" \
    --target vllm-test \
    --progress plain .

# push
docker push "$IMAGE"
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
#!/bin/bash
# Build the CPU test image from docker/Dockerfile.cpu (target vllm-test,
# no instruction-set build args) and push it as <registry>/<repo>:<commit>-cpu.
# The build is skipped when that tag already exists in the registry.
#
# NOTE(review): the "-cpu" tag suffix is the same one used by the other
# Dockerfile.cpu build script in this change set. If both push to the same
# repository, whichever runs second will hit "Image found" and skip its
# build - confirm this is intended.
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 \
    | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# (decide on the exit status rather than on captured output)
if docker manifest inspect "$IMAGE" >/dev/null 2>&1; then
    echo "Image found"
    exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
    --tag "$IMAGE" \
    --target vllm-test \
    --progress plain .

# push
docker push "$IMAGE"
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
#!/bin/bash
# Build the HPU image and push it as <registry>/<repo>:<commit>-hpu.
# The build context is the remote vllm-gaudi git repository and the
# Dockerfile path is resolved inside that context. The build is skipped when
# the tag already exists in the registry.
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 \
    | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# (decide on the exit status rather than on captured output)
if docker manifest inspect "$IMAGE" >/dev/null 2>&1; then
    echo "Image found"
    exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build \
    --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
    --tag "$IMAGE" \
    --progress plain \
    https://github.com/vllm-project/vllm-gaudi.git

# push
docker push "$IMAGE"
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
|
|
||||||
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.671
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.664
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
trust_remote_code: True
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
|
|
||||||
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.905
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.905
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
|
||||||
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.892
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.892
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.752
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.754
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.753
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.753
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.755
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.755
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
|
||||||
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.753
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.753
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.764
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.764
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.728
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.728
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.758
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.759
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
|
|
||||||
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.756
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.752
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
|
||||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.419
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.416
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
|
|
||||||
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.335
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.323
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
|
||||||
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.356
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.358
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm-vlm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
|
|
||||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
|
||||||
backend: "vllm-vlm"
|
|
||||||
tasks:
|
|
||||||
- name: "chartqa"
|
|
||||||
metrics:
|
|
||||||
- name: "relaxed_accuracy,none"
|
|
||||||
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
|
|
||||||
value: 0.80
|
|
||||||
limit: 100
|
|
||||||
num_fewshot: 0
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
|
|
||||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "mmlu_pro"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,custom-extract"
|
|
||||||
value: 0.80
|
|
||||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
|
||||||
num_fewshot: 5
|
|
||||||
rtol: 0.05
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
|
|
||||||
model_name: "mgoin/Minitron-4B-Base-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.231
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.22
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
|
||||||
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.86
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.86
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
|
||||||
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.624
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.624
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
|
|
||||||
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.616
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.632
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.30
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.465
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.578
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.585
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
|
||||||
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.593
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.588
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
|
||||||
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.792
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.824
|
|
||||||
limit: 250
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
|
|
||||||
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.54
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.59
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size)
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
|
|
||||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.47
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.64
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
|
|
||||||
|
|
||||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
|
||||||
backend: "vllm-vlm"
|
|
||||||
tasks:
|
|
||||||
- name: "chartqa"
|
|
||||||
metrics:
|
|
||||||
- name: "relaxed_accuracy,none"
|
|
||||||
value: 0.855
|
|
||||||
limit: 2500
|
|
||||||
num_fewshot: 0
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "mmlu_pro"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,custom-extract"
|
|
||||||
value: 0.82
|
|
||||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
|
||||||
num_fewshot: 5
|
|
||||||
enforce_eager: false # we use false to speed up the eval process
|
|
||||||
kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
|
|
||||||
max_model_len: 40960
|
|
||||||
apply_chat_template: true
|
|
||||||
fewshot_as_multiturn: true
|
|
||||||
gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
|
|
||||||
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.6353
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.637
|
|
||||||
limit: null
|
|
||||||
num_fewshot: null
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
|
|
||||||
Meta-Llama-3-70B-Instruct.yaml
|
|
||||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
|
||||||
Qwen2-57B-A14-Instruct.yaml
|
|
||||||
DeepSeek-V2-Lite-Chat.yaml
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
Qwen2.5-VL-7B-Instruct.yaml
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
Qwen2.5-1.5B-Instruct.yaml
|
|
||||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
|
||||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
|
||||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
|
||||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
Qwen2.5-1.5B-Instruct.yaml
|
|
||||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
|
||||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
|
||||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
|
||||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
|
||||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(parser):
    """Register the command-line options used by the lm-eval tests.

    Adds ``--config-list-file`` (no default) and ``--tp-size`` (default
    ``"1"``, kept as a string) to the given pytest option parser.
    """
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        (
            "--config-list-file",
            {
                "action": "store",
                "help": "Path to the file listing model config YAMLs (one per line)",
            },
        ),
        (
            "--tp-size",
            {
                "action": "store",
                "default": "1",
                "help": "Tensor parallel size to use for evaluation",
            },
        ),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    """Return the ``--config-list-file`` option joined onto ``config_dir``.

    NOTE(review): ``config_dir`` is requested as a fixture but is not defined
    in the visible part of this file -- confirm where it comes from.
    """
    list_file_option = pytestconfig.getoption("--config-list-file")
    resolved = config_dir / list_file_option
    return resolved
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Return the raw value of the ``--tp-size`` command-line option."""
    requested_tp_size = pytestconfig.getoption("--tp-size")
    return requested_tp_size
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from the ``--config-list-file`` listing.

    The list file holds one config file name per line. Blank lines and
    comment lines (first non-blank character is ``#``) are ignored. Each
    remaining entry is resolved relative to the directory that contains the
    list file.

    Raises:
        ValueError: if a test requests ``config_filename`` but
            ``--config-list-file`` was not supplied.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    rel_path = metafunc.config.getoption("--config-list-file")
    if not rel_path:
        # Fail with an actionable message instead of the TypeError that
        # Path(None) would raise.
        raise ValueError(
            "--config-list-file is required to parametrize 'config_filename'"
        )

    config_list_file = Path(rel_path).resolve()
    config_dir = config_list_file.parent
    with open(config_list_file, encoding="utf-8") as f:
        # Strip first so that indented comment lines are recognised too.
        entries = (line.strip() for line in f)
        configs = [
            config_dir / entry
            for entry in entries
            if entry and not entry.startswith("#")
        ]
    metafunc.parametrize("config_filename", configs)
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
|
||||||
|
|
||||||
# Print the help text for this script to stdout.
usage() {
    echo
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -t    - tensor parallel size to run at"
    echo
}
|
|
||||||
|
|
||||||
# Parse the command-line options; an unknown option prints the help text
# and exits with status 1.
while getopts "m:l:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Run the ChartQA task through the multimodal vllm backend.
# "$LIMIT" is quoted so the value is always passed as a single argument.
lm_eval --model vllm-vlm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit "$LIMIT"
|
|
||||||
@@ -1,46 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
|
||||||
|
|
||||||
# Print the help text for this script to stdout.
usage() {
    echo
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo
}
|
|
||||||
|
|
||||||
# Read -m/-b/-l/-f into MODEL, BATCH_SIZE, LIMIT and FEWSHOT.
# Any unrecognised option shows the help text and exits with status 1.
while getopts "m:b:l:f:" OPT; do
  case "${OPT}" in
    m) MODEL="$OPTARG" ;;
    b) BATCH_SIZE="$OPTARG" ;;
    l) LIMIT="$OPTARG" ;;
    f) FEWSHOT="$OPTARG" ;;
    \?) usage; exit 1 ;;
  esac
done

# Evaluate GSM8k with the huggingface ("hf") backend of lm_eval.
lm_eval \
  --model hf \
  --model_args "pretrained=$MODEL,parallelize=True" \
  --tasks gsm8k \
  --num_fewshot "$FEWSHOT" \
  --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
|
|
||||||
@@ -1,51 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on GSM for vllm.
|
|
||||||
# We use this for fp8, which HF does not support.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
|
||||||
|
|
||||||
# Print the help text for this script to stdout.
usage() {
    echo
    # This script invokes `lm_eval --model vllm`, so the help text names vllm.
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}
|
|
||||||
|
|
||||||
# Read -m/-b/-l/-f/-t into MODEL, BATCH_SIZE, LIMIT, FEWSHOT and TP_SIZE.
# Any unrecognised option shows the help text and exits with status 1.
while getopts "m:b:l:f:t:" OPT; do
  case "${OPT}" in
    m) MODEL="$OPTARG" ;;
    b) BATCH_SIZE="$OPTARG" ;;
    l) LIMIT="$OPTARG" ;;
    f) FEWSHOT="$OPTARG" ;;
    t) TP_SIZE="$OPTARG" ;;
    \?) usage; exit 1 ;;
  esac
done

# Evaluate GSM8k with the vllm backend of lm_eval.
lm_eval \
  --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k \
  --num_fewshot "$FEWSHOT" \
  --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
|
|
||||||
# We use this for fp8, which HF does not support.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
|
||||||
|
|
||||||
# Print the help text for this script to stdout.
usage() {
    echo
    # This script invokes `lm_eval --model vllm`, so the help text names vllm.
    echo "Runs lm eval harness on MMLU Pro using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}
|
|
||||||
|
|
||||||
# Read -m/-b/-l/-f/-t into MODEL, BATCH_SIZE, LIMIT, FEWSHOT and TP_SIZE.
# Any unrecognised option shows the help text and exits with status 1.
# NOTE(review): -b is still accepted, but BATCH_SIZE is not used below --
# the evaluation always runs with `--batch_size auto`.
while getopts "m:b:l:f:t:" OPT; do
  case "${OPT}" in
    m) MODEL="$OPTARG" ;;
    b) BATCH_SIZE="$OPTARG" ;;
    l) LIMIT="$OPTARG" ;;
    f) FEWSHOT="$OPTARG" ;;
    t) TP_SIZE="$OPTARG" ;;
    \?) usage; exit 1 ;;
  esac
done

# Evaluate MMLU Pro with the vllm backend of lm_eval.
lm_eval \
  --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks mmlu_pro \
  --num_fewshot "$FEWSHOT" \
  --limit "$LIMIT" \
  --batch_size auto
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
"""
|
|
||||||
LM eval harness on model to compare vs HF baseline computed offline.
|
|
||||||
Configs are found in configs/$MODEL.yaml
|
|
||||||
|
|
||||||
pytest -s -v test_lm_eval_correctness.py \
|
|
||||||
--config-list-file=configs/models-small.txt \
|
|
||||||
--tp-size=1
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
from contextlib import contextmanager
|
|
||||||
|
|
||||||
import lm_eval
|
|
||||||
import numpy as np
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
DEFAULT_RTOL = 0.08
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
def scoped_env_vars(new_env: dict[str, str]):
    """Temporarily apply ``new_env`` to ``os.environ`` for the ``with`` body.

    Values are converted with ``str()`` before being stored. On exit,
    including when the body raises, variables that already existed get their
    previous value back and variables that were newly created are removed.
    A falsy ``new_env`` (``None`` or an empty mapping) leaves the environment
    untouched.
    """
    if new_env:
        overwritten = {}  # name -> value the variable had before
        created = []  # names that did not exist before

        try:
            for name, setting in new_env.items():
                if name not in os.environ:
                    created.append(name)
                else:
                    overwritten[name] = os.environ[name]
                os.environ[name] = str(setting)
            yield
        finally:
            # Undo the changes; the two groups are disjoint, so the order
            # of the loops does not matter.
            for name in created:
                os.environ.pop(name, None)
            for name, previous in overwritten.items():
                os.environ[name] = previous
    else:
        # Nothing to set, nothing to restore.
        yield
|
|
||||||
|
|
||||||
|
|
||||||
def launch_lm_eval(eval_config, tp_size):
    """Run ``lm_eval.simple_evaluate`` for one config and return its result.

    ``eval_config`` must provide ``model_name``, ``tasks``, ``num_fewshot``
    and ``limit``. The optional keys read here are ``trust_remote_code``,
    ``max_model_len``, ``batch_size``, ``backend``, ``enforce_eager``,
    ``kv_cache_dtype``, ``env_vars``, ``apply_chat_template``,
    ``fewshot_as_multiturn`` and ``gen_kwargs``. ``tp_size`` is passed through
    as ``tensor_parallel_size``. Variables listed under ``env_vars`` are set
    only for the duration of the evaluation.
    """
    backend = eval_config.get("backend", "vllm")
    batch_size = eval_config.get("batch_size", "auto")

    # key/value pairs in the order they appear in the model_args string.
    model_arg_pairs = [
        ("pretrained", eval_config["model_name"]),
        ("tensor_parallel_size", tp_size),
        ("enforce_eager", eval_config.get("enforce_eager", "true")),
        ("kv_cache_dtype", eval_config.get("kv_cache_dtype", "auto")),
        ("add_bos_token", "true"),
        ("trust_remote_code", eval_config.get("trust_remote_code", False)),
        ("max_model_len", eval_config.get("max_model_len", 4096)),
        ("allow_deprecated_quantization", "True"),
    ]
    # Every pair, including the last one, is terminated by a comma.
    model_args = "".join(f"{key}={value}," for key, value in model_arg_pairs)

    with scoped_env_vars(eval_config.get("env_vars", None)):
        task_names = [task["name"] for task in eval_config["tasks"]]
        return lm_eval.simple_evaluate(
            model=backend,
            model_args=model_args,
            tasks=task_names,
            num_fewshot=eval_config["num_fewshot"],
            limit=eval_config["limit"],
            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed
            # to help text models. however, this is regressing measured
            # strict-match for existing text models in CI, so only apply it
            # for mm, or explicitly set
            apply_chat_template=eval_config.get(
                "apply_chat_template", backend == "vllm-vlm"
            ),
            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
            # Forward decoding and early-stop controls
            # (e.g., max_gen_toks, until=...)
            gen_kwargs=eval_config.get("gen_kwargs"),
            batch_size=batch_size,
        )
|
|
||||||
|
|
||||||
|
|
||||||
def test_lm_eval_correctness_param(config_filename, tp_size):
    """Evaluate one model config and compare every metric to its baseline.

    Loads the YAML at ``config_filename``, runs the evaluation, prints each
    expected/measured pair, and fails if any measured value is not within
    the relative tolerance (``rtol`` from the config, else ``DEFAULT_RTOL``)
    of its expected value.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    results = launch_lm_eval(eval_config, tp_size)
    rtol = eval_config.get("rtol", DEFAULT_RTOL)

    all_within_tolerance = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            observed = results["results"][task["name"]][metric["name"]]
            report = (
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={expected:.3f} | "
                f"measured={observed:.3f} | rtol={rtol}"
            )
            print(report)
            # Keep reporting after the first mismatch, but stay failed.
            if all_within_tolerance:
                all_within_tolerance = np.isclose(expected, observed, rtol=rtol)

    assert all_within_tolerance
|
|
||||||
@@ -1,181 +0,0 @@
|
|||||||
# vLLM benchmark suite
|
|
||||||
|
|
||||||
## Introduction
|
|
||||||
|
|
||||||
This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
|
|
||||||
vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
|
|
||||||
|
|
||||||
## Performance benchmark quick overview
|
|
||||||
|
|
||||||
**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
|
|
||||||
|
|
||||||
**Benchmarking Duration**: about 1hr.
|
|
||||||
|
|
||||||
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
|
|
||||||
|
|
||||||
## Trigger the benchmark
|
|
||||||
|
|
||||||
The benchmark needs to be triggered manually:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
Runtime environment variables:
|
|
||||||
|
|
||||||
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
|
|
||||||
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
|
|
||||||
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
|
|
||||||
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
|
|
||||||
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
|
|
||||||
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
|
||||||
|
|
||||||
## Performance benchmark details
|
|
||||||
|
|
||||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
|
||||||
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
|
||||||
> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
|
|
||||||
> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
|
|
||||||
|
|
||||||
### Latency test
|
|
||||||
|
|
||||||
Here is an example of one test inside `latency-tests.json`:
|
|
||||||
|
|
||||||
```json
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp1",
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num_iters_warmup": 5,
|
|
||||||
"num_iters": 15
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
In this example:
|
|
||||||
|
|
||||||
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
|
||||||
- The `parameters` attribute controls the command line arguments to be used for `vllm bench latency`. Please use underscores `_` instead of dashes `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`.
|
|
||||||
|
|
||||||
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
|
||||||
|
|
||||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
|
|
||||||
|
|
||||||
### Throughput test
|
|
||||||
|
|
||||||
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `vllm bench throughput`.
|
|
||||||
|
|
||||||
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
|
||||||
|
|
||||||
### Serving test
|
|
||||||
|
|
||||||
We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
|
||||||
|
|
||||||
```json
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
Inside this example:
|
|
||||||
|
|
||||||
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
|
||||||
- The `server_parameters` attribute includes the command line arguments for the vLLM server.
|
|
||||||
- The `client_parameters` attribute includes the command line arguments for `vllm bench serve`.
|
|
||||||
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
|
|
||||||
|
|
||||||
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
|
|
||||||
|
|
||||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
|
||||||
|
|
||||||
#### Default Parameters Field
|
|
||||||
|
|
||||||
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary> An Example of default parameters field </summary>
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"defaults": {
|
|
||||||
"qps_list": [
|
|
||||||
"inf"
|
|
||||||
],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"block_size": 128,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"num_prompts": 200,
|
|
||||||
"ignore-eos": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tests": [
|
|
||||||
{
  "test_name": "serving_llama3B_tp2_random_128_128",
  "server_parameters": {
    "model": "meta-llama/Llama-3.2-3B-Instruct",
    "tensor_parallel_size": 2
  },
  "client_parameters": {
    "model": "meta-llama/Llama-3.2-3B-Instruct"
  }
},
|
|
||||||
{
  "test_name": "serving_qwen3_tp4_random_128_128",
  "server_parameters": {
    "model": "Qwen/Qwen3-14B",
    "tensor_parallel_size": 4
  },
  "client_parameters": {
    "model": "Qwen/Qwen3-14B"
  }
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
### Visualizing the results
|
|
||||||
|
|
||||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
|
||||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
|
||||||
If you do not see the table, please wait until the benchmark finishes running.
|
|
||||||
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
|
|
||||||
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
|
||||||
|
|
||||||
#### Performance Results Comparison
|
|
||||||
|
|
||||||
Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
|
|
||||||
@@ -1,65 +0,0 @@
|
|||||||
# Performance benchmarks descriptions
|
|
||||||
|
|
||||||
## Latency tests
|
|
||||||
|
|
||||||
- Input length: 32 tokens.
|
|
||||||
- Output length: 128 tokens.
|
|
||||||
- Batch size: fixed (8).
|
|
||||||
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
|
||||||
- CPU Models: llama-3.1 8B.
|
|
||||||
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
|
||||||
|
|
||||||
{latency_tests_markdown_table}
|
|
||||||
|
|
||||||
## Throughput tests
|
|
||||||
|
|
||||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
|
||||||
- Output length: the corresponding output length of these 200 prompts.
|
|
||||||
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
|
||||||
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
|
||||||
- CPU Models: llama-3.1 8B.
|
|
||||||
- Evaluation metrics: throughput.
|
|
||||||
|
|
||||||
{throughput_tests_markdown_table}
|
|
||||||
|
|
||||||
## Serving tests
|
|
||||||
|
|
||||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
|
||||||
- Output length: the corresponding output length of these 200 prompts.
|
|
||||||
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
|
||||||
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
|
||||||
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
|
||||||
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
|
|
||||||
- CPU Models: llama-3.1 8B.
|
|
||||||
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
|
||||||
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
|
|
||||||
|
|
||||||
{serving_tests_markdown_table}
|
|
||||||
|
|
||||||
## Platform Information
|
|
||||||
|
|
||||||
{platform_markdown_table}
|
|
||||||
|
|
||||||
## json version of the benchmarking tables
|
|
||||||
|
|
||||||
This section contains the data of the markdown tables above in JSON format.
|
|
||||||
You can load the benchmarking tables into pandas dataframes as follows:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import json
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
benchmarking_results_json = """The json string"""
|
|
||||||
benchmarking_results = json.loads(benchmarking_results_json)
|
|
||||||
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
|
|
||||||
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
|
|
||||||
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
|
|
||||||
```
|
|
||||||
|
|
||||||
The json string for all benchmarking tables:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{benchmarking_results_in_json_string}
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also check the raw experiment data in the Artifact tab of the Buildkite page.
|
|
||||||
@@ -1,825 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import html as _html
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from importlib import util
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Render floats with two decimals everywhere. This doubles as a safety net:
# if any DataFrame leaks into to_html() without a Styler, precision stays at 2.
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")

# Plotting is optional. find_spec() on a dotted name imports the parent
# package first and raises ModuleNotFoundError when that parent is missing,
# so the probe is guarded to keep the script usable without plotly.
try:
    plotly_found = util.find_spec("plotly.express") is not None
except (ImportError, ValueError):
    plotly_found = False

# Candidate key columns, in display order. The first four entries are the
# ones get_group_cols() uses for grouping.
DEFAULT_INFO_COLS = [
    "Model",
    "Dataset Name",
    "Input Len",
    "Output Len",
    # "TP Size",
    # "PP Size",
    "# of max concurrency.",
    "qps",
]
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Core data compare
|
|
||||||
# -----------------------------
|
|
||||||
def compare_data_columns(
    files: list[str],
    name_column: str,
    data_column: str,
    info_cols: list[str],
    drop_column: str,
    debug: bool = False,
):
    """
    Build a side-by-side table of one metric across several result files.

    Rows are aligned by keys derived from info_cols instead of row order:
    - Pick one canonical key list: the subset of info_cols present in ALL
      files, falling back to the subset present in the first file.
    - For each file: set the index to those keys and aggregate duplicates
      (mean for the metric, first for names and key values).
    - For every file after the first, add a "Ratio 1 vs N" column comparing
      it with the first file: base / current when the metric name contains
      "P99" or "Median", current / base otherwise. Entries whose denominator
      is zero are blanked so the table never shows inf.
    - Concat along axis=1 (indexes align), then drop the index; the key
      values remain available as regular columns taken from the first file.
    - If debug is set, add a <file_label>_name column per file.

    Args:
        files: Paths of the JSON result files (records orientation).
        name_column: Column holding the test name; only used when debug is set.
        data_column: Metric column to compare.
        info_cols: Candidate key columns, in display order.
        drop_column: Rows with a missing value in this column are dropped.
        debug: Whether to add the per-file test-name columns.

    Returns:
        A (table, labels) tuple. ``table`` has the key columns first, then one
        column per file and the ratio columns; ``labels`` lists the per-file
        column names in input order.

    Raises:
        ValueError: If a file cannot be read or no key column can be derived
            from info_cols.
    """
    print("\ncompare_data_column:", data_column)

    # Parse every input exactly once; the frames are reused further down.
    loaded: list[pd.DataFrame] = []
    for path in files:
        try:
            loaded.append(pd.read_json(path, orient="records"))
        except Exception as err:
            raise ValueError(f"Failed to read {path}") from err

    cols_per_file = [set(df.columns) for df in loaded]
    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
    if not key_cols:
        key_cols = [c for c in info_cols if c in cols_per_file[0]]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )

    # Columns that must compare numerically so keys from different files match.
    numeric_cols = (
        "Input Len",
        "Output Len",
        "TP Size",
        "PP Size",
        "# of max concurrency.",
        "qps",
    )

    frames = []
    raw_data_cols: list[str] = []
    compare_frames = []
    meta_added = False

    for file, df in zip(files, loaded):
        if drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

        for c in numeric_cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")

        # Only reachable through the first-file fallback above.
        for c in key_cols:
            if c not in df.columns:
                df[c] = pd.NA

        df_idx = df.set_index(key_cols, drop=False)

        meta = df_idx[key_cols]
        if not meta.index.is_unique:
            meta = meta.groupby(level=key_cols, dropna=False).first()

        # Label each file by its directory path, or by its own name when the
        # path has no directory part.
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
        s = df_idx[data_column]
        if not s.index.is_unique:
            s = s.groupby(level=key_cols, dropna=False).mean()
        s.name = file_label

        if not meta_added:
            frames.append(meta)
            meta_added = True

        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
            frames.append(name_s)

        frames.append(s)
        raw_data_cols.append(file_label)
        compare_frames.append(s)

        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
            # Latency metrics: lower is better, so the ratio is base/current.
            if "P99" in data_column or "Median" in data_column:
                numerator, denominator = base, current
            else:
                numerator, denominator = current, base
            # Mask on the actual denominator; masking on `base` alone left
            # inf in the latency case when `current` was 0.
            ratio = (numerator / denominator).mask(denominator == 0)
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)

    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)

    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]

    print(raw_data_cols)
    return concat_df, raw_data_cols
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Split helper
|
|
||||||
# -----------------------------
|
|
||||||
def split_json_by_tp_pp(
    input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
    """Split a results JSON into one file per (TP Size, PP Size) pair.

    The input may be a list of records or a dict wrapping such a list under
    "results", "serving_results", "benchmarks" or "data". When a test-name
    column exists, only rows whose name contains "serving" (case-insensitive)
    are kept. Missing or non-numeric TP/PP sizes default to 1.

    Args:
        input_file: Path of the JSON file to split.
        output_root: Directory under which ``tp<T>_pp<P>`` folders are created.

    Returns:
        Paths of the written ``benchmark_results.json`` files, in group order.
    """
    with open(input_file, encoding="utf-8") as f:
        payload = json.load(f)

    if isinstance(payload, dict):
        # Unwrap the first list-valued container key, if there is one.
        wrapped = next(
            (
                payload[key]
                for key in ("results", "serving_results", "benchmarks", "data")
                if isinstance(payload.get(key), list)
            ),
            None,
        )
        if wrapped is not None:
            payload = wrapped

    df = pd.DataFrame(payload)

    name_candidates = [
        c for c in ["Test name", "test_name", "Test Name"] if c in df.columns
    ]
    if name_candidates and name_candidates[0]:
        names = df[name_candidates[0]].astype(str)
        is_serving = names.str.contains(r"serving", case=False, na=False)
        df = df[is_serving].copy()

    aliases = {
        "tp_size": "TP Size",
        "tensor_parallel_size": "TP Size",
        "pp_size": "PP Size",
        "pipeline_parallel_size": "PP Size",
    }
    present_aliases = {old: new for old, new in aliases.items() if old in df.columns}
    df.rename(columns=present_aliases, inplace=True)

    # Default each parallelism size to 1, then force clean integers.
    for size_col in ("TP Size", "PP Size"):
        if size_col not in df.columns:
            df[size_col] = 1
    for size_col in ("TP Size", "PP Size"):
        numeric = pd.to_numeric(df[size_col], errors="coerce")
        df[size_col] = numeric.fillna(1).astype(int)

    saved_paths: list[str] = []
    for (tp, pp), subset in df.groupby(["TP Size", "PP Size"], dropna=False):
        target_dir = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
        os.makedirs(target_dir, exist_ok=True)
        target = os.path.join(target_dir, "benchmark_results.json")
        subset.to_json(target, orient="records", indent=2, force_ascii=False)
        print(f"Saved: {target}")
        saved_paths.append(target)

    return saved_paths
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Styling helpers
|
|
||||||
# -----------------------------
|
|
||||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
    """Return the name of the column to treat as the concurrency level.

    Well-known header spellings are tried first. Failing that, the first
    integer column with more than one distinct value and a minimum of at
    least 1 is used. If nothing qualifies, the default header
    "# of max concurrency." is returned even though it is absent from df.
    """
    known_headers = (
        "# of max concurrency.",
        "# of max concurrency",
        "Max Concurrency",
        "max_concurrency",
        "Concurrency",
    )
    named = next((h for h in known_headers if h in df.columns), None)
    if named is not None:
        return named

    # Heuristic fallback: the first positive, varying integer column.
    for col in df.columns:
        series = df[col]
        if series.dtype.kind not in "iu":
            continue
        if series.nunique() > 1 and series.min() >= 1:
            return col

    return "# of max concurrency."
|
|
||||||
|
|
||||||
|
|
||||||
def _highlight_threshold(
    df: pd.DataFrame, threshold: float
) -> pd.io.formats.style.Styler:
    """Highlight per-configuration metric cells whose value is <= threshold.

    Only numeric columns that are neither descriptive info columns nor
    "Ratio..." columns are styled.

    Args:
        df: Table to style.
        threshold: Upper bound (inclusive) for a cell to be highlighted.

    Returns:
        A Styler over df with the highlight rule attached.
    """
    conc_col = _find_concurrency_col(df)
    # Descriptive columns never hold metric values. "qps"/"QPS" are excluded
    # to match _config_value_columns; otherwise request rates such as 1 or 16
    # would be painted as if they were latencies within the limit.
    info_cols = {
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        conc_col,
        "qps",
        "QPS",
    }
    conf_cols = [
        c
        for c in df.columns
        if c not in info_cols
        and not str(c).startswith("Ratio")
        and pd.api.types.is_numeric_dtype(df[c])
    ]

    def _cell_style(v) -> str:
        if pd.notna(v) and v <= threshold:
            return "background-color:#e6ffe6;font-weight:bold;"
        return ""

    return df.style.map(_cell_style, subset=conf_cols)
|
|
||||||
|
|
||||||
|
|
||||||
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
|
|
||||||
ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
|
|
||||||
if not ratio_cols:
|
|
||||||
return styler
|
|
||||||
|
|
||||||
styler = styler.apply(
|
|
||||||
lambda _: ["background-color: #fff3b0"] * len(styler.data),
|
|
||||||
subset=ratio_cols,
|
|
||||||
axis=0,
|
|
||||||
)
|
|
||||||
|
|
||||||
styler = styler.set_table_styles(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"selector": f"th.col_heading.level0.col{i}",
|
|
||||||
"props": [("background-color", "#fff3b0")],
|
|
||||||
}
|
|
||||||
for i, col in enumerate(styler.data.columns)
|
|
||||||
if col in ratio_cols
|
|
||||||
],
|
|
||||||
overwrite=False,
|
|
||||||
)
|
|
||||||
return styler
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_two_decimals(
|
|
||||||
styler: pd.io.formats.style.Styler,
|
|
||||||
) -> pd.io.formats.style.Styler:
|
|
||||||
df = styler.data
|
|
||||||
num_cols = df.select_dtypes("number").columns
|
|
||||||
if len(num_cols) == 0:
|
|
||||||
return styler
|
|
||||||
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Valid max concurrency summary helpers
|
|
||||||
# -----------------------------
|
|
||||||
def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
    """List the numeric columns of df that hold per-configuration values.

    Info columns, the concurrency column, qps, ratio columns and test-name
    columns are skipped. Column order follows df.
    """
    skipped = {
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        conc_col,
        "qps",
        "QPS",
    }

    def holds_config_values(col) -> bool:
        if col in skipped:
            return False
        lowered = str(col).lower()
        if lowered.startswith("ratio"):
            return False
        if lowered.endswith("_name") or lowered in ("test name", "test_name"):
            return False
        return pd.api.types.is_numeric_dtype(df[col])

    return [col for col in df.columns if holds_config_values(col)]
|
|
||||||
|
|
||||||
|
|
||||||
def _max_concurrency_ok(
    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
):
    """Return the highest concurrency whose cfg_col value is <= threshold.

    Both columns are coerced to numbers and rows with a missing value in
    either are ignored. pd.NA is returned when df is None, a column is
    missing, or no row satisfies the threshold.
    """
    if df is None:
        return pd.NA
    if any(col not in df.columns for col in (conc_col, cfg_col)):
        return pd.NA

    numeric = df[[conc_col, cfg_col]].copy()
    for col in (conc_col, cfg_col):
        numeric[col] = pd.to_numeric(numeric[col], errors="coerce")
    numeric = numeric.dropna(subset=[conc_col, cfg_col])

    within_limit = numeric.loc[numeric[cfg_col] <= threshold, conc_col]
    if within_limit.empty:
        return pd.NA
    return within_limit.max()
|
|
||||||
|
|
||||||
|
|
||||||
def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
    """Return the cfg_col value of the first row whose concurrency equals conc_value.

    Columns and conc_value are coerced to numbers before comparing. pd.NA is
    returned when df is None, a column is missing, conc_value is missing or
    not numeric, or no row matches.
    """
    if df is None:
        return pd.NA
    if conc_col not in df.columns or cfg_col not in df.columns:
        return pd.NA
    if pd.isna(conc_value):
        return pd.NA

    numeric = df[[conc_col, cfg_col]].copy()
    for col in (conc_col, cfg_col):
        numeric[col] = pd.to_numeric(numeric[col], errors="coerce")

    target = pd.to_numeric(conc_value, errors="coerce")
    if pd.isna(target):
        return pd.NA

    matches = numeric.loc[numeric[conc_col] == target, cfg_col]
    if matches.empty:
        return pd.NA
    return matches.iloc[0]
|
|
||||||
|
|
||||||
|
|
||||||
def build_valid_max_concurrency_summary_html(
    tput_group_df: pd.DataFrame | None,
    ttft_group_df: pd.DataFrame | None,
    tpot_group_df: pd.DataFrame | None,
    conc_col: str,
    args,
) -> str:
    """Render the "Valid Max Concurrency Summary" table for one group.

    For each configuration column the table reports the highest concurrency
    that keeps TTFT within args.ttft_max_ms, the highest that keeps TPOT
    within args.tpot_max_ms, the smaller of the two ("Both"), and the
    throughput/TTFT/TPOT values measured at that "Both" concurrency.

    Args:
        tput_group_df: Throughput table of the group, or None.
        ttft_group_df: TTFT table of the group, or None.
        tpot_group_df: TPOT table of the group, or None.
        conc_col: Name of the concurrency column in those tables.
        args: Parsed CLI arguments; ttft_max_ms and tpot_max_ms are read.

    Returns:
        An HTML fragment (title plus table), or "" when both latency tables
        are None.
    """
    if ttft_group_df is None and tpot_group_df is None:
        return ""

    def value_cols(frame) -> list[str]:
        return [] if frame is None else _config_value_columns(frame, conc_col)

    ttft_cols = value_cols(ttft_group_df)
    tpot_cols = value_cols(tpot_group_df)
    tput_cols = value_cols(tput_group_df)

    # Prefer configurations present in every available table.
    if ttft_group_df is not None and tpot_group_df is not None:
        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
        if tput_group_df is not None:
            also_in_tput = [c for c in cfg_cols if c in tput_cols]
            if also_in_tput:
                cfg_cols = also_in_tput
    else:
        cfg_cols = ttft_cols or tpot_cols

    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

    both_col = f"Max {conc_col} (Both)"

    rows = []
    for cfg in cfg_cols:
        # The helpers already answer pd.NA for a None table.
        ttft_max = _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
        tpot_max = _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)

        if pd.isna(ttft_max) or pd.isna(tpot_max):
            both = pd.NA
        else:
            both = min(ttft_max, tpot_max)

        tput_at_both, ttft_at_both, tpot_at_both = (
            _value_at_concurrency(frame, conc_col, cfg, both)
            for frame in (tput_group_df, ttft_group_df, tpot_group_df)
        )

        rows.append(
            {
                "Configuration": cfg,
                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
                both_col: both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
                "TPOT @ Both (ms)": tpot_at_both,
            }
        )

    summary_df = pd.DataFrame(rows)
    value_columns = [c for c in summary_df.columns if c != "Configuration"]

    # Coerce to numeric so object-dtype columns (pd.NA mixed with numbers)
    # are still formatted by the Styler.
    for col in value_columns:
        summary_df[col] = pd.to_numeric(summary_df[col], errors="coerce")

    def two_decimals(value) -> str:
        return "" if pd.isna(value) else f"{float(value):.2f}"

    # Strict 2-decimal formatting for every non-Configuration column.
    styler = summary_df.style.format(dict.fromkeys(value_columns, two_decimals))

    def _green(v):
        return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""

    if both_col in summary_df.columns:
        styler = styler.map(_green, subset=[both_col])

    title = (
        '<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
        "Valid Max Concurrency Summary"
        "</div>\n"
    )
    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Plot helper
|
|
||||||
# -----------------------------
|
|
||||||
def _add_limit_line(fig, y_value: float, label: str):
    """Draw a dashed horizontal reference line on fig at y_value.

    The line is red when label mentions "ttft" (case-insensitive) and blue
    otherwise. When plotly is available, an empty dashed trace named label is
    also added so the limit appears in the legend.
    """
    colour = "red" if "ttft" in label.lower() else "blue"

    fig.add_hline(
        y=y_value,
        line_dash="dash",
        line_color=colour,
        annotation_text=f"{label}: {y_value} ms",
        annotation_position="top left",
    )

    if not plotly_found:
        return

    import plotly.graph_objects as go

    # A trace without points: contributes a legend entry only.
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        mode="lines",
        line={"dash": "dash", "color": colour},
        name=label,
    )
    fig.add_trace(legend_entry)
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Refactored main + group-first report
|
|
||||||
# -----------------------------
|
|
||||||
@dataclass(frozen=True)
class MetricPlan:
    """Immutable description of which metrics a report compares."""

    # Metric column names, processed in this order by the report writer.
    data_cols: list[str]
    # Rows with a missing value in this column are dropped before comparing.
    drop_column: str
|
|
||||||
|
|
||||||
|
|
||||||
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser of the comparison script."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    add("-f", "--file", action="append", type=str, help="input file name")
    add("--debug", action="store_true", help="show all information for debugging")
    add(
        "--plot",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="plot perf diagrams or not --no-plot --plot",
    )
    add(
        "-x",
        "--xaxis",
        type=str,
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    add(
        "-l",
        "--latency",
        type=str,
        default="p99",
        help="take median|p99 for latency like TTFT/TPOT",
    )

    # The two latency limits only differ by metric name and default value.
    for metric, default_limit in (("TTFT", 3000.0), ("TPOT", 100.0)):
        add(
            f"--{metric.lower()}-max-ms",
            type=float,
            default=default_limit,
            help=f"Reference limit for {metric} plots (ms)",
        )

    return parser
|
|
||||||
|
|
||||||
|
|
||||||
def choose_metrics(latency: str) -> MetricPlan:
    """Select the metric columns for the requested latency statistic.

    Any value containing "median" (case-insensitive) selects the median
    columns; everything else, including an empty or None value, selects P99.
    Rows are always filtered on the "P99" column.
    """
    statistic = "Median" if "median" in (latency or "").lower() else "P99"
    return MetricPlan(
        data_cols=["Output Tput (tok/s)", f"{statistic} TTFT (ms)", statistic],
        drop_column="P99",
    )
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
    """Resolve the files to compare and the info columns to key them on.

    With several input files they are compared as given. A single input file
    is first split per (TP Size, PP Size) into the "splits" directory, and
    those two columns are removed from the info columns.

    Raises:
        ValueError: If no input file was provided.
    """
    inputs = args.file
    if not inputs:
        raise ValueError("No input files provided. Use -f/--file.")

    if len(inputs) != 1:
        return inputs, info_cols

    split_files = split_json_by_tp_pp(inputs[0], output_root="splits")
    remaining_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
    return split_files, remaining_cols
|
|
||||||
|
|
||||||
|
|
||||||
def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
    """Resolve the info column used as the plot axis.

    Args:
        info_cols: Info columns currently in use.
        xaxis: Column name requested on the command line.

    Returns:
        ``xaxis`` when it is one of info_cols. Otherwise the entry at the
        historical fallback position (index 6) when the list is long enough,
        else "# of max concurrency." when that column is present.

    Raises:
        ValueError: If xaxis is unknown and no fallback column is available.
    """
    if xaxis in info_cols:
        return xaxis

    # The fallback used to be the hard-coded index 6, which is out of range
    # for the current six-entry default list (IndexError). Keep honouring the
    # index for longer lists, then fall back by name.
    legacy_index = 6
    if len(info_cols) > legacy_index:
        return info_cols[legacy_index]

    default_col = "# of max concurrency."
    if default_col in info_cols:
        return default_col

    raise ValueError(
        f"X axis column {xaxis!r} is not one of the info columns {info_cols} "
        "and no default concurrency column is available."
    )
|
|
||||||
|
|
||||||
|
|
||||||
def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
    """Return the group-by columns: the first four info columns found in output_df.

    Raises:
        ValueError: If none of those columns exists in output_df.
    """
    candidates = info_cols[:4]
    present = [c for c in candidates if c in output_df.columns]
    if present:
        return present
    raise ValueError(
        f"No valid group-by columns. Expected subset: {candidates}, "
        f"but DataFrame has: {list(output_df.columns)}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_group_key(name):
    """Return a group key as a tuple, wrapping a scalar key in a 1-tuple."""
    if isinstance(name, tuple):
        return name
    return (name,)
|
|
||||||
|
|
||||||
|
|
||||||
def group_filename(name, prefix: str = "perf_comparison_") -> str:
    """Build the per-group HTML file name from a group key.

    Key parts are joined with "_"; commas inside a part also become "_" and
    "/" becomes "-".
    """
    parts = [str(part) for part in normalize_group_key(name)]
    stem = "_".join(parts).replace(",", "_").replace("/", "-")
    return f"{prefix}{stem}.html"
|
|
||||||
|
|
||||||
|
|
||||||
def build_group_suffix(group_cols: list[str], name) -> str:
    """Describe a group as "col : [ value ] " pairs joined by " , "."""
    values = normalize_group_key(name)
    labelled = [f"{col} : [ {val} ] " for col, val in zip(group_cols, values)]
    return " , ".join(labelled)
|
|
||||||
|
|
||||||
|
|
||||||
def render_metric_table_html(
    display_group: pd.DataFrame,
    metric_label: str,
    group_suffix: str,
    args,
) -> str:
    """Render one metric table of a group as an HTML fragment with a title.

    TTFT metrics are highlighted against args.ttft_max_ms; metrics whose label
    mentions "tpot", "median" or "p99" against args.tpot_max_ms; anything else
    is left unhighlighted. Numbers get two decimals and ratio columns a
    yellow background.
    """
    label_html = _html.escape(metric_label)
    suffix_html = _html.escape(group_suffix)
    title = (
        '<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
        f"{label_html} — {suffix_html}"
        "</div>\n"
    )

    lowered = metric_label.lower()
    if "ttft" in lowered:
        styler = _highlight_threshold(display_group, args.ttft_max_ms)
    elif any(tag in lowered for tag in ("tpot", "median", "p99")):
        styler = _highlight_threshold(display_group, args.tpot_max_ms)
    else:
        styler = display_group.style

    styler = highlight_ratio_columns(_apply_two_decimals(styler))
    table_html = styler.to_html(table_attributes='border="1" class="dataframe"')
    return title + table_html
|
|
||||||
|
|
||||||
|
|
||||||
def maybe_write_plot(
    main_fh,
    sub_fh,
    group_df: pd.DataFrame,
    raw_data_cols: list[str],
    metric_label: str,
    y_axis_col: str,
    args,
):
    """Append a line chart of one metric to both report files.

    Nothing is written when plotting is disabled (args.plot) or plotly is not
    available. raw_data_cols must include y_axis_col: it is the x axis, and
    every other listed column becomes one line ("Configuration").
    """
    if not args.plot or not plotly_found:
        return

    import plotly.express as px

    wide = group_df[raw_data_cols].sort_values(by=y_axis_col)
    long_form = wide.melt(
        id_vars=y_axis_col,
        var_name="Configuration",
        value_name=metric_label,
    )

    fig = px.line(
        long_form,
        x=y_axis_col,
        y=metric_label,
        color="Configuration",
        title=f"{metric_label} vs {y_axis_col}",
        markers=True,
    )

    # Keep hover text and y tick labels at two decimals, like the tables.
    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
    fig.update_yaxes(tickformat=".2f")

    lowered = metric_label.lower()
    if "ttft" in lowered:
        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
    elif any(tag in lowered for tag in ("tpot", "median", "p99")):
        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")

    page = fig.to_html(full_html=True, include_plotlyjs="cdn")
    for handle in (main_fh, sub_fh):
        handle.write(page)
|
|
||||||
|
|
||||||
|
|
||||||
def build_group_keys(
    df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
):
    """Return the group keys of df for group_cols, in groupby iteration order.

    When sort_cols is given, df is sorted by those columns before grouping.
    Rows with missing key values form their own groups (dropna=False).
    """
    ordered = df.sort_values(by=sort_cols) if sort_cols else df
    grouped = ordered.groupby(group_cols, dropna=False)
    return [key for key, _ in grouped]
|
|
||||||
|
|
||||||
|
|
||||||
def write_report_group_first(
    files: list[str], info_cols: list[str], plan: MetricPlan, args
):
    """Write the HTML comparison report, organised group by group.

    Every metric in plan.data_cols is compared across files once. Then, for
    each group, all metric tables (plus plots and the valid-max-concurrency
    summary) are written both to "perf_comparison.html" and to a per-group
    file named by group_filename(), all in the current working directory.

    Raises:
        ValueError: If the metrics share no group-by column.
    """
    name_column = "Test name"
    y_axis_col = get_y_axis_col(info_cols, args.xaxis)

    print("comparing : " + ", ".join(files))

    # metric label -> (comparison table sorted by x axis, columns to plot)
    metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
    group_cols_canonical: list[str] | None = None

    for metric_label in plan.data_cols:
        output_df, file_labels = compare_data_columns(
            files,
            name_column,
            metric_label,
            info_cols,
            plan.drop_column,
            debug=args.debug,
        )
        plot_cols = [y_axis_col, *file_labels]

        # Keep only the group columns every metric table has.
        group_cols = get_group_cols(output_df, info_cols)
        if group_cols_canonical is None:
            group_cols_canonical = group_cols
        else:
            group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]

        metric_cache[metric_label] = (
            output_df.sort_values(by=args.xaxis),
            plot_cols,
        )

    if not group_cols_canonical:
        raise ValueError("No canonical group columns found across metrics.")

    # The first metric's table decides which groups exist and in what order.
    first_df_sorted = metric_cache[plan.data_cols[0]][0]
    group_keys = build_group_keys(
        first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
    )

    metric_groupbys = {
        label: frame.groupby(group_cols_canonical, dropna=False)
        for label, (frame, _) in metric_cache.items()
    }

    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
        main_fh.write('<meta charset="utf-8">\n')
        for gkey in group_keys:
            gkey_tuple = normalize_group_key(gkey)
            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
            sub_path = group_filename(gkey_tuple)
            group_header = (
                '<div style="font-size: 1.4em; font-weight: 700; '
                'margin: 18px 0 10px 0;">'
                f"{_html.escape(suffix)}"
                "</div>\n"
            )

            main_fh.write(group_header)
            with open(sub_path, "w", encoding="utf-8") as sub_fh:
                sub_fh.write('<meta charset="utf-8">\n')
                sub_fh.write(group_header)
                handles = (main_fh, sub_fh)

                # Tables collected for the summary; stay None when missing.
                by_kind = {"tput": None, "ttft": None, "tpot": None}
                conc_col = args.xaxis

                for metric_label in plan.data_cols:
                    _, plot_cols = metric_cache[metric_label]

                    try:
                        group_df = metric_groupbys[metric_label].get_group(gkey)
                    except KeyError:
                        missing = (
                            '<div style="font-size: 1.1em; font-weight: 600; '
                            'margin: 10px 0;">'
                            f"{_html.escape(metric_label)} — missing for this group"
                            "</div>\n"
                        )
                        for fh in handles:
                            fh.write(missing)
                        continue

                    if conc_col not in group_df.columns:
                        conc_col = _find_concurrency_col(group_df)

                    mn = metric_label.lower().strip()
                    if "tok/s" in mn:
                        by_kind["tput"] = group_df
                    elif "ttft" in mn:
                        by_kind["ttft"] = group_df
                    elif mn in ("p99", "median") or "tpot" in mn:
                        by_kind["tpot"] = group_df

                    # Group columns are constant within a group; hide them.
                    display_group = group_df.drop(
                        columns=group_cols_canonical, errors="ignore"
                    )

                    table_html = render_metric_table_html(
                        display_group, metric_label, suffix, args
                    )
                    for fh in handles:
                        fh.write(table_html)

                    maybe_write_plot(
                        main_fh,
                        sub_fh,
                        group_df=group_df,
                        raw_data_cols=plot_cols,
                        metric_label=metric_label,
                        y_axis_col=y_axis_col,
                        args=args,
                    )

                summary_html = build_valid_max_concurrency_summary_html(
                    tput_group_df=by_kind["tput"],
                    ttft_group_df=by_kind["ttft"],
                    tpot_group_df=by_kind["tpot"],
                    conc_col=conc_col,
                    args=args,
                )
                if summary_html:
                    for fh in handles:
                        fh.write(summary_html)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Parse the command line and write the comparison report."""
    cli_args = build_parser().parse_args()
    plan = choose_metrics(cli_args.latency)
    # Pass a copy so the module-level default column list is never modified.
    files, info_cols = prepare_input_files(cli_args, list(DEFAULT_INFO_COLS))
    write_report_group_first(files, info_cols, plan, cli_args)


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,414 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shlex
|
|
||||||
from importlib import util
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import psutil
|
|
||||||
import regex as re
|
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
# Latency results, plus the mapping from raw result keys (left) to the column
# headers printed in the markdown report (right). Keys that are commented out
# are not printed. The list starts empty; presumably it is filled by the
# result-loading code further down this file (not visible here).
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Mean latency (ms)",
    # "P10": "P10 (s)",
    # "P25": "P25 (s)",
    "P50": "Median latency (ms)",
    # "P75": "P75 (s)",
    # "P90": "P90 (s)",
    "P99": "P99 latency (ms)",
}
|
|
||||||
|
|
||||||
# Throughput results, plus the mapping from raw result keys (left) to the
# column headers printed in the markdown report (right). The list starts
# empty; presumably it is filled further down this file (not visible here).
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
}
|
|
||||||
|
|
||||||
# Serving results, plus the mapping from raw result keys (left) to the column
# headers printed in the markdown report (right). The list starts empty;
# presumably it is filled further down this file (not visible here).
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "model_id": "Model",
    "dataset_name": "Dataset Name",
    "input_len": "Input Len",
    "output_len": "Output Len",
    "tp_size": "TP Size",
    "pp_size": "PP Size",
    "dtype": "dtype",
    "gpu_type": "GPU",
    "completed": "# of req.",
    "qps": "qps",
    "max_concurrency": "# of max concurrency.",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    # "total_input_tokens": "Total input tokens",
    # "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    # NOTE(review): the median/P99 TPOT headers are the bare words "Median"
    # and "P99", unlike their TTFT/ITL siblings. They look like they are
    # matched by name by result consumers - confirm before renaming.
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
}
|
|
||||||
|
|
||||||
|
|
||||||
def read_markdown(file):
    """Return the text of *file* followed by a newline.

    When the path does not exist, a "<file> not found." line is returned
    instead of raising.
    """
    if not os.path.exists(file):
        return f"{file} not found.\n"
    with open(file) as handle:
        contents = handle.read()
    return contents + "\n"
|
|
||||||
|
|
||||||
|
|
||||||
def results_to_json(latency, throughput, serving):
    """Serialize the three result tables into one JSON string.

    Each argument must provide ``to_dict()``; the outputs are stored under
    the "latency", "throughput" and "serving" keys, in that order.
    """
    sections = (
        ("latency", latency),
        ("throughput", throughput),
        ("serving", serving),
    )
    payload = {label: table.to_dict() for label, table in sections}
    return json.dumps(payload)
|
|
||||||
|
|
||||||
|
|
||||||
def get_size_with_unit(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g:
    1253656 => '1.20MB'
    1253656678 => '1.17GB'

    The value is divided by 1024 until it drops below 1024 and is then
    rendered with two decimals, the matching unit prefix and *suffix*.
    Values of 1024**6 or more are reported in "E" units rather than falling
    off the end of the unit table and returning None.
    """
    factor = 1024
    size = bytes
    for unit in ["", "K", "M", "G", "T", "P"]:
        if size < factor:
            return f"{size:.2f}{unit}{suffix}"
        size /= factor
    # Larger than the biggest tabulated unit: keep the last scaling step.
    return f"{size:.2f}E{suffix}"
|
|
||||||
|
|
||||||
|
|
||||||
def _coerce(val: str) -> Any:
|
|
||||||
"""Best-effort type coercion from string to Python types."""
|
|
||||||
low = val.lower()
|
|
||||||
if low == "null":
|
|
||||||
return None
|
|
||||||
if low == "true":
|
|
||||||
return True
|
|
||||||
if low == "false":
|
|
||||||
return False
|
|
||||||
# integers
|
|
||||||
if re.fullmatch(r"[+-]?\d+", val):
|
|
||||||
try:
|
|
||||||
return int(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
# floats (keep 'inf'/'-inf'/'nan' as strings)
|
|
||||||
if re.fullmatch(r"[+-]?\d*\.\d+", val):
|
|
||||||
try:
|
|
||||||
return float(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return val
|
|
||||||
|
|
||||||
|
|
||||||
def parse_client_command(cmd: str) -> dict[str, Any]:
    """Parse the client_command shell string into {executable, script, args}.

    The first two shell tokens are the executable and the script. Of the
    remaining tokens only ``--`` options are recorded; anything else is
    ignored. Options may be written ``--key=value``, ``--key value`` or as a
    bare ``--key`` (stored as True). ``--metadata`` collects ``k=v`` entries
    into a dict, with bare entries mapped to True.

    Raises:
        ValueError: if *cmd* has fewer than two tokens.
    """
    tokens = shlex.split(cmd)
    if len(tokens) < 2:
        raise ValueError("client_command must include an executable and a script")

    def metadata_entry(pair: str) -> tuple[str, Any]:
        # "k=v" -> (k, coerced v); a bare "k" -> (k, True)
        name, sep, raw = pair.partition("=")
        return (name, _coerce(raw)) if sep else (pair, True)

    executable, script, *rest = tokens
    args: dict[str, Any] = {}
    total = len(rest)
    pos = 0
    while pos < total:
        token = rest[pos]
        pos += 1
        if not token.startswith("--"):
            # unexpected positional; skip
            continue

        flag, sep, inline = token.partition("=")
        if sep:
            # --key=value form
            if flag != "--metadata":
                args[flag] = _coerce(inline)
            elif inline:
                name, value = metadata_entry(inline)
                args[flag] = {name: value}
            else:
                args[flag] = {}
            continue

        if flag == "--metadata":
            # consume every following entry up to the next --flag
            collected = {}
            while pos < total and not rest[pos].startswith("--"):
                name, value = metadata_entry(rest[pos])
                collected[name] = value
                pos += 1
            args[flag] = collected
            continue

        if pos < total and not rest[pos].startswith("--"):
            # --key value form
            args[flag] = _coerce(rest[pos])
            pos += 1
        else:
            # lone flag -> True
            args[flag] = True

    return {"executable": executable, "script": script, "args": args}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Turn the per-test JSON results in a folder into a markdown report
    # (benchmark_results.md) and a flat JSON dump (benchmark_results.json),
    # both written back into that folder.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--result",
        type=str,
        default="results",
        help="Folder name for benchmark output results.",
    )
    args = parser.parse_args()
    results_folder = Path(args.result)
    if not results_folder.exists():
        raise FileNotFoundError(f"results folder does not exist: {results_folder}")
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())

        # Classify by file name only: matching against the full path would
        # misroute every file when a parent directory happens to contain
        # "serving", "latency" or "throughput".
        result_name = test_file.name

        if "serving" in result_name:
            # this result is generated via `vllm bench serve` command
            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue
            # Parse Server Command Arg
            server_args = parse_client_command(command["server_command"])["args"]
            for arg, column in (
                ("--tensor-parallel-size", "tp_size"),
                ("--pipeline-parallel-size", "pp_size"),
                ("--dtype", "dtype"),
            ):
                if arg in server_args:
                    raw_result[column] = server_args[arg]

            # Parse Client Command Arg
            client_args = parse_client_command(command["client_command"])["args"]
            for arg, column in (
                ("--dataset-name", "dataset_name"),
                ("--random-input-len", "input_len"),
                ("--random-output-len", "output_len"),
                ("--request-rate", "qps"),
            ):
                if arg in client_args:
                    raw_result[column] = client_args[arg]
            # Add Server, Client command
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})
            # add the result to raw_result
            serving_results.append(raw_result)
            continue

        elif "latency" in result_name:
            # this result is generated via `vllm bench latency` command

            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue

            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to raw_result
            latency_results.append(raw_result)
            continue

        elif "throughput" in result_name:
            # this result is generated via `vllm bench throughput` command

            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue

            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to raw_result
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    # Host description, rendered as its own table in the report.
    svmem = psutil.virtual_memory()
    platform_data = {
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }

    # Optional extras, added only when the corresponding module is installed.
    if util.find_spec("numa") is not None:
        from numa import info

        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]

    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info

        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]

    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    # remapping the key, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
            columns=latency_column_mapping
        )
    if not serving_results.empty:
        # Serving columns are optional: keep only the mapped ones that exist.
        valid_columns = [
            col for col in serving_column_mapping if col in serving_results.columns
        ]
        serving_results = serving_results[valid_columns].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
            continue

        # Sort all dataframes by their respective "Test name" columns
        df.sort_values(by="Test name", inplace=True)

        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        # The serving table keeps only columns that were present, so "GPU"
        # may be missing there; skip the rewrite instead of raising KeyError.
        if "GPU" in df.columns:
            df["GPU"] = df["GPU"].apply(
                lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
            )

    # get markdown tables
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    platform_md_table = tabulate(
        platform_results, headers="keys", tablefmt="pipe", showindex=True
    )

    # document the result
    md_file = "benchmark_results.md"
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        # The template path is relative to the current working directory.
        results = read_markdown(
            "../.buildkite/performance-benchmarks/"
            "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            platform_markdown_table=platform_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / json_file, "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )
        f.write(json.dumps(results))
|
|
||||||
@@ -1,224 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Currently FP8 benchmark is NOT enabled.
|
|
||||||
|
|
||||||
# Trace every command as it is executed.
set -x
# $1: JSON object with engine-specific server parameters.
# $2: JSON object with parameters shared by the launchers below
#     (they read keys such as .model, .tp and .port from it).
server_params=$1
common_params=$2
|
|
||||||
|
|
||||||
json2args() {
  # Render a flat JSON object as one line of "--key value" command line
  # flags, turning every '_' in a key into '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local flags
  flags=$(
    jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    ' <<<"$json_string"
  )
  echo "$flags"
}
|
|
||||||
|
|
||||||
launch_trt_server() {
  # Build a TensorRT-LLM engine for the configured model and start a Triton
  # server for it in the background.
  # Reads the globals $common_params (.model, .tp) and $server_params
  # (.model_type, .model_dtype, .max_*, .trt_llm_version).

  model_path=$(echo "$common_params" | jq -r '.model')
  # Drop everything up to and including the first '/' of the model path.
  model_name="${model_path#*/}"
  model_type=$(echo "$server_params" | jq -r '.model_type')
  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
  model_tp_size=$(echo "$common_params" | jq -r '.tp')
  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
  max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
  max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')

  # create model caching directory
  # NOTE: this wipes any existing ~/models directory first.
  cd ~
  rm -rf models
  mkdir -p models
  cd models
  models_dir=$(pwd)
  trt_model_path=${models_dir}/${model_name}-trt-ckpt
  trt_engine_path=${models_dir}/${model_name}-trt-engine

  # clone tensorrt backend
  # A fresh checkout is made under / and pinned to $trt_llm_version.
  cd /
  rm -rf tensorrtllm_backend
  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
  git lfs install
  cd tensorrtllm_backend
  git checkout "$trt_llm_version"
  git submodule update --init --recursive

  # build trtllm engine
  # Step 1 converts the checkpoint, step 2 builds the engine from it.
  cd /tensorrtllm_backend
  cd "./tensorrt_llm/examples/${model_type}"
  python3 convert_checkpoint.py \
    --model_dir "${model_path}" \
    --dtype "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --output_dir "${trt_model_path}"
  trtllm-build \
    --checkpoint_dir "${trt_model_path}" \
    --use_fused_mlp \
    --reduce_fusion disable \
    --workers 8 \
    --gpt_attention_plugin "${model_dtype}" \
    --gemm_plugin "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --max_batch_size "${max_batch_size}" \
    --max_input_len "${max_input_len}" \
    --max_seq_len "${max_seq_len}" \
    --max_num_tokens "${max_num_tokens}" \
    --output_dir "${trt_engine_path}"

  # handle triton protobuf files and launch triton server
  # The model repo is seeded from the inflight batcher templates, the built
  # engine is copied in, and each config.pbtxt template is filled in.
  cd /tensorrtllm_backend
  mkdir triton_model_repo
  cp -r all_models/inflight_batcher_llm/* triton_model_repo/
  cd triton_model_repo
  rm -rf ./tensorrt_llm/1/*
  cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
  python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
  python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
  python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
  python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
  python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
  cd /tensorrtllm_backend
  # Started in the background; this function does not wait for readiness.
  python3 scripts/launch_triton_server.py \
    --world_size="${model_tp_size}" \
    --model_repo=/tensorrtllm_backend/triton_model_repo &

}
|
|
||||||
|
|
||||||
launch_tgi_server() {
  # Start the TGI entrypoint in the background.
  # Model, shard count and port come from $common_params; every key in
  # $server_params is appended as a command line flag.
  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  # Only the presence of the "fp8" key matters, not its value.
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params."
    server_command="/tgi-entrypoint.sh \
      --model-id $model \
      --num-shard $tp \
      --port $port \
      --quantize fp8 \
      $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
    server_command="/tgi-entrypoint.sh \
      --model-id $model \
      --num-shard $tp \
      --port $port \
      $server_args"
  fi

  echo "Server command: $server_command"
  # The command is a single string, so it is re-parsed by eval.
  eval "$server_command" &

}
|
|
||||||
|
|
||||||
launch_lmdeploy_server() {
  # Start `lmdeploy serve api_server` in the background.
  # Model, tensor-parallel size and port come from $common_params; every key
  # in $server_params is appended as a command line flag.
  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  server_command="lmdeploy serve api_server $model \
    --tp $tp \
    --server-port $port \
    $server_args"

  # run the server
  echo "Server command: $server_command"
  # Unlike the other launchers, this one runs the command in a child bash.
  bash -c "$server_command" &
}
|
|
||||||
|
|
||||||
launch_sglang_server() {
  # Start `python3 -m sglang.launch_server` in the background.
  # Model, tensor-parallel size and port come from $common_params; every key
  # in $server_params is appended as a command line flag.

  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  # With the "fp8" key present the model is replaced by
  # .neuralmagic_quantized_model; the command line is otherwise the same.
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
    server_command="python3 \
      -m sglang.launch_server \
      --tp $tp \
      --model-path $model \
      --port $port \
      $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
    server_command="python3 \
      -m sglang.launch_server \
      --tp $tp \
      --model-path $model \
      --port $port \
      $server_args"
  fi

  # run the server
  echo "Server command: $server_command"
  eval "$server_command" &
}
|
|
||||||
|
|
||||||
launch_vllm_server() {
  # Start `vllm serve` in the background.
  # Model, tensor-parallel size and port come from $common_params; every key
  # in $server_params is appended as a command line flag.

  # Export the first address printed by `hostname -I` as VLLM_HOST_IP.
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  # With the "fp8" key present the model is replaced by
  # .neuralmagic_quantized_model; the command line is otherwise the same.
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
    server_command="vllm serve $model \
      -tp $tp \
      --port $port \
      $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
    server_command="vllm serve $model \
      -tp $tp \
      --port $port \
      $server_args"
  fi

  # run the server
  echo "Server command: $server_command"
  eval "$server_command" &
}
|
|
||||||
|
|
||||||
main() {
  # Dispatch to the launcher selected by $CURRENT_LLM_SERVING_ENGINE.
  # "trt", "tgi", "lmdeploy" and "sglang" must match exactly; any value that
  # contains "vllm" starts the vLLM server. Other values launch nothing.
  case "$CURRENT_LLM_SERVING_ENGINE" in
    trt) launch_trt_server ;;
    tgi) launch_tgi_server ;;
    lmdeploy) launch_lmdeploy_server ;;
    sglang) launch_sglang_server ;;
    *vllm*) launch_vllm_server ;;
  esac
}

main
|
|
||||||
@@ -1,493 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script should be run inside the CI process
|
|
||||||
# This script assumes that we are already inside the vllm/ directory
|
|
||||||
# Benchmarking results will be available inside vllm/benchmarks/results/
|
|
||||||
|
|
||||||
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
# -x traces every command as it is executed.
set -x
# A pipeline fails when any command in it fails, not only the last one.
set -o pipefail
|
|
||||||
|
|
||||||
check_gpus() {
  # Detect the accelerators using the first available vendor tool
  # (nvidia-smi, then amd-smi, then hl-smi) and publish the globals
  # gpu_count, gpu_type and arch_suffix. Exits with status 1 when no device
  # is counted.
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  elif command -v hl-smi; then
    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  # Stays empty unless the hl-smi branch below sets it.
  declare -g arch_suffix=''

  if command -v nvidia-smi; then
    # Second whitespace-separated field of each reported name, one line per
    # device.
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  elif command -v hl-smi; then
    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
    arch_suffix='-hpu'
  fi
  echo "GPU type is $gpu_type"
}
|
|
||||||
|
|
||||||
check_cpus() {
  # CPU-only counterpart of the GPU check: publish the NUMA node count as
  # numa_count and a CPU label as gpu_type. Exits with status 1 when lscpu
  # reports no NUMA node.
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  # Both spellings of 64-bit ARM map to the same label.
  case "$(uname -m)" in
    aarch64 | arm64)
      declare -g gpu_type="arm64-cpu"
      ;;
    *)
      declare -g gpu_type="cpu"
      ;;
  esac
  echo "GPU type is $gpu_type"
}
|
|
||||||
|
|
||||||
check_hf_token() {
  # Require HF_TOKEN to be set and to start with "hf_"; otherwise print an
  # error and exit with status 1.
  case "$HF_TOKEN" in
    "")
      echo "Error: HF_TOKEN is not set."
      exit 1
      ;;
    hf_*)
      echo "HF_TOKEN is set and valid."
      ;;
    *)
      echo "Error: HF_TOKEN does not start with 'hf_'."
      exit 1
      ;;
  esac
}
|
|
||||||
|
|
||||||
ensure_sharegpt_downloaded() {
  # Download the ShareGPT dataset into the current directory unless a file
  # with that name is already there.
  local dataset_file=ShareGPT_V3_unfiltered_cleaned_split.json
  if [ -f "$dataset_file" ]; then
    echo "$dataset_file already exists."
    return
  fi
  wget "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$dataset_file"
}
|
|
||||||
|
|
||||||
json2args() {
  # Render a flat JSON object as one line of "--key value" command line
  # flags, turning every '_' in a key into '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local flags
  flags=$(
    jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    ' <<<"$json_string"
  )
  echo "$flags"
}
|
|
||||||
|
|
||||||
json2envs() {
  # Render a flat JSON object as one line of KEY=value assignments.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local assignments
  assignments=$(
    jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    ' <<<"$json_string"
  )
  echo "$assignments"
}
|
|
||||||
|
|
||||||
wait_for_server() {
  # Poll localhost:8000/v1/completions once per second until curl succeeds.
  # Returns 0 on success and 1 when the 1200-second timeout expires first.
  local timeout_val="1200"
  if timeout "$timeout_val" bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done'; then
    return 0
  fi
  return 1
}
|
|
||||||
|
|
||||||
kill_processes_launched_by_current_bash() {
  # SIGKILL the direct children of this shell whose command (first word, as
  # listed by ps) matches the pattern given in $1.
  current_shell_pid=$$
  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
  if [ -z "$processes" ]; then
    echo "No processes found matching '$1'."
    return
  fi
  echo "Killing the following processes matching '$1':"
  echo "$processes"
  echo "$processes" | xargs kill -9
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
  # Forcefully stop leftover server/benchmark processes and wait for device
  # memory to drain before the next test starts.

  # Log the process table, then SIGKILL whatever listens on port 8000 and
  # every process matched by `pgrep python3`.
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9

  # wait until GPU memory usage smaller than 1GB
  # The loops below compare the first reported value against 1000.
  # NOTE(review): presumably that value is in MiB for each tool; confirm.
  if command -v nvidia-smi; then
    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
      sleep 1
    done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  elif command -v hl-smi; then
    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm

}
|
|
||||||
|
|
||||||
upload_to_buildkite() {
  # Publish the benchmark report to Buildkite: annotate the build with
  # benchmark_results.md and upload everything in $RESULTS_FOLDER.
  # Returns 0 without uploading when no buildkite-agent can be found, either
  # on PATH or at /workspace/buildkite-agent.
  if command -v buildkite-agent >/dev/null 2>&1; then
    BUILDKITE_AGENT_COMMAND="buildkite-agent"
  else
    if [ ! -f /workspace/buildkite-agent ]; then
      echo "buildkite-agent binary not found. Skip uploading the results."
      return 0
    fi
    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
  fi

  # Annotate first, then upload the artifacts.
  "$BUILDKITE_AGENT_COMMAND" annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
  "$BUILDKITE_AGENT_COMMAND" artifact upload "$RESULTS_FOLDER/*"
}
|
|
||||||
|
|
||||||
run_benchmark_tests() {
  # run benchmark tests using `vllm bench <test_type>` command
  # $1: test type (latency or throughput)
  # $2: a json file specifying test cases
  #
  # For every test case this writes $RESULTS_FOLDER/<test_name>.commands,
  # runs the benchmark (which writes $RESULTS_FOLDER/<test_name>.json) and
  # then cleans up leftover processes.

  local test_type=$1
  local test_file=$2

  # Iterate over tests
  # NOTE: the loop body runs on the receiving end of a pipe, i.e. in a
  # subshell, so `exit 1` below ends the loop but not the calling script.
  jq -c '.[]' "$test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    # (the selector is used as a regular expression)
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    bench_params=$(echo "$params" | jq -r '.parameters')
    bench_args=$(json2args "$bench_params")
    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    bench_envs=$(json2envs "$bench_environment_variables")

    # check if there is enough GPU to run the test
    # NOTE(review): jq prints "null" when tensor_parallel_size is absent;
    # confirm every test case sets it.
    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
      # On CPU the requirement is tp * pp NUMA nodes, unless REMOTE_HOST is
      # set.
      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    # The environment assignments are a prefix of the command itself.
    bench_command=" $bench_envs vllm bench $test_type \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $bench_args"

    echo "Running test case $test_name"
    echo "${test_type^} command: $bench_command"

    # recording benchmarking command and GPU command
    # The JSON key is "<test_type>_command", next to "gpu_type".
    jq_output=$(jq -n \
      --arg command "$bench_command" \
      --arg gpu "$gpu_type" \
      --arg test_type "$test_type" \
      '{
        ($test_type + "_command"): $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$bench_command"

    kill_gpu_processes

  done
}
|
|
||||||
|
|
||||||
run_latency_tests() {
  # Run the "latency" benchmark suite.
  # $1: a json file specifying the test cases
  run_benchmark_tests "latency" "$1"
}
|
|
||||||
|
|
||||||
run_startup_tests() {
  # Run the "startup" benchmark suite.
  # $1: a json file specifying the test cases
  run_benchmark_tests "startup" "$1"
}
|
|
||||||
|
|
||||||
run_throughput_tests() {
  # Run the "throughput" benchmark suite.
  # $1: a json file specifying the test cases
  run_benchmark_tests "throughput" "$1"
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `vllm bench serve` command
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
#
|
|
||||||
# Supported JSON formats:
|
|
||||||
# 1) Plain format: top-level array
|
|
||||||
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
|
||||||
#
|
|
||||||
# 2) Default parameters field + plain format tests
|
|
||||||
# {
|
|
||||||
# "defaults": { ... },
|
|
||||||
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
|
||||||
# }
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '
|
|
||||||
if type == "array" then
|
|
||||||
# Plain format: test cases array
|
|
||||||
.[]
|
|
||||||
elif (type == "object" and has("tests")) then
|
|
||||||
# merge the default parameters into each test cases
|
|
||||||
. as $root
|
|
||||||
| ($root.defaults // {}) as $d
|
|
||||||
| ($root.tests // [])[]
|
|
||||||
# default qps / max_concurrency from defaults if missing
|
|
||||||
| .qps_list = (.qps_list // $d.qps_list)
|
|
||||||
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
|
|
||||||
# merge envs / params: test overrides defaults
|
|
||||||
| .server_environment_variables =
|
|
||||||
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
|
|
||||||
| .server_parameters =
|
|
||||||
(($d.server_parameters // {}) + (.server_parameters // {}))
|
|
||||||
| .client_parameters =
|
|
||||||
(($d.client_parameters // {}) + (.client_parameters // {}))
|
|
||||||
else
|
|
||||||
error("Unsupported serving test file format: must be array or object with .tests")
|
|
||||||
end
|
|
||||||
' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
|
||||||
echo "In serving-test.json, test_name must start with \"serving_\"."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# get client and server arguments (after merged the default parameters)
|
|
||||||
server_params=$(echo "$params" | jq -r '.server_parameters')
|
|
||||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
|
||||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
|
||||||
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
server_envs=$(json2envs "$server_envs")
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
|
|
||||||
# qps_list
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# max_concurrency_list (fallback to num_prompts if missing)
|
|
||||||
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
|
|
||||||
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
|
|
||||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
|
||||||
max_concurrency_list="[$num_prompts]"
|
|
||||||
fi
|
|
||||||
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over max concurrency list $max_concurrency_list"
|
|
||||||
|
|
||||||
# check if there is enough resources to run the test
|
|
||||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
|
||||||
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
|
|
||||||
world_size=$(($tp*$pp))
|
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# check if server model and client model is aligned
|
|
||||||
server_model=$(echo "$server_params" | jq -r '.model')
|
|
||||||
client_model=$(echo "$client_params" | jq -r '.model')
|
|
||||||
if [[ $server_model != "$client_model" ]]; then
|
|
||||||
echo "Server model and client model must be the same. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
server_command="$server_envs vllm serve \
|
|
||||||
$server_args"
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
# support remote vllm server
|
|
||||||
client_remote_args=""
|
|
||||||
if [[ -z "${REMOTE_HOST}" ]]; then
|
|
||||||
bash -c "$server_command" &
|
|
||||||
server_pid=$!
|
|
||||||
# wait until the server is alive
|
|
||||||
if wait_for_server; then
|
|
||||||
echo ""
|
|
||||||
echo "vLLM server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "vLLM failed to start within the timeout period."
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
|
|
||||||
if [[ ${REMOTE_PORT} ]]; then
|
|
||||||
client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
|
|
||||||
else
|
|
||||||
client_remote_args=" --host=$REMOTE_HOST "
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different max_concurrency
|
|
||||||
for max_concurrency in $max_concurrency_list; do
|
|
||||||
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
|
|
||||||
echo " new test name $new_test_name"
|
|
||||||
# pass the tensor parallel size to the client so that it can be displayed
|
|
||||||
# on the benchmark dashboard
|
|
||||||
client_command="vllm bench serve \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
--max-concurrency $max_concurrency \
|
|
||||||
--metadata "tensor_parallel_size=$tp" \
|
|
||||||
$client_args $client_remote_args "
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
bash -c "$client_command"
|
|
||||||
|
|
||||||
# record the benchmarking commands
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
|
||||||
--arg client "$client_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
'{
|
|
||||||
server_command: $server,
|
|
||||||
client_command: $client,
|
|
||||||
gpu_type: $gpu
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
|
||||||
|
|
||||||
done
|
|
||||||
done
|
|
||||||
|
|
||||||
# clean up
|
|
||||||
kill -9 $server_pid
|
|
||||||
kill_gpu_processes
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
main() {
  # Entry point: detect the hardware, make sure the required tools exist,
  # run the serving / latency / startup / throughput suites, then
  # post-process the results and upload them.
  #
  # ARCH is the suffix used to pick the default <suite>-tests$ARCH.json file.
  local ARCH=''
  if [[ "$ON_CPU" != "1" ]]; then
    check_gpus
    ARCH="$arch_suffix"
  else
    check_cpus
    ARCH="-$gpu_type"
  fi
  check_hf_token

  # dependencies: install through apt-get only when a tool is missing
  if ! (which wget && which curl); then
    apt-get update && apt-get install -y wget curl
  fi
  if ! which jq; then
    apt-get update && apt-get -y install jq
  fi
  if ! which lsof; then
    apt-get update && apt-get install -y lsof
  fi

  # get the current IP address, required by `vllm bench serve` command
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"

  # prepare for benchmarking
  if ! cd benchmarks; then
    exit 1
  fi
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking: each suite file can be overridden through its *_JSON variable
  run_serving_tests "$QUICK_BENCHMARK_ROOT/tests/${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests "$QUICK_BENCHMARK_ROOT/tests/${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_startup_tests "$QUICK_BENCHMARK_ROOT/tests/${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests "$QUICK_BENCHMARK_ROOT/tests/${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 "$QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py"

  upload_to_buildkite
}
|
|
||||||
|
|
||||||
# Script entry point: forward all command-line arguments to main.
main "$@"
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "llama8B_tp1_genai_perf",
|
|
||||||
"qps_list": [4,8,16,32],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
|
||||||
"tp": 1,
|
|
||||||
"port": 8000,
|
|
||||||
"num_prompts": 500,
|
|
||||||
"reuse_server": false
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"genai_perf_input_parameters": {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"num_iters_warmup": 5,
|
|
||||||
"num_iters": 15
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"num_iters_warmup": 5,
|
|
||||||
"num_iters": 15
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,55 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15,
|
|
||||||
"max-model-len": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama70B_tp4",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15,
|
|
||||||
"max-model-len": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "latency_mixtral8x7B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15,
|
|
||||||
"max-model-len": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp1",
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num_iters_warmup": 5,
|
|
||||||
"num_iters": 15
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama70B_tp4",
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "latency_mixtral8x7B_tp2",
|
|
||||||
"parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,311 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
|
||||||
"tp": 1,
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"reuse_server": false
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"enable_torch_compile": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama8B_tp1_sonnet_512_16",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
|
||||||
"tp": 1,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 16,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"enable_torch_compile": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama8B_tp1_sonnet_512_256",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
|
||||||
"tp": 1,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 256,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"enable_torch_compile": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama70B_tp4_sharegpt",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
|
||||||
"tp": 4,
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"reuse_server": false
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama70B_tp4_sonnet_512_16",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
|
||||||
"tp": 4,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 16,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama70B_tp4_sonnet_512_256",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
|
||||||
"tp": 4,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 256,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,130 +0,0 @@
|
|||||||
{
|
|
||||||
"defaults": {
|
|
||||||
"qps_list": [
|
|
||||||
"inf"
|
|
||||||
],
|
|
||||||
"max_concurrency_list": [
|
|
||||||
12,
|
|
||||||
16,
|
|
||||||
24,
|
|
||||||
32,
|
|
||||||
64,
|
|
||||||
128,
|
|
||||||
200
|
|
||||||
],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tests": [
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@@ -1,283 +0,0 @@
|
|||||||
{
|
|
||||||
"defaults": {
|
|
||||||
"qps_list": [
|
|
||||||
"inf"
|
|
||||||
],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tests": [
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama70B_tp4_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama70B_tp4_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
|
|
||||||
"qps_list": [2],
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"swap_space": 16,
|
|
||||||
"speculative_config": {
|
|
||||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
|
||||||
"num_speculative_tokens": 4,
|
|
||||||
"draft_tensor_parallel_size": 1
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200,
|
|
||||||
"backend": "vllm"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama8B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200,
|
|
||||||
"backend": "vllm"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 1000,
|
|
||||||
"backend": "vllm",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 512,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama70B_tp4",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 1000,
|
|
||||||
"backend": "vllm",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 512,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "throughput_mixtral8x7B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 1000,
|
|
||||||
"backend": "vllm",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 512,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,35 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama8B_tp1",
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200,
|
|
||||||
"backend": "vllm"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama70B_tp4",
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200,
|
|
||||||
"backend": "vllm"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "throughput_mixtral8x7B_tp2",
|
|
||||||
"parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200,
|
|
||||||
"backend": "vllm"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,713 +0,0 @@
|
|||||||
steps:
|
|
||||||
- input: "Provide Release version here"
|
|
||||||
id: input-release-version
|
|
||||||
fields:
|
|
||||||
- text: "What is the release version?"
|
|
||||||
key: release-version
|
|
||||||
|
|
||||||
- group: "Build Python wheels"
|
|
||||||
key: "build-wheels"
|
|
||||||
steps:
|
|
||||||
- label: "Build wheel - aarch64 - CUDA 12.9"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-arm64-cuda-12-9
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
|
||||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build wheel - aarch64 - CUDA 13.0"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-arm64-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
|
||||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build wheel - aarch64 - CPU"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-arm64-cpu
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build wheel - x86_64 - CUDA 12.9"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-x86-cuda-12-9
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build wheel - x86_64 - CUDA 13.0"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-x86-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build wheel - x86_64 - CPU"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-x86-cpu
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- group: "Build release Docker images"
|
|
||||||
key: "build-release-images"
|
|
||||||
steps:
|
|
||||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-release-image-x86
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
|
||||||
# re-tag to default image tag and push, just in case arm64 build fails
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
|
||||||
|
|
||||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-release-image-arm64
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
|
||||||
|
|
||||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-release-image-x86-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
|
||||||
# re-tag to default image tag and push, just in case arm64 build fails
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
|
||||||
|
|
||||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-release-image-arm64-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
|
||||||
|
|
||||||
- block: "Build release image for x86_64 CPU"
|
|
||||||
key: block-cpu-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build release image - x86_64 - CPU"
|
|
||||||
depends_on:
|
|
||||||
- block-cpu-release-image-build
|
|
||||||
- input-release-version
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- block: "Build release image for arm64 CPU"
|
|
||||||
key: block-arm64-cpu-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build release image - arm64 - CPU"
|
|
||||||
depends_on:
|
|
||||||
- block-arm64-cpu-release-image-build
|
|
||||||
- input-release-version
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- group: "Publish release images"
|
|
||||||
key: "publish-release-images"
|
|
||||||
steps:
|
|
||||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
|
||||||
depends_on:
|
|
||||||
- build-release-image-x86
|
|
||||||
- build-release-image-arm64
|
|
||||||
id: create-multi-arch-manifest
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
|
||||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
|
||||||
|
|
||||||
- label: "Annotate release workflow - CUDA 12.9"
|
|
||||||
depends_on:
|
|
||||||
- create-multi-arch-manifest
|
|
||||||
id: annotate-release-workflow
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/annotate-release.sh"
|
|
||||||
|
|
||||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
|
||||||
depends_on:
|
|
||||||
- build-release-image-x86-cuda-13-0
|
|
||||||
- build-release-image-arm64-cuda-13-0
|
|
||||||
id: create-multi-arch-manifest-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
|
||||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
|
||||||
|
|
||||||
- label: "Publish nightly multi-arch image to DockerHub"
|
|
||||||
depends_on:
|
|
||||||
- create-multi-arch-manifest
|
|
||||||
if: build.env("NIGHTLY") == "1"
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
|
||||||
# Clean up old nightly builds (keep only last 14)
|
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
|
||||||
plugins:
|
|
||||||
- docker-login#v3.0.0:
|
|
||||||
username: vllmbot
|
|
||||||
password-env: DOCKERHUB_TOKEN
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
|
||||||
|
|
||||||
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
|
||||||
depends_on:
|
|
||||||
- create-multi-arch-manifest-cuda-13-0
|
|
||||||
if: build.env("NIGHTLY") == "1"
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
|
||||||
# Clean up old nightly builds (keep only last 14)
|
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
|
||||||
plugins:
|
|
||||||
- docker-login#v3.0.0:
|
|
||||||
username: vllmbot
|
|
||||||
password-env: DOCKERHUB_TOKEN
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
|
||||||
|
|
||||||
- group: "Publish wheels"
|
|
||||||
key: "publish-wheels"
|
|
||||||
steps:
|
|
||||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
|
||||||
key: block-upload-release-wheels
|
|
||||||
depends_on:
|
|
||||||
- input-release-version
|
|
||||||
- build-wheels
|
|
||||||
|
|
||||||
- label: "Upload release wheels to PyPI"
|
|
||||||
depends_on:
|
|
||||||
- block-upload-release-wheels
|
|
||||||
id: upload-release-wheels
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# ROCm Release Pipeline (x86_64 only)
|
|
||||||
# =============================================================================
|
|
||||||
#
|
|
||||||
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
|
|
||||||
# To build a specific version, trigger the build from that branch/tag.
|
|
||||||
#
|
|
||||||
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
|
|
||||||
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
|
|
||||||
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
|
|
||||||
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
|
|
||||||
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
|
|
||||||
#
|
|
||||||
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
|
|
||||||
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
|
|
||||||
#
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
# ROCm Input Step - Collect build configuration (manual trigger only)
|
|
||||||
- input: "ROCm Wheel Release Build Configuration"
|
|
||||||
key: input-rocm-config
|
|
||||||
depends_on: ~
|
|
||||||
if: build.source == "ui"
|
|
||||||
fields:
|
|
||||||
- text: "Python Version"
|
|
||||||
key: "rocm-python-version"
|
|
||||||
default: "3.12"
|
|
||||||
hint: "Python version (e.g., 3.12)"
|
|
||||||
- text: "GPU Architectures"
|
|
||||||
key: "rocm-pytorch-rocm-arch"
|
|
||||||
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
|
||||||
hint: "Semicolon-separated GPU architectures"
|
|
||||||
- select: "Upload Wheels to S3"
|
|
||||||
key: "rocm-upload-wheels"
|
|
||||||
default: "true"
|
|
||||||
options:
|
|
||||||
- label: "No - Build only (nightly/dev)"
|
|
||||||
value: "false"
|
|
||||||
- label: "Yes - Upload to S3 (release)"
|
|
||||||
value: "true"
|
|
||||||
- select: "Force Rebuild Base Wheels"
|
|
||||||
key: "rocm-force-rebuild"
|
|
||||||
default: "false"
|
|
||||||
hint: "Ignore S3 cache and rebuild base wheels from scratch"
|
|
||||||
options:
|
|
||||||
- label: "No - Use cached wheels if available"
|
|
||||||
value: "false"
|
|
||||||
- label: "Yes - Rebuild even if cache exists"
|
|
||||||
value: "true"
|
|
||||||
|
|
||||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
|
||||||
- label: ":rocm: Build ROCm Base Wheels"
|
|
||||||
id: build-rocm-base-wheels
|
|
||||||
depends_on:
|
|
||||||
- step: input-rocm-config
|
|
||||||
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
# Set configuration and check cache
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Get values from meta-data (set by input step) or use defaults
|
|
||||||
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
|
|
||||||
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
|
|
||||||
|
|
||||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
|
||||||
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
|
||||||
|
|
||||||
# Check for force rebuild flag
|
|
||||||
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
|
|
||||||
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
|
|
||||||
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "========================================"
|
|
||||||
echo "ROCm Base Wheels Build Configuration"
|
|
||||||
echo "========================================"
|
|
||||||
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
|
|
||||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
|
||||||
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Save resolved config for later jobs
|
|
||||||
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
|
|
||||||
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
|
|
||||||
|
|
||||||
# Check S3 cache for pre-built wheels
|
|
||||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
|
||||||
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
|
|
||||||
echo ""
|
|
||||||
echo "Cache key: $${CACHE_KEY}"
|
|
||||||
echo "Cache path: $${CACHE_PATH}"
|
|
||||||
|
|
||||||
# Save cache key for downstream jobs
|
|
||||||
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
|
|
||||||
|
|
||||||
CACHE_STATUS="miss"
|
|
||||||
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
|
|
||||||
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
|
||||||
else
|
|
||||||
echo "Force rebuild requested, skipping cache check"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$${CACHE_STATUS}" = "hit" ]; then
|
|
||||||
echo ""
|
|
||||||
echo "CACHE HIT! Downloading pre-built wheels..."
|
|
||||||
echo ""
|
|
||||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
|
||||||
|
|
||||||
# Set the S3 path for the cached Docker image (for Job 2 to download)
|
|
||||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
|
||||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
|
|
||||||
# Mark that we used cache (for Docker image handling)
|
|
||||||
buildkite-agent meta-data set "rocm-used-cache" "true"
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Cache download complete. Skipping Docker build."
|
|
||||||
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "CACHE MISS. Building from scratch..."
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Build full base image (for later vLLM build)
|
|
||||||
DOCKER_BUILDKIT=1 docker buildx build \
|
|
||||||
--file docker/Dockerfile.rocm_base \
|
|
||||||
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
|
|
||||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
|
||||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
|
||||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
|
||||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
|
||||||
--load \
|
|
||||||
.
|
|
||||||
|
|
||||||
# Build debs_wheel_release stage for wheel extraction
|
|
||||||
DOCKER_BUILDKIT=1 docker buildx build \
|
|
||||||
--file docker/Dockerfile.rocm_base \
|
|
||||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
|
||||||
--target debs_wheel_release \
|
|
||||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
|
||||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
|
||||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
|
||||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
|
||||||
--load \
|
|
||||||
.
|
|
||||||
|
|
||||||
# Extract wheels from Docker image
|
|
||||||
mkdir -p artifacts/rocm-base-wheels
|
|
||||||
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
|
||||||
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
|
|
||||||
docker rm $${container_id}
|
|
||||||
echo "Extracted base wheels:"
|
|
||||||
ls -lh artifacts/rocm-base-wheels/
|
|
||||||
|
|
||||||
# Upload wheels to S3 cache for future builds
|
|
||||||
echo ""
|
|
||||||
echo "Uploading wheels to S3 cache..."
|
|
||||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
|
||||||
|
|
||||||
# Export base Docker image for reuse in vLLM build
|
|
||||||
mkdir -p artifacts/rocm-docker-image
|
|
||||||
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
|
||||||
echo "Docker image size:"
|
|
||||||
ls -lh artifacts/rocm-docker-image/
|
|
||||||
|
|
||||||
# Upload large Docker image to S3 (also cached by cache key)
|
|
||||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
|
||||||
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
|
|
||||||
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
|
|
||||||
# Save the S3 path for downstream jobs
|
|
||||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
|
|
||||||
# Mark that we did NOT use cache
|
|
||||||
buildkite-agent meta-data set "rocm-used-cache" "false"
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Build complete. Wheels cached for future builds."
|
|
||||||
fi
|
|
||||||
artifact_paths:
|
|
||||||
- "artifacts/rocm-base-wheels/*.whl"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|
||||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
|
||||||
- label: ":python: Build vLLM ROCm Wheel - x86_64"
|
|
||||||
id: build-rocm-vllm-wheel
|
|
||||||
depends_on:
|
|
||||||
- step: build-rocm-base-wheels
|
|
||||||
allow_failure: false
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
timeout_in_minutes: 180
|
|
||||||
commands:
|
|
||||||
# Download artifacts and prepare Docker image
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
|
|
||||||
# This fixes version detection when tags are moved/force-pushed
|
|
||||||
echo "Fetching latest tags from origin..."
|
|
||||||
git fetch --tags --force origin
|
|
||||||
|
|
||||||
# Log tag information for debugging version detection
|
|
||||||
echo "========================================"
|
|
||||||
echo "Git Tag Verification"
|
|
||||||
echo "========================================"
|
|
||||||
echo "Current HEAD: $(git rev-parse HEAD)"
|
|
||||||
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
|
|
||||||
echo ""
|
|
||||||
echo "Recent tags (pointing to commits near HEAD):"
|
|
||||||
git tag -l --sort=-creatordate | head -5
|
|
||||||
echo "setuptools_scm version detection:"
|
|
||||||
pip install -q setuptools_scm 2>/dev/null || true
|
|
||||||
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Download wheel artifacts from current build
|
|
||||||
echo "Downloading wheel artifacts from current build"
|
|
||||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
|
||||||
|
|
||||||
# Download Docker image from S3 (too large for Buildkite artifacts)
|
|
||||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
|
||||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
|
||||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
|
||||||
echo "This should have been set by the build-rocm-base-wheels job"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
|
|
||||||
mkdir -p artifacts/rocm-docker-image
|
|
||||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
|
||||||
|
|
||||||
# Load base Docker image and capture the tag
|
|
||||||
echo "Loading base Docker image..."
|
|
||||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
|
||||||
echo "$${LOAD_OUTPUT}"
|
|
||||||
# Extract the actual loaded image tag from "Loaded image: <tag>" output
|
|
||||||
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
|
|
||||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
|
||||||
if [ -z "$${BASE_IMAGE_TAG}" ]; then
|
|
||||||
echo "ERROR: Failed to extract image tag from docker load output"
|
|
||||||
echo "Load output was: $${LOAD_OUTPUT}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
|
||||||
|
|
||||||
# Prepare base wheels for Docker build context
|
|
||||||
mkdir -p docker/context/base-wheels
|
|
||||||
touch docker/context/base-wheels/.keep
|
|
||||||
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
|
|
||||||
echo "Base wheels for vLLM build:"
|
|
||||||
ls -lh docker/context/base-wheels/
|
|
||||||
|
|
||||||
# Get GPU architectures from meta-data
|
|
||||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
|
||||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
|
||||||
|
|
||||||
echo "========================================"
|
|
||||||
echo "Building vLLM wheel with:"
|
|
||||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
|
||||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
|
||||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
|
||||||
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
|
||||||
DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--file docker/Dockerfile.rocm \
|
|
||||||
--target export_vllm_wheel_release \
|
|
||||||
--output type=local,dest=rocm-dist \
|
|
||||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
|
||||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
|
||||||
--build-arg REMOTE_VLLM=0 \
|
|
||||||
--build-arg GIT_REPO_CHECK=1 \
|
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
|
||||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
|
||||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
|
||||||
.
|
|
||||||
|
|
||||||
echo "Built vLLM wheel:"
|
|
||||||
ls -lh rocm-dist/*.whl
|
|
||||||
|
|
||||||
# Copy wheel to artifacts directory
|
|
||||||
mkdir -p artifacts/rocm-vllm-wheel
|
|
||||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
|
||||||
echo "Final vLLM wheel:"
|
|
||||||
ls -lh artifacts/rocm-vllm-wheel/
|
|
||||||
artifact_paths:
|
|
||||||
- "artifacts/rocm-vllm-wheel/*.whl"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|
||||||
# ROCm Job 3: Upload Wheels to S3
|
|
||||||
- label: ":s3: Upload ROCm Wheels to S3"
|
|
||||||
id: upload-rocm-wheels
|
|
||||||
depends_on:
|
|
||||||
- step: build-rocm-vllm-wheel
|
|
||||||
allow_failure: false
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
commands:
|
|
||||||
# Download all wheel artifacts and run upload
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Check if upload is enabled (ROCM_UPLOAD_WHEELS env var first, then the input-step meta-data)
|
|
||||||
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
|
|
||||||
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
|
|
||||||
# Try to get from meta-data (input form)
|
|
||||||
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "========================================"
|
|
||||||
echo "Upload check:"
|
|
||||||
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
|
|
||||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Skip upload if not enabled
|
|
||||||
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
|
|
||||||
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
|
|
||||||
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Upload enabled, proceeding..."
|
|
||||||
|
|
||||||
# Download artifacts from current build
|
|
||||||
echo "Downloading artifacts from current build"
|
|
||||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
|
||||||
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
|
|
||||||
|
|
||||||
# Run upload script
|
|
||||||
bash .buildkite/scripts/upload-rocm-wheels.sh
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|
||||||
# ROCm Job 4: Annotate ROCm Wheel Release
|
|
||||||
- label: ":memo: Annotate ROCm wheel release"
|
|
||||||
id: annotate-rocm-release
|
|
||||||
depends_on:
|
|
||||||
- step: upload-rocm-wheels
|
|
||||||
allow_failure: true
|
|
||||||
- step: input-release-version
|
|
||||||
allow_failure: true
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
|
||||||
env:
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|
||||||
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
|
|
||||||
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
|
|
||||||
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
|
|
||||||
- block: "Generate Root Index for ROCm Wheels for Release"
|
|
||||||
key: block-generate-root-index-rocm-wheels
|
|
||||||
depends_on: upload-rocm-wheels
|
|
||||||
|
|
||||||
- label: ":package: Generate Root Index for ROCm Wheels for Release"
|
|
||||||
depends_on: block-generate-root-index-rocm-wheels
|
|
||||||
id: generate-root-index-rocm-wheels
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
|
|
||||||
env:
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
VARIANT: "rocm700"
|
|
||||||
|
|
||||||
# ROCm Job 6: Build ROCm Release Docker Image
|
|
||||||
- label: ":docker: Build release image - x86_64 - ROCm"
|
|
||||||
id: build-rocm-release-image
|
|
||||||
depends_on:
|
|
||||||
- step: build-rocm-base-wheels
|
|
||||||
allow_failure: false
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
commands:
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Login to ECR
|
|
||||||
aws ecr-public get-login-password --region us-east-1 | \
|
|
||||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
|
||||||
|
|
||||||
# Download Docker image from S3 (set by build-rocm-base-wheels)
|
|
||||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
|
||||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
|
||||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
|
|
||||||
mkdir -p artifacts/rocm-docker-image
|
|
||||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
|
||||||
|
|
||||||
# Load base Docker image
|
|
||||||
echo "Loading base Docker image..."
|
|
||||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
|
||||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
|
||||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
|
||||||
|
|
||||||
# Tag and push the base image to ECR
|
|
||||||
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
|
||||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
|
||||||
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
|
|
||||||
|
|
||||||
# Get GPU architectures from meta-data
|
|
||||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
|
||||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
|
||||||
|
|
||||||
# Build vLLM ROCm release image using cached base
|
|
||||||
DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--build-arg max_jobs=16 \
|
|
||||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
|
||||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
|
||||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
|
||||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
|
||||||
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
|
|
||||||
--target vllm-openai \
|
|
||||||
--progress plain \
|
|
||||||
-f docker/Dockerfile.rocm .
|
|
||||||
|
|
||||||
# Push to ECR
|
|
||||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
|
||||||
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
@@ -1,90 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
|
||||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
|
|
||||||
if [ -z "${RELEASE_VERSION}" ]; then
|
|
||||||
RELEASE_VERSION="1.0.0.dev"
|
|
||||||
fi
|
|
||||||
|
|
||||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
|
||||||
To download the wheel (by commit):
|
|
||||||
\`\`\`
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
|
|
||||||
|
|
||||||
(Optional) For CUDA 13.0:
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
|
|
||||||
|
|
||||||
(Optional) For CPU:
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
|
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
|
|
||||||
To download and upload the image:
|
|
||||||
|
|
||||||
\`\`\`
|
|
||||||
Download images:
|
|
||||||
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
|
||||||
|
|
||||||
Tag and push images:
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
|
||||||
docker push vllm/vllm-openai:latest-x86_64
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
|
||||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
|
||||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
|
||||||
docker push vllm/vllm-openai:latest-x86_64-cu130
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|
||||||
docker push vllm/vllm-openai:latest-aarch64
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
|
||||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
|
||||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
|
||||||
docker push vllm/vllm-openai:latest-aarch64-cu130
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
|
|
||||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
|
|
||||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
|
|
||||||
docker push vllm/vllm-openai-rocm:latest
|
|
||||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
|
|
||||||
|
|
||||||
Create multi-arch manifest:
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
|
||||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
|
||||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
|
||||||
docker push vllm/vllm-openai-rocm:latest-base
|
|
||||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
|
||||||
|
|
||||||
docker manifest rm vllm/vllm-openai:latest
|
|
||||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
|
||||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|
||||||
docker manifest push vllm/vllm-openai:latest
|
|
||||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
|
||||||
|
|
||||||
docker manifest rm vllm/vllm-openai:latest-cu130
|
|
||||||
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
|
||||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
|
||||||
docker manifest push vllm/vllm-openai:latest-cu130
|
|
||||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
|
||||||
\`\`\`
|
|
||||||
EOF
|
|
||||||
@@ -1,112 +0,0 @@
|
|||||||
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Generate Buildkite annotation for ROCm wheel release
#
# Reads build settings from Buildkite meta-data and docker/Dockerfile.rocm_base,
# then posts a markdown annotation (context 'rocm-release-workflow') with
# install, download and Docker retag/push instructions for this build.
set -ex

# Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
# `sed -n ... p` prints only when the version pattern matched, so a missing or
# unparsable BASE_IMAGE line yields an empty string and the explicit fallback
# below applies. (A trailing `|| echo "unknown"` on the pipeline would only
# react to sed's exit status, which is 0 even when grep finds nothing.)
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -nE 's/.*:([0-9]+\.[0-9]+).*/\1/p')
if [ -z "${ROCM_VERSION}" ]; then
    ROCM_VERSION="unknown"
fi
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

# TODO: Enable the nightly build for ROCm
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then
    RELEASE_VERSION="1.0.0.dev"
fi

# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"

# Format ROCm version for path by removing the dots (e.g., "7.1" -> "rocm71")
# NOTE(review): this comment previously claimed "7.1" -> "rocm710", which
# stripping dots does not produce; confirm which form the upload step uses.
ROCM_VERSION_PATH="rocm${ROCM_VERSION//./}"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases
### Build Configuration
| Setting | Value |
|---------|-------|
| **ROCm Version** | ${ROCM_VERSION} |
| **Python Version** | ${PYTHON_VERSION} |
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
| **Branch** | \`${BUILDKITE_BRANCH}\` |
| **Commit** | \`${BUILDKITE_COMMIT}\` |

### :package: Installation

**Install from this build (by commit):**

\`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com

# Example for ROCm ${ROCM_VERSION}:
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
\`\`\`

**Install from nightly (if published):**

\`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
\`\`\`

### :floppy_disk: Download Wheels Directly

\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
\`\`\`

### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton**: Triton
- **triton-kernels**: Triton kernels
- **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
- **aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm

### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only

### :package: Docker Image Release

To download and upload the image:

\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm

docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base

docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
\`\`\`

EOF
|
|
||||||
@@ -1,140 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
#
|
|
||||||
# Cache helper for ROCm base wheels
|
|
||||||
#
|
|
||||||
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
|
|
||||||
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
|
|
||||||
#
|
|
||||||
# Usage:
|
|
||||||
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
|
|
||||||
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
|
|
||||||
# cache-rocm-base-wheels.sh download - Download wheels from cache
|
|
||||||
# cache-rocm-base-wheels.sh key - Output the cache key
# cache-rocm-base-wheels.sh path - Output the full S3 cache path
|
|
||||||
#
|
|
||||||
# Environment variables:
|
|
||||||
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
|
|
||||||
# PYTHON_VERSION - Python version (affects cache key)
|
|
||||||
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
|
|
||||||
#
|
|
||||||
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
|
|
||||||
# so changes to ROCm version are captured by the Dockerfile hash.
|
|
||||||
|
|
||||||
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Bucket that stores the cache; S3_BUCKET overrides the default.
BUCKET="${S3_BUCKET:-vllm-wheels}"
# Dockerfile whose content feeds the cache key.
DOCKERFILE="docker/Dockerfile.rocm_base"
# Key prefix inside the bucket under which cache entries live.
CACHE_PREFIX="rocm/cache"
|
|
||||||
|
|
||||||
# Generate hash from Dockerfile content + build args
#
# Prints "<dockerfile_hash>-<args_hash>" on stdout, where dockerfile_hash is
# the first 16 hex characters of the SHA-256 of $DOCKERFILE and args_hash is
# the first 8 hex characters of the SHA-256 of
# "${PYTHON_VERSION}|${PYTORCH_ROCM_ARCH}" followed by a newline.
#
# Globals read: DOCKERFILE, PYTHON_VERSION (optional), PYTORCH_ROCM_ARCH (optional)
# Exits with status 1 (message on stderr) when $DOCKERFILE is not a regular file.
generate_cache_key() {
    # Include Dockerfile content
    if [[ ! -f "$DOCKERFILE" ]]; then
        echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
        exit 1
    fi

    # Declare first and assign afterwards: `local var=$(cmd)` returns the exit
    # status of `local`, which would hide a failing sha256sum from `set -e`.
    local dockerfile_hash args_string args_hash
    dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)

    # Include key build args that affect the output
    # These should match the ARGs in Dockerfile.rocm_base that change the build output
    # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
    args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
    # Keep `echo` (with its trailing newline): the newline is part of the
    # hashed input, so changing it would change every cache key.
    args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)

    echo "${dockerfile_hash}-${args_hash}"
}
|
|
||||||
|
|
||||||
# Resolve the cache location once; every subcommand below uses it.
CACHE_KEY=$(generate_cache_key)
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"

# Dispatch on the first argument; anything unrecognized prints usage and exits 1.
case "${1:-}" in
    check)
        # Diagnostics go to stderr so stdout carries only "hit" or "miss".
        echo "Checking cache for key: ${CACHE_KEY}" >&2
        echo "Cache path: ${CACHE_PATH}" >&2
        echo "Variables used in cache key:" >&2
        echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
        echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2

        # Check if cache exists by listing objects
        # We look for at least one .whl file
        echo "Running: aws s3 ls ${CACHE_PATH}" >&2
        # A failing listing is treated as a miss rather than a script error.
        S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
        echo "S3 ls output:" >&2
        # Here-strings instead of `echo ... |`: with pipefail, a reader that
        # exits early (head, grep -q) can otherwise fail the whole pipeline.
        head -5 <<< "$S3_OUTPUT" >&2

        if grep -q "\.whl" <<< "$S3_OUTPUT"; then
            echo "hit"
        else
            echo "miss"
        fi
        ;;

    upload)
        echo "========================================"
        echo "Uploading wheels to cache"
        echo "========================================"
        echo "Cache key: ${CACHE_KEY}"
        echo "Cache path: ${CACHE_PATH}"
        echo ""

        if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
            echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
            exit 1
        fi

        # `find` exits 0 when nothing matches. The former `ls *.whl | wc -l`
        # failed under `set -e -o pipefail` on zero wheels, aborting the
        # script before the error message below could be printed.
        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' | wc -l)
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
            exit 1
        fi

        echo "Uploading $WHEEL_COUNT wheels..."
        aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"

        echo ""
        echo "Cache upload complete!"
        echo "========================================"
        ;;

    download)
        echo "========================================"
        echo "Downloading wheels from cache"
        echo "========================================"
        echo "Cache key: ${CACHE_KEY}"
        echo "Cache path: ${CACHE_PATH}"
        echo ""

        mkdir -p artifacts/rocm-base-wheels
        aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/

        echo ""
        echo "Downloaded wheels:"
        ls -lh artifacts/rocm-base-wheels/

        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' | wc -l)
        # An empty download still ends in a non-zero exit, as it did before
        # (then as a side effect of the failing `ls`), but now says why.
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/ after download" >&2
            exit 1
        fi
        echo ""
        echo "Total: $WHEEL_COUNT wheels"
        echo "========================================"
        ;;

    key)
        echo "${CACHE_KEY}"
        ;;

    path)
        echo "${CACHE_PATH}"
        ;;

    *)
        echo "Usage: $0 {check|upload|download|key|path}" >&2
        echo "" >&2
        echo "Commands:" >&2
        echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
        echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
        echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
        echo " key - Output the cache key" >&2
        echo " path - Output the full S3 cache path" >&2
        exit 1
        ;;
esac
|
|
||||||
@@ -1,242 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
#
|
|
||||||
# cherry-pick-from-milestone.sh
|
|
||||||
# Find commits from a GitHub milestone that are missing from the current branch
|
|
||||||
# and output them in chronological order for cherry-picking.
|
|
||||||
#
|
|
||||||
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
|
|
||||||
#
|
|
||||||
|
|
||||||
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ANSI color escape sequences for terminal output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
|
|
||||||
|
|
||||||
# Print the help text to stdout and exit.
#
# Arguments:
#   $1 - exit status to use (optional, default 1). The default keeps the
#        error-path behavior for callers that pass nothing; an explicit
#        help request can pass 0.
usage() {
    cat <<EOF
Usage: $(basename "$0") <milestone> [options]

Find commits from a GitHub milestone that need to be cherry-picked into the current branch.

Arguments:
milestone The GitHub milestone name (e.g., v0.14.0)

Options:
--dry-run Show the cherry-pick commands without executing (default)
--execute Actually execute the cherry-picks
--main-branch Specify the main branch name (default: main)
--help Show this help message

Examples:
$(basename "$0") v0.14.0
$(basename "$0") v0.14.0 --dry-run
$(basename "$0") v0.14.0 --execute
$(basename "$0") v0.14.0 --main-branch master
EOF
    exit "${1:-1}"
}
|
|
||||||
|
|
||||||
# Print "$1" to stdout behind a blue [INFO] tag.
# Only the color variables go through %b; the message is printed with %s so
# backslashes in it (e.g. from a PR title) are not treated as escapes.
log_info() {
    printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}
|
|
||||||
|
|
||||||
# Print "$1" to stdout behind a green [OK] tag.
# Only the color variables go through %b; the message is printed with %s so
# backslashes in it (e.g. from a PR title) are not treated as escapes.
log_success() {
    printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$1"
}
|
|
||||||
|
|
||||||
# Print "$1" to stdout behind a yellow [WARN] tag.
# Only the color variables go through %b; the message is printed with %s so
# backslashes in it (e.g. from a PR title) are not treated as escapes.
log_warn() {
    printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"
}
|
|
||||||
|
|
||||||
# Print "$1" to stderr behind a red [ERROR] tag.
# Only the color variables go through %b; the message is printed with %s so
# backslashes in it are not treated as escapes.
log_error() {
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}
|
|
||||||
|
|
||||||
# Default values
MILESTONE=""
DRY_RUN=true
MAIN_BRANCH="main"

# Parse arguments: the first positional argument is the milestone; options
# may appear before or after it.
while [[ $# -gt 0 ]]; do
    case $1 in
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --execute)
            DRY_RUN=false
            shift
            ;;
        --main-branch)
            # Without this guard, a trailing --main-branch would trip
            # `set -u` on "$2" and die with an unbound-variable error.
            if [[ $# -lt 2 ]]; then
                log_error "Option --main-branch requires a value"
                usage
            fi
            MAIN_BRANCH="$2"
            shift 2
            ;;
        --help|-h)
            # Ask for exit status 0: an explicit help request is not an
            # error. A usage() that ignores its argument keeps its own status.
            usage 0
            ;;
        -*)
            log_error "Unknown option: $1"
            usage
            ;;
        *)
            if [[ -z "$MILESTONE" ]]; then
                MILESTONE="$1"
            else
                log_error "Unexpected argument: $1"
                usage
            fi
            shift
            ;;
    esac
done

# Validate milestone argument
if [[ -z "$MILESTONE" ]]; then
    log_error "Milestone is required"
    usage
fi
|
|
||||||
|
|
||||||
# Preflight checks: each one logs an error and exits 1 on failure.

# Must be run from inside a git work tree
git rev-parse --is-inside-work-tree &>/dev/null || {
    log_error "Not in a git repository"
    exit 1
}

# The GitHub CLI must be on PATH
command -v gh &>/dev/null || {
    log_error "GitHub CLI (gh) is not installed"
    exit 1
}

# The GitHub CLI must be logged in
gh auth status &>/dev/null || {
    log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
    exit 1
}

# Report the context the rest of the script works in
CURRENT_BRANCH=$(git branch --show-current)
log_info "Current branch: ${CURRENT_BRANCH}"
log_info "Main branch: ${MAIN_BRANCH}"
log_info "Milestone: ${MILESTONE}"
echo ""
|
|
||||||
|
|
||||||
# Fetch latest from remote
log_info "Fetching latest from remote..."
git fetch origin "$MAIN_BRANCH" --quiet

# Get merged PRs from the milestone, sorted by merge date
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."

# Store PR data in a temp file: one line per PR,
# "<merge commit sha>\t<PR number>\t<title>", oldest merge first.
PR_DATA=$(mktemp)
# Single quotes defer expansion until the trap runs, and the inner double
# quotes keep a temp path containing spaces or glob characters intact.
trap 'rm -f -- "$PR_DATA"' EXIT

if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
    --limit 1000 \
    --json number,title,mergeCommit,mergedAt \
    --jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
    log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
    log_error "This could be due to:"
    log_error " - Milestone does not exist"
    log_error " - Network/authentication issues"
    log_error " - Invalid milestone name format"
    exit 1
fi

# An empty result is not an error: there is simply nothing to cherry-pick.
if [[ ! -s "$PR_DATA" ]]; then
    log_warn "No merged PRs found for milestone '${MILESTONE}'"
    exit 0
fi

TOTAL_PRS=$(wc -l < "$PR_DATA")
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
echo ""
|
|
||||||
|
|
||||||
# Collect the merge commits that HEAD does not contain yet.
# MISSING_COMMITS holds the SHAs; MISSING_INFO holds a matching
# "<sha> PR #<number>: <title>" description at the same index.
MISSING_COMMITS=()
MISSING_INFO=()

while IFS=$'\t' read -r merge_sha pr_number pr_title; do
    # A PR without a merge commit SHA cannot be cherry-picked
    if [[ -z "$merge_sha" || "$merge_sha" == "null" ]]; then
        log_warn "PR #${pr_number} has no merge commit SHA, skipping"
        continue
    fi

    # Already reachable from HEAD: nothing to do for this PR
    if git merge-base --is-ancestor "$merge_sha" HEAD 2>/dev/null; then
        log_success "PR #${pr_number} already in branch: ${pr_title:0:60}"
        continue
    fi

    log_warn "PR #${pr_number} MISSING: ${pr_title:0:60}"
    MISSING_COMMITS+=("$merge_sha")
    MISSING_INFO+=("$merge_sha PR #${pr_number}: ${pr_title}")
done < "$PR_DATA"

echo ""

if (( ${#MISSING_COMMITS[@]} == 0 )); then
    log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
    exit 0
fi

log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
echo ""
|
|
||||||
|
|
||||||
# Print the cherry-pick plan: one description per missing commit, then the
# equivalent git commands (all at once, and one per commit).
echo "=========================================="
echo "Cherry-pick commands (in chronological order):"
echo "=========================================="
echo ""

printf '# %s\n' "${MISSING_INFO[@]}"
echo ""

echo "# Run these commands to cherry-pick all missing commits:"
echo "git cherry-pick ${MISSING_COMMITS[*]}"
echo ""

echo "# Or cherry-pick one at a time:"
printf 'git cherry-pick %s\n' "${MISSING_COMMITS[@]}"
echo ""

# Dry run (the default) stops after printing the plan
if [[ "$DRY_RUN" != false ]]; then
    echo "=========================================="
    echo -e "${YELLOW}Dry run mode - no changes made${NC}"
    echo "Run with --execute to perform the cherry-picks"
    echo "=========================================="
else
    echo "=========================================="
    log_info "Executing cherry-picks..."
    echo "=========================================="

    # Walk both arrays by index so each SHA is logged with its description;
    # stop at the first failure and leave the conflict for the user.
    for i in "${!MISSING_COMMITS[@]}"; do
        sha="${MISSING_COMMITS[$i]}"
        info="${MISSING_INFO[$i]}"

        echo ""
        log_info "Cherry-picking: $info"

        if ! git cherry-pick "$sha"; then
            log_error "Failed to cherry-pick $sha"
            log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
            exit 1
        fi
        log_success "Successfully cherry-picked $sha"
    done

    echo ""
    log_success "All cherry-picks completed successfully!"
fi
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# The file is edited in place.

# The log file path is required as the first argument
[ $# -ge 1 ] || {
    echo "Usage: $0 ci.log"
    exit 1
}

INPUT_FILE="$1"

# Remove a leading "[YYYY-MM-DDTHH:MM:SSZ] " timestamp from each line
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"

# Remove ESC[...m and ESC[...K terminal escape sequences anywhere in a line
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
|
|
||||||
@@ -1,127 +0,0 @@
|
|||||||
#!/bin/bash

set -ex

# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"

# Get tag prefix from argument, default to "nightly-" if not provided
TAG_PREFIX="${1:-nightly-}"

echo "Cleaning up tags with prefix: $TAG_PREFIX"

# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
fi

if [ -z "$DOCKERHUB_USERNAME" ]; then
    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
    exit 1
fi

# Get DockerHub bearer token
echo "Getting DockerHub bearer token..."
# Tracing is switched off around the login so credentials and the token do
# not end up in the xtrace output.
set +x
# jq builds the request body so quotes or backslashes in the credentials are
# JSON-escaped; curl reads the body from stdin (-d @-), which also keeps the
# credentials out of curl's argument list.
BEARER_TOKEN=$(jq -nc \
    --arg username "$DOCKERHUB_USERNAME" \
    --arg password "$DOCKERHUB_TOKEN" \
    '{username: $username, password: $password}' \
    | curl -s -X POST \
        -H "Content-Type: application/json" \
        -d @- \
        "https://hub.docker.com/v2/users/login" | jq -r '.token')
set -x

# jq -r prints the literal string "null" when the response has no .token
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
    echo "Error: Failed to get DockerHub bearer token"
    exit 1
fi
|
|
||||||
|
|
||||||
# Function to get all tags from DockerHub
#
# Pages through $REPO_API_URL (100 tags per page) and prints, one per line,
# the names of all tags starting with $TAG_PREFIX, ordered by their
# last_updated timestamp, newest first.
#
# Globals read: BEARER_TOKEN, REPO_API_URL, TAG_PREFIX
get_all_tags() {
    local page=1
    local all_tags=""
    # Declared separately from the assignments below so that the `|| ...`
    # fallbacks, not `local`, decide how a failing command is handled.
    local response result_count tags

    while true; do
        # Keep the bearer token out of the xtrace output
        set +x
        response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100") || true
        set -x

        # Stop only when a page carries no results at all (past the last page,
        # or an error payload without .results). Stopping at the first page
        # without a prefix match would miss matching tags on later pages.
        result_count=$(echo "$response" | jq -r '(.results // []) | length' 2>/dev/null) || result_count=0
        if [ -z "$result_count" ] || [ "$result_count" -eq 0 ]; then
            break
        fi

        # Get both last_updated timestamp and tag name, separated by |
        tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"') || tags=""

        if [ -n "$tags" ]; then
            all_tags="$all_tags$tags"$'\n'
        fi
        page=$((page + 1))
    done

    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
}
|
|
||||||
|
|
||||||
# Delete one tag from the vllm/vllm-openai DockerHub repository.
#
# Arguments:
#   $1 - tag name
# Globals read: BEARER_TOKEN, REPO_API_URL
#
# Best effort: a failed delete prints a warning and the function still
# returns normally, so the caller can continue with the next tag.
delete_tag() {
    local tag_name="$1"
    echo "Deleting tag: $tag_name"

    local delete_url="${REPO_API_URL}/${tag_name}"
    local response http_code body
    # Keep the bearer token out of the xtrace output
    set +x
    # -w appends the HTTP status on its own last line so an empty body (for
    # example from a failed connection) is no longer mistaken for success.
    response=$(curl -s -w '\n%{http_code}' -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") || true
    set -x

    http_code="${response##*$'\n'}"
    body="${response%$'\n'*}"

    # Success needs a 2xx status and no .detail field in the body
    if [[ "$http_code" =~ ^2[0-9][0-9]$ ]] && ! echo "$body" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Successfully deleted tag: $tag_name"
    else
        echo "Warning: Failed to delete tag $tag_name (HTTP ${http_code:-unknown}): $(echo "$body" | jq -r '.detail // empty' 2>/dev/null)"
    fi
}
|
|
||||||
|
|
||||||
# Fetch the prefix-matching tags; get_all_tags prints them newest first
echo "Fetching all tags from DockerHub..."
all_tags=$(get_all_tags)

if [ -z "$all_tags" ]; then
    echo "No tags found to clean up"
    exit 0
fi

# One tag per line, so the line count is the tag count
total_tags=$(wc -l <<< "$all_tags")
echo "Found $total_tags tags"

# Keep only the last 14 builds (including the current one)
tags_to_keep=14
tags_to_delete=$((total_tags - tags_to_keep))

if (( tags_to_delete <= 0 )); then
    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
    exit 0
fi

echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"

# Everything after the first $tags_to_keep lines is old enough to delete
tags_to_delete_list=$(tail -n "+$((tags_to_keep + 1))" <<< "$all_tags")

if [ -z "$tags_to_delete_list" ]; then
    echo "No tags to delete"
    exit 0
fi

# Delete old tags
echo "Deleting old tags..."
while IFS= read -r tag; do
    # Skip blank lines
    [ -n "$tag" ] || continue
    delete_tag "$tag"
    # Add a small delay to avoid rate limiting
    sleep 1
done <<< "$tags_to_delete_list"

echo "Cleanup completed successfully"
|
|
||||||
@@ -1,468 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
# do not complain about line length (for docstring)
|
|
||||||
# ruff: noqa: E501
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from dataclasses import asdict, dataclass
|
|
||||||
from datetime import datetime
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
from urllib.parse import quote
|
|
||||||
|
|
||||||
import regex as re
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_package_name(name: str) -> str:
    """
    Return the PEP 503 normalized form of a package name.
    https://peps.python.org/pep-0503/#normalized-names

    The name is lowercased and every run of hyphens, underscores, and periods
    is collapsed into a single hyphen.
    """
    lowered = name.lower()
    return re.sub(r"[-_.]+", "-", lowered)
|
|
||||||
|
|
||||||
|
|
||||||
if not sys.version_info >= (3, 12):
|
|
||||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
|
||||||
|
|
||||||
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<!-- {comment} -->
|
|
||||||
<meta name="pypi:repository-version" content="1.0">
|
|
||||||
<body>
|
|
||||||
{items}
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class WheelFileInfo:
|
|
||||||
package_name: str
|
|
||||||
version: str
|
|
||||||
build_tag: str | None
|
|
||||||
python_tag: str
|
|
||||||
abi_tag: str
|
|
||||||
platform_tag: str
|
|
||||||
variant: str | None
|
|
||||||
filename: str
|
|
||||||
|
|
||||||
|
|
||||||
def parse_from_filename(file: str) -> WheelFileInfo:
|
|
||||||
"""
|
|
||||||
Parse wheel file name to extract metadata.
|
|
||||||
|
|
||||||
The format of wheel names:
|
|
||||||
{package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
|
|
||||||
All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
|
|
||||||
Example:
|
|
||||||
vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
|
|
||||||
vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
|
|
||||||
vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
|
|
||||||
vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
|
|
||||||
"""
|
|
||||||
wheel_file_re = re.compile(
|
|
||||||
r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
|
|
||||||
)
|
|
||||||
match = wheel_file_re.match(file)
|
|
||||||
if not match:
|
|
||||||
raise ValueError(f"Invalid wheel file name: {file}")
|
|
||||||
|
|
||||||
package_name = match.group("package_name")
|
|
||||||
version = match.group("version")
|
|
||||||
build_tag = match.group("build_tag")
|
|
||||||
python_tag = match.group("python_tag")
|
|
||||||
abi_tag = match.group("abi_tag")
|
|
||||||
platform_tag = match.group("platform_tag")
|
|
||||||
|
|
||||||
# extract variant from version
|
|
||||||
variant = None
|
|
||||||
if "dev" in version:
|
|
||||||
ver_after_dev = version.split("dev")[-1]
|
|
||||||
if "." in ver_after_dev:
|
|
||||||
variant = ver_after_dev.split(".")[-1]
|
|
||||||
version = version.removesuffix("." + variant)
|
|
||||||
else:
|
|
||||||
if "+" in version:
|
|
||||||
version_part, suffix = version.split("+", 1)
|
|
||||||
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
|
|
||||||
# Git hashes and other suffixes are NOT variants
|
|
||||||
if suffix.startswith(("rocm", "cu", "cpu")):
|
|
||||||
variant = suffix
|
|
||||||
version = version_part
|
|
||||||
# Otherwise keep the full version string (variant stays None)
|
|
||||||
|
|
||||||
return WheelFileInfo(
|
|
||||||
package_name=package_name,
|
|
||||||
version=version,
|
|
||||||
build_tag=build_tag,
|
|
||||||
python_tag=python_tag,
|
|
||||||
abi_tag=abi_tag,
|
|
||||||
platform_tag=platform_tag,
|
|
||||||
variant=variant,
|
|
||||||
filename=file,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Build the project list HTML page, with one link per subdirectory name.

    Names are sorted as given, then stripped of surrounding "/" and, after
    that, of surrounding "."; each becomes a relative `name/` link. The result
    is INDEX_HTML_TEMPLATE filled with those links and `comment`.
    """
    cleaned_names = (raw.strip("/").strip(".") for raw in sorted(subdir_names))
    items = "\n".join(
        f' <a href="{name}/">{name}/</a><br/>' for name in cleaned_names
    )
    return INDEX_HTML_TEMPLATE.format(items=items, comment=comment)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Build the index page and the JSON metadata for one package.

    Wheels are processed in file name order. Each link points at the wheel via
    the path of `wheel_base_dir` relative to `index_base_dir` (walking up where
    needed), percent-encoded.

    Returns:
        A pair `(index_html, metadata_json)`: INDEX_HTML_TEMPLATE filled with
        the links and `comment`, and a JSON array (indent 2) holding every
        wheel's fields plus a "path" entry with the encoded relative path.
    """
    links = []
    records = []
    ordered_wheels = sorted(wheel_files, key=lambda x: x.filename)
    for wheel in ordered_wheels:
        base_to_wheels = wheel_base_dir.relative_to(index_base_dir, walk_up=True)
        wheel_path = base_to_wheels / wheel.filename
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        encoded_path = quote(wheel_path.as_posix(), safe=":%/")
        links.append(f' <a href="{encoded_path}">{wheel.filename}</a><br/>')

        record = asdict(wheel)
        record["path"] = encoded_path
        records.append(record)

    index_html = INDEX_HTML_TEMPLATE.format(items="\n".join(links), comment=comment)
    return index_html, json.dumps(records, indent=2)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
) -> None:
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str): Optional comment to include in the generated HTML files.

    Raises:
        ValueError: If `default_variant` is given and some wheel has no variant
            suffix, or `default_variant` is not among the parsed variants; or if
            `alias_to_default` collides with an existing variant name.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
    Wheels without a variant suffix form the "default" variant, whose index is generated
    in the root index directory.

    If a `vllm` wheel carries a variant starting with "rocm", every wheel without a
    variant inherits that variant.

    If `default_variant` is provided, all wheels must have variant suffixes and that
    variant must exist.
    NOTE(review): this function only validates `default_variant`; it does not itself
    write a copy of that variant's index into the root directory.

    If `alias_to_default` is provided and default-variant wheels exist, an additional
    alias subdirectory is created with the same wheels as the default variant, with the
    links adjusted accordingly.

    Index directory structure:
        index_base_dir/  (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html  # project list, linking to "vllm/" and other packages, and all variant subdirectories
            vllm/
                index.html  # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json  # machine-readable metadata for all wheels in this package
            cpu/  # cpu variant subdirectory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/  # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/  # cu130 variant subdirectory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # to be concatenated with the directory URL and URL-encoded
            },
            ...
        ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # For ROCm builds: inherit variant from vllm wheel.
    # All ROCm wheels should share the same variant as vllm.
    rocm_variant = next(
        (
            f.variant
            for f in parsed_files
            if f.package_name == "vllm" and f.variant and f.variant.startswith("rocm")
        ),
        None,
    )

    # Apply ROCm variant to all wheels without a variant
    if rocm_variant:
        print(f"Detected ROCm variant from vllm: {rocm_variant}")
        for file in parsed_files:
            if file.variant is None:
                file.variant = rocm_variant
                print(f"Inherited variant '{rocm_variant}' for {file.filename}")

    # Group by variant ("default" collects wheels without a variant suffix)
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant_to_files.setdefault(file.variant or "default", []).append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate comment in HTML header (shared by every generated HTML page)
    comment_str = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"

    # Generate index for each variant
    subdir_names: set[str] = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant (normalized per PEP 503)
        packages = {normalize_package_name(f.package_name) for f in files}
        if variant == "default":
            # these packages should also appear in the "project list",
            # which is generated after all variants are processed
            subdir_names |= packages
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)

        # sorted() keeps the write order deterministic across runs
        for package in sorted(packages):
            # filter files belonging to this package only (compare normalized names)
            package_files = [
                f for f in files if normalize_package_name(f.package_name) == package
            ]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            # Use the same "Generated on ..." header as the project lists so
            # that all generated pages carry a consistent comment.
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir, comment_tmpl
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line arguments:
    #   --version <version>                      version string for the current build (e.g., commit hash)
    #   --wheel-dir <wheel_directory>            directory containing wheel files (default to be same as `version`)
    #   --current-objects <path_to_json>         path to JSON file containing current S3 objects listing in this version directory
    #   --output-dir <output_directory>          directory to store generated index files
    #   --alias-to-default <alias_variant_name>  (optional) alias variant name for the default variant
    #   --comment <comment_string>               (optional) comment string to include in generated HTML files

    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    # Registration order is kept so that --help lists the options in the same order.
    for flag, options in (
        (
            "--version",
            dict(
                type=str,
                required=True,
                help="Version string for the current build (e.g., commit hash)",
            ),
        ),
        (
            "--current-objects",
            dict(
                type=str,
                required=True,
                help="Path to JSON file containing current S3 objects listing in this version directory",
            ),
        ),
        (
            "--output-dir",
            dict(
                type=str,
                required=True,
                help="Directory to store generated index files",
            ),
        ),
        (
            "--wheel-dir",
            dict(
                type=str,
                default=None,
                help="Directory containing wheel files (default to be same as `version`)",
            ),
        ),
        (
            "--alias-to-default",
            dict(
                type=str,
                default=None,
                help="Alias variant name for the default variant",
            ),
        ),
        (
            "--comment",
            dict(
                type=str,
                default="",
                help="Optional comment string to include in generated HTML files",
            ),
        ),
    ):
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    version = args.version
    # Backslashes are never allowed; a slash is only allowed via the "rocm/" prefix.
    if "\\" in version:
        raise ValueError("Version string must not contain backslashes.")
    if not version.startswith("rocm/") and "/" in version:
        raise ValueError(
            "Version string must not contain slashes (except for 'rocm/' prefix)."
        )

    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    # Load the S3 listing. It has the shape returned by the list_objects_v2 API:
    #   "Contents": [
    #     {
    #       "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
    #       "LastModified": "2025-11-28T14:00:32+00:00",
    #       "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
    #       "ChecksumAlgorithm": ["CRC64NVME"],
    #       "ChecksumType": "FULL_OBJECT",
    #       "Size": 435649349,
    #       "StorageClass": "STANDARD"
    #     },
    #     ...
    #   ]
    with open(current_objects_path) as listing:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(listing)

    # Keep only the file name of every ".whl" key in the listing.
    wheel_files = [
        entry["Key"].split("/")[-1]
        for entry in current_objects.get("Contents", [])
        if entry["Key"].endswith(".whl")
    ]

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # keep only "official" files for a non-nightly version (specified by cli args)
    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
    if not PY_VERSION_RE.match(version):
        print("Nightly version detected, keeping all wheel files.")
    else:
        # upload-wheels.sh ensures no "dev" is in args.version
        wheel_files = [
            name for name in wheel_files if version in name and "dev" not in name
        ]
        print(f"Non-nightly version detected, wheel files used: {wheel_files}")

    # Generate index and metadata, assuming wheels and indices are stored as:
    #   s3://vllm-wheels/{wheel_dir}/<wheel files>
    #   s3://vllm-wheels/<anything>/<index files>
    #
    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
    #   - rocm/{commit}/  (same as wheels)
    #   - rocm/nightly/
    #   - rocm/{version}/
    # All these are under the "rocm/" prefix, so relative paths should be
    # relative to "rocm/", not the bucket root.
    if args.wheel_dir:
        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
        wheel_dir = args.wheel_dir.strip().rstrip("/")
    elif not version.startswith("rocm/"):
        wheel_dir = version
    else:
        # For rocm/commit, wheel_base_dir should be just the commit part
        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
        wheel_dir = version.split("/", 1)[1]

    index_base_dir = Path(output_dir)
    wheel_base_dir = Path(output_dir).parent / wheel_dir

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
        comment=args.comment.strip(),
    )
    print(f"Successfully generated index and metadata in {output_dir}")
|
|
||||||
@@ -1,284 +0,0 @@
|
|||||||
#!/bin/bash

# Runs the given test commands inside the corresponding ROCm docker container.
set -o pipefail

# Python path used by this script's own environment.
PYTHONPATH=".."
export PYTHONPATH

# Poll the GPU state file every 3 seconds until it reports "clean".
echo "--- Confirming Clean Initial State"
until { sleep 3; grep -q clean /opt/amdgpu/etc/gpu_state; }; do
  :
done
echo "GPUs state is \"clean\""

# Print ROCm device information.
echo "--- ROCm info"
rocminfo
|
|
||||||
|
|
||||||
# cleanup older docker images
|
|
||||||
cleanup_docker() {
  # Prune docker images and volumes when the filesystem that holds Docker's
  # root directory is more than $threshold percent full.
  threshold=70

  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Use% column of the last line df prints for that filesystem, without the "%".
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  if ! [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is below $threshold%. No cleanup needed."
    return
  fi

  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Dangling images: untagged and not used by any container.
  docker image prune -f
  # Unused volumes, then a forced system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
}
|
|
||||||
|
|
||||||
cleanup_network() {
  # Stop leftover multi-node containers (node0, node1, ...) and remove the
  # "docker-net" network if it exists.
  #
  # NUM_NODES is not assigned anywhere in this script; when it is unset, fall
  # back to 2, the node count this script passes to run-multi-node-test.sh.
  # Without a fallback the loop range is empty and nothing is ever stopped.
  local num_nodes="${NUM_NODES:-2}"
  local node
  for node in $(seq 0 $((num_nodes - 1))); do
    # "docker ps -a -q -f name=..." prints the IDs of matching containers
    # (running or stopped); grep -q . succeeds when at least one was printed.
    if docker ps -a -q -f name="node${node}" | grep -q .; then
      docker stop "node${node}"
    fi
  done
  if docker network ls | grep docker-net; then
    docker network rm docker-net
  fi
}
|
|
||||||
|
|
||||||
# Free disk space (if needed) before pulling the new image.
cleanup_docker

echo "--- Resetting GPUs"

# Request a reset through the state file, then poll every 3 seconds until
# the file reports "clean" again.
printf '%s\n' "reset" > /opt/amdgpu/etc/gpu_state

until { sleep 3; grep -q clean /opt/amdgpu/etc/gpu_state; }; do
  :
done
echo "GPUs state is \"clean\""

echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
# A 10-character random alphanumeric suffix keeps the container name unique.
name_suffix=$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10)
container_name="rocm_${BUILDKITE_COMMIT}_${name_suffix}"
docker pull "${image_name}"
|
|
||||||
|
|
||||||
remove_docker_container() {
  # Force-remove the test container. Only when that fails, try to remove the
  # image instead. Failures are ignored so the EXIT trap never fails.
  if ! docker rm -f "${container_name}"; then
    docker image rm -f "${image_name}" || true
  fi
}
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
echo "--- Running container"

# Host-side Hugging Face cache directory and its mount point in the container.
HF_MOUNT="/root/.cache/huggingface"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"

# All script arguments, joined by spaces, form the command line to run.
commands="$*"
echo "Commands:$commands"

# Append one "--ignore=<dir>/<file>" flag per listed file to $commands.
append_ignores() {
  local dir=$1 file
  shift
  for file in "$@"; do
    commands+=" --ignore=${dir}/${file}"
  done
}

# Print "--ignore=<dir>/<file> " (note the trailing space) for every listed file.
ignore_flags() {
  local dir=$1 file
  shift
  for file in "$@"; do
    printf -- '--ignore=%s/%s ' "$dir" "$file"
  done
}

# NOTE: pattern and replacement are identical here, so this substitution
# currently leaves $commands unchanged.
basic_correctness_cmd="pytest -v -s basic_correctness/test_basic_correctness.py"
commands=${commands//"$basic_correctness_cmd"/"$basic_correctness_cmd"}

# Deselect some model architectures in the registry test.
registry_cmd="pytest -v -s models/test_registry.py"
if [[ $commands == *"$registry_cmd"* ]]; then
  commands=${commands//"$registry_cmd"/"$registry_cmd -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi

# NOTE: identity substitution as well (pattern equals replacement).
compile_cmd="pytest -v -s compile/test_basic_correctness.py"
commands=${commands//"$compile_cmd"/"$compile_cmd"}

# LoRA tests run with VLLM_ROCM_CUSTOM_PAGED_ATTN=0.
if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi

#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
  append_ignores kernels/core \
    test_fused_quant_layernorm.py \
    test_permute_cols.py
fi

if [[ $commands == *" kernels/attention"* ]]; then
  append_ignores kernels/attention \
    test_attention_selector.py \
    test_encoder_decoder_attn.py \
    test_flash_attn.py \
    test_flashinfer.py \
    test_prefix_prefill.py \
    test_cascade_flash_attn.py \
    test_mha_attn.py \
    test_lightning_attn.py \
    test_attention.py
fi

if [[ $commands == *" kernels/quantization"* ]]; then
  append_ignores kernels/quantization \
    test_int8_quant.py \
    test_machete_mm.py \
    test_block_fp8.py \
    test_block_int8.py \
    test_marlin_gemm.py \
    test_cutlass_scaled_mm.py \
    test_int8_kernel.py
fi

if [[ $commands == *" kernels/mamba"* ]]; then
  append_ignores kernels/mamba \
    test_mamba_mixer2.py \
    test_causal_conv1d.py \
    test_mamba_ssm_ssd.py
fi

if [[ $commands == *" kernels/moe"* ]]; then
  append_ignores kernels/moe \
    test_moe.py \
    test_cutlass_moe.py \
    test_triton_moe_ptpc_fp8.py
fi

#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
  openai_ignores=$(ignore_flags entrypoints/openai \
    test_audio.py \
    test_shutdown.py \
    test_completion.py \
    test_models.py \
    test_lora_adapters.py \
    test_return_tokens_as_ids.py \
    test_root_path.py \
    test_tokenization.py \
    test_prompt_validation.py; echo x)
  # The "x" sentinel protects the trailing space from $(...) stripping.
  openai_ignores=${openai_ignores%x}
  commands=${commands//" entrypoints/openai "/" entrypoints/openai ${openai_ignores}"}
fi

#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
  llm_ignores=$(ignore_flags entrypoints/llm \
    test_chat.py \
    test_accuracy.py \
    test_init.py \
    test_prompt_validation.py; echo x)
  llm_ignores=${llm_ignores%x}
  commands=${commands//" entrypoints/llm "/" entrypoints/llm ${llm_ignores}"}
fi

# Further ignore flags, currently not applied:
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13


PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# Test that we're launching on the machine that has
# proper access to GPUs: the numeric id of the "render" group is required.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi
|
|
||||||
|
|
||||||
# Dispatch on the shape of $commands:
#   1. contains "--shard-id=": run PARALLEL_JOB_COUNT shards in parallel, one
#      container per GPU (the host has 8 GPUs);
#   2. the 2-node test group: hand the per-node commands to
#      run-multi-node-test.sh;
#   3. anything else: run the commands in a single container.
if [[ $commands == *"--shard-id="* ]]; then
  # Start from empty arrays so nothing inherited from the environment
  # ends up in the wait/status loops below.
  PIDS=()
  STATUS=()
  # assign job count as the number of shards used
  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $((PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    # Each shard runs in the background with its output prefixed by the shard
    # number. With pipefail set, the pipeline status waited on below is
    # non-zero when docker run fails.
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      -e "PYTHONPATH=${MYPYTHONPATH}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  at_least_one_shard_with_tests=0
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    elif [[ ${st} -eq 5 ]]; then
      echo "Shard exited with status 5 (no tests collected) - treating as success"
    else # This means st is 0
      at_least_one_shard_with_tests=1
    fi
  done
  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
    echo "All shards reported no tests collected. Failing the build."
    exit 1
  fi

elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then

  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')

  # Expected shape: <prefix>[<node0 cmds>] && [<node1 cmds>], with the
  # commands inside each bracket pair separated by commas.
  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
    prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
    echo "PREFIX: ${prefix}"
    export composite_command="(command rocm-smi || true)"
    # Split on commas; IFS is scoped to each read, and the here-strings are
    # quoted so the text reaches read unmodified.
    IFS=',' read -ra node0 <<< "${BASH_REMATCH[2]}"
    IFS=',' read -ra node1 <<< "${BASH_REMATCH[3]}"
    for i in "${!node0[@]}";do
      command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
      command_node_1=$(echo ${node1[i]} | sed 's/\"//g')

      export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
      echo "COMMANDS: ${commands}"
      composite_command=$(echo "${composite_command} && ${commands}")
    done
    /bin/bash -c "${composite_command}"
    # Remember the test result: cleanup_network would otherwise overwrite it
    # and the script would report success even when the tests failed.
    exit_code=$?
    cleanup_network
    exit "${exit_code}"
  else
    echo "Failed to parse node commands! Exiting."
    cleanup_network
    exit 111
  fi
else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/bin/bash

# Builds the CPU docker image, then runs offline inference, a few model and
# kernel tests and a short online-serving benchmark inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Core ranges can be overridden from the environment to bind to other cores.
: "${CORE_RANGE:=0-16}"
: "${OMP_CORE_RANGE:=0-16}"

export CMAKE_BUILD_PARALLEL_LEVEL=16

# Cleanup: force-remove the test container; a missing container is not an error.
remove_docker_container() {
  set -e
  docker rm -f cpu-test || true
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .

# Start the container detached, with bash as entrypoint, pinned to CORE_RANGE.
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test

function cpu_tests() {
  set -e

  # Run the given script with bash inside the cpu-test container.
  in_container() {
    docker exec cpu-test bash -c "$1"
  }

  in_container "
set -e
pip list"

  # offline inference
  in_container "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run model tests
  in_container "
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"

  # Run kernel tests
  in_container "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"

  # basic online serving: start the server in the background, wait up to 600s
  # for /v1/models to answer, run the benchmark, then terminate the server.
  in_container '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}

# The whole test function runs in a child bash with a hard limit of 2 hours.
export -f cpu_tests
timeout 2h bash -c cpu_tests
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
#!/bin/bash

# This script builds the CPU docker image (ppc64le) and runs offline inference
# plus a few basic model tests inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Setup cleanup: stop and remove the test container (if one was started),
# then prune unused podman data.
remove_docker_container() {
  if [[ -n "$container_id" ]]; then
    podman stop --all -t0
    podman rm -f "$container_id" || true
  fi
  podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)

function cpu_tests() {
  # This function runs in a fresh "bash -c" (see the end of the script), which
  # does not inherit "set -e". Enable it here so that a failing step aborts the
  # run instead of being masked by the exit status of a later step.
  set -e

  # offline inference
  podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME/test_basic.log"

  # Run basic model test
  podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator tblib

# Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME/test_rest.log"
}

# The CPU tests run in a child bash with a hard limit of 120 minutes;
# container_id and cpu_tests are exported so the child can see them.

export container_id
export -f cpu_tests
timeout 120m bash -c cpu_tests
|
|
||||||
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds the CPU docker image and runs the offline inference inside the container.
|
|
||||||
# It serves as a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
|
|
||||||
@@ -1,120 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds the CPU docker image and runs the offline inference inside the container.
|
|
||||||
# It serves as a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Allow binding to different cores
|
|
||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
|
||||||
# used for TP/PP E2E test
|
|
||||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
|
||||||
|
|
||||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
set -e;
|
|
||||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
|
||||||
|
|
||||||
function cpu_tests() {
|
|
||||||
set -e
|
|
||||||
export NUMA_NODE=$2
|
|
||||||
|
|
||||||
# list packages
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
# offline inference
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run kernel tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
|
||||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
|
||||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
|
||||||
|
|
||||||
# Run basic model test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
# Note: disable until supports V1
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
|
||||||
pytest -x -v -s tests/models/multimodal/generation \
|
|
||||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
|
||||||
-m cpu_model"
|
|
||||||
|
|
||||||
# Run compressed-tensor test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
|
||||||
|
|
||||||
# Run AWQ/GPTQ test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/quantization/test_cpu_wna16.py"
|
|
||||||
|
|
||||||
# Run multi-lora tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/lora/test_qwenvl.py"
|
|
||||||
|
|
||||||
# online serving: tp+pp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
|
|
||||||
# online serving: tp+dp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
}
|
|
||||||
|
|
||||||
# All CPU tests are expected to finish in less than 40 mins.
|
|
||||||
export -f cpu_tests
|
|
||||||
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds the GH200 docker image and runs the offline inference inside the container.
|
|
||||||
# It serves as a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
|
|
||||||
python3 use_existing_torch.py
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
DOCKER_BUILDKIT=1 docker build . \
|
|
||||||
--file docker/Dockerfile \
|
|
||||||
--target vllm-openai \
|
|
||||||
--platform "linux/arm64" \
|
|
||||||
-t gh200-test \
|
|
||||||
--build-arg max_jobs=66 \
|
|
||||||
--build-arg nvcc_threads=2 \
|
|
||||||
--build-arg RUN_WHEEL_CHECK=false \
|
|
||||||
--build-arg torch_cuda_arch_list="9.0+PTX"
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() { docker rm -f gh200-test || true; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Run the image and test offline inference
|
|
||||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
|
||||||
'
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds the HPU plugin docker image and runs the vllm-gaudi upstream CI tests inside the container.
|
|
||||||
# It serves as a sanity check for compilation and basic model usage.
|
|
||||||
set -exuo pipefail
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
|
||||||
FROM gaudi-base-image:latest
|
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
ENV no_proxy=localhost,127.0.0.1
|
|
||||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=empty pip install .
|
|
||||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
|
||||||
|
|
||||||
WORKDIR /workspace/
|
|
||||||
|
|
||||||
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
|
|
||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
|
||||||
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
# certain versions of HPU software stack have a bug that can
|
|
||||||
# override the exit code of the script, so we need to use
|
|
||||||
# separate remove_docker_containers and remove_docker_containers_and_exit
|
|
||||||
# functions, while other platforms only need one remove_docker_container
|
|
||||||
# function.
|
|
||||||
EXITCODE=1
|
|
||||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
|
||||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
|
||||||
remove_docker_containers
|
|
||||||
|
|
||||||
echo "Running HPU plugin v1 test"
|
|
||||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
|
||||||
-e HABANA_VISIBLE_DEVICES=all \
|
|
||||||
hpu-plugin-v1-test-env \
|
|
||||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
|
||||||
|
|
||||||
EXITCODE=$?
|
|
||||||
if [ $EXITCODE -eq 0 ]; then
|
|
||||||
echo "Test with basic model passed"
|
|
||||||
else
|
|
||||||
echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
|
|
||||||
fi
|
|
||||||
|
|
||||||
# The trap will handle the container removal and final exit.
|
|
||||||
@@ -1,192 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds the Ascend NPU docker image and runs the OOT platform interface tests inside the container.
|
|
||||||
# It serves as a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Base ubuntu image with basic ascend development libraries and python installed
|
|
||||||
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
|
|
||||||
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
|
|
||||||
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
|
|
||||||
VLLM_ASCEND_TMP_DIR=
|
|
||||||
# Get the test run configuration file from the vllm-ascend repository
|
|
||||||
fetch_vllm_test_cfg() {
|
|
||||||
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
|
|
||||||
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
|
|
||||||
cleanup() {
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
|
|
||||||
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If the file already exists locally, just overwrite it
|
|
||||||
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
|
|
||||||
|
|
||||||
# The EXIT trap is overwritten later in the script. Once execution reaches this point the trap is no
|
|
||||||
# longer needed for error cleanup, so delete the temporary directory manually and clear the trap.
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
trap - EXIT
|
|
||||||
}
|
|
||||||
|
|
||||||
# Checks that the test run configuration file exists locally.
|
|
||||||
# Loads the configuration into the current script environment.
|
|
||||||
get_config() {
|
|
||||||
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
|
|
||||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
source "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# get test running configuration.
|
|
||||||
fetch_vllm_test_cfg
|
|
||||||
get_config
|
|
||||||
# Check if the function call was successful. If not, exit the script.
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
|
|
||||||
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
|
|
||||||
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
|
|
||||||
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
|
||||||
echo "agent_idx: ${agent_idx}"
|
|
||||||
builder_name="cachebuilder${agent_idx}"
|
|
||||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
|
||||||
mkdir -p ${builder_cache_dir}
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
|
||||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
|
||||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
|
||||||
--progress=plain --load -t ${image_name} -f - .
|
|
||||||
FROM ${BASE_IMAGE_NAME}
|
|
||||||
|
|
||||||
# Define environments
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
ENV SOC_VERSION="ascend910b1"
|
|
||||||
|
|
||||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
|
||||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
|
||||||
apt-get update -y && \
|
|
||||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
|
||||||
rm -rf /var/cache/apt/* && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install for pytest to make the docker build cache layer always valid
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install "pytest>=6.0" modelscope
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r requirements/common.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Install vLLM
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
|
||||||
python3 -m pip uninstall -y triton
|
|
||||||
|
|
||||||
# Install vllm-ascend
|
|
||||||
WORKDIR /workspace
|
|
||||||
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
|
|
||||||
ARG VLLM_ASCEND_TAG=main
|
|
||||||
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
|
|
||||||
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
|
|
||||||
|
|
||||||
# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r /workspace/vllm-ascend/requirements.txt
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
|
||||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
|
||||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
|
||||||
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
ENV VLLM_USE_MODELSCOPE=True
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm-ascend
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
|
||||||
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || true;
|
|
||||||
docker image rm -f "${image_name}" || true;
|
|
||||||
docker system prune -f || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
|
||||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
|
||||||
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
|
|
||||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
|
||||||
parse_and_gen_devices() {
|
|
||||||
local input="$1"
|
|
||||||
local index cards_num
|
|
||||||
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
|
|
||||||
index="${BASH_REMATCH[1]}"
|
|
||||||
cards_num="${BASH_REMATCH[2]}"
|
|
||||||
else
|
|
||||||
echo "parse error" >&2
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
local devices=""
|
|
||||||
local i=0
|
|
||||||
while (( i < cards_num )); do
|
|
||||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
|
||||||
devices="$devices --device /dev/davinci${dev_idx}"
|
|
||||||
((i++))
|
|
||||||
done
|
|
||||||
|
|
||||||
# trim leading space
|
|
||||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
|
||||||
# Output devices: assigned to the caller variable
|
|
||||||
printf '%s' "$devices"
|
|
||||||
}
|
|
||||||
|
|
||||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
|
||||||
|
|
||||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
|
||||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
|
||||||
# the hardware plugin vllm-ascend.
|
|
||||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
|
||||||
mkdir -p ${model_cache_dir}
|
|
||||||
docker run \
|
|
||||||
${devices} \
|
|
||||||
--device /dev/davinci_manager \
|
|
||||||
--device /dev/devmm_svm \
|
|
||||||
--device /dev/hisi_hdc \
|
|
||||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
|
||||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
|
||||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
|
||||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
|
||||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
|
||||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
|
||||||
--entrypoint="" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
bash -c '
|
|
||||||
set -e
|
|
||||||
pytest -v -s tests/e2e/vllm_interface/
|
|
||||||
'
|
|
||||||
@@ -1,166 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -xu
|
|
||||||
|
|
||||||
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f tpu-test || true;
|
|
||||||
}
|
|
||||||
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Build the docker image.
|
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
|
||||||
|
|
||||||
# Set up cleanup.
|
|
||||||
cleanup_docker() {
|
|
||||||
# Get Docker's root directory
|
|
||||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
|
||||||
if [ -z "$docker_root" ]; then
|
|
||||||
echo "Failed to determine Docker root directory."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Docker root directory: $docker_root"
|
|
||||||
# Check disk usage of the filesystem where Docker's root directory is located
|
|
||||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
|
||||||
# Define the threshold
|
|
||||||
threshold=70
|
|
||||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
|
||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
|
||||||
docker image prune -f
|
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
|
||||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
|
||||||
echo "Docker images and volumes cleanup completed."
|
|
||||||
else
|
|
||||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
cleanup_docker
|
|
||||||
|
|
||||||
# For HF_TOKEN.
|
|
||||||
source /etc/environment
|
|
||||||
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
|
||||||
vllm-tpu /bin/bash -c '
|
|
||||||
set -e # Exit immediately if a command exits with a non-zero status.
|
|
||||||
set -u # Treat unset variables as an error.
|
|
||||||
|
|
||||||
echo "--- Starting script inside Docker container ---"
|
|
||||||
|
|
||||||
# Create results directory
|
|
||||||
RESULTS_DIR=$(mktemp -d)
|
|
||||||
# If mktemp fails, set -e will cause the script to exit.
|
|
||||||
echo "Results will be stored in: $RESULTS_DIR"
|
|
||||||
|
|
||||||
# Install dependencies
|
|
||||||
echo "--- Installing Python dependencies ---"
|
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
|
||||||
echo "--- Python dependencies installed ---"
|
|
||||||
|
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
|
||||||
export VLLM_XLA_CACHE_PATH=
|
|
||||||
|
|
||||||
echo "--- Hardware Information ---"
|
|
||||||
# tpu-info
|
|
||||||
echo "--- Starting Tests ---"
|
|
||||||
set +e
|
|
||||||
overall_script_exit_code=0
|
|
||||||
|
|
||||||
# --- Test Definitions ---
|
|
||||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
|
||||||
run_test() {
|
|
||||||
local test_num=$1
|
|
||||||
local test_name=$2
|
|
||||||
local test_command=$3
|
|
||||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
|
||||||
local actual_exit_code
|
|
||||||
|
|
||||||
echo "--- TEST_$test_num: Running $test_name ---"
|
|
||||||
|
|
||||||
# Execute the test command.
|
|
||||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
|
||||||
actual_exit_code=$?
|
|
||||||
|
|
||||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
|
||||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
|
||||||
|
|
||||||
if [ "$actual_exit_code" -ne 0 ]; then
|
|
||||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
|
||||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
|
||||||
if [ -f "$log_file" ]; then
|
|
||||||
cat "$log_file" >&2
|
|
||||||
else
|
|
||||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
|
||||||
fi
|
|
||||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
|
||||||
return "$actual_exit_code" # Return the failure code
|
|
||||||
else
|
|
||||||
echo "TEST_$test_num ($test_name) PASSED."
|
|
||||||
return 0 # Return success
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Helper function to call run_test and update the overall script exit code
|
|
||||||
run_and_track_test() {
|
|
||||||
local test_num_arg="$1"
|
|
||||||
local test_name_arg="$2"
|
|
||||||
local test_command_arg="$3"
|
|
||||||
|
|
||||||
# Run the test
|
|
||||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
|
||||||
local test_specific_exit_code=$?
|
|
||||||
|
|
||||||
# If the test failed, set the overall script exit code to 1
|
|
||||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
|
||||||
# No need for extra echo here, run_test already logged the failure.
|
|
||||||
overall_script_exit_code=1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# --- Actual Test Execution ---
|
|
||||||
run_and_track_test 1 "test_struct_output_generate.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
|
||||||
run_and_track_test 2 "test_moe_pallas.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
|
||||||
run_and_track_test 3 "test_lora.py" \
|
|
||||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
|
||||||
run_and_track_test 4 "test_tpu_qkv_linear.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
|
||||||
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
|
||||||
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
|
||||||
run_and_track_test 7 "test_tpu_int8.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
|
|
||||||
|
|
||||||
# After all tests have been attempted, exit with the overall status.
|
|
||||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
|
||||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
|
||||||
else
|
|
||||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
|
||||||
fi
|
|
||||||
exit "$overall_script_exit_code"
|
|
||||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
|
||||||
|
|
||||||
# Capture the exit code of the docker run command
|
|
||||||
DOCKER_RUN_EXIT_CODE=$?
|
|
||||||
|
|
||||||
# The trap will run for cleanup.
|
|
||||||
# Exit the main script with the Docker run command's exit code.
|
|
||||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
|
||||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
|
||||||
exit "$DOCKER_RUN_EXIT_CODE"
|
|
||||||
else
|
|
||||||
echo "Docker run command completed successfully."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
|
||||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
|
||||||
@@ -1,174 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -xu
|
|
||||||
|
|
||||||
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f tpu-test || true;
|
|
||||||
}
|
|
||||||
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Build the docker image.
|
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
|
||||||
|
|
||||||
# Set up cleanup.
|
|
||||||
cleanup_docker() {
|
|
||||||
# Get Docker's root directory
|
|
||||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
|
||||||
if [ -z "$docker_root" ]; then
|
|
||||||
echo "Failed to determine Docker root directory."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Docker root directory: $docker_root"
|
|
||||||
# Check disk usage of the filesystem where Docker's root directory is located
|
|
||||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
|
||||||
# Define the threshold
|
|
||||||
threshold=70
|
|
||||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
|
||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
|
||||||
docker image prune -f
|
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
|
||||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
|
||||||
echo "Docker images and volumes cleanup completed."
|
|
||||||
else
|
|
||||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
cleanup_docker

# For HF_TOKEN.
source /etc/environment

# Run the TPU tests inside the vllm-tpu image.
# The whole in-container script is passed to "bash -c" as ONE single-quoted
# string, so it must not contain any single-quote characters (comments included).
docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.

echo "--- Starting script inside Docker container ---"

# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"

# Install dependencies.
# Each install is a separate command on purpose: set -e ignores the failure of
# every command in an "a && b && c" list except the last one, so a chained
# install could fail without stopping the script.
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info
python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2"
python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"

export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=

echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
# From here on a failing test must not abort the script; failures are tracked
# in overall_script_exit_code instead.
set +e
overall_script_exit_code=0

# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
# Arguments: $1 = test number, $2 = test name, $3 = command line to eval.
# Returns the exit code of the test command.
run_test() {
    local test_num=$1
    local test_name=$2
    local test_command=$3
    local log_file="$RESULTS_DIR/test_${test_num}.log"
    local actual_exit_code

    echo "--- TEST_$test_num: Running $test_name ---"

    # Execute the test command, streaming stdout/stderr to the console while
    # also appending both to the per-test log file.
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
    actual_exit_code=$?

    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

    if [ "$actual_exit_code" -ne 0 ]; then
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
        if [ -f "$log_file" ]; then
            cat "$log_file" >&2
        else
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
        fi
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
        return "$actual_exit_code" # Return the failure code
    else
        echo "TEST_$test_num ($test_name) PASSED."
        return 0 # Return success
    fi
}

# Helper function to call run_test and update the overall script exit code
# Takes the same three arguments as run_test.
run_and_track_test() {
    local test_num_arg="$1"
    local test_name_arg="$2"
    local test_command_arg="$3"

    # Run the test
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
    local test_specific_exit_code=$?

    # If the test failed, set the overall script exit code to 1
    if [ "$test_specific_exit_code" -ne 0 ]; then
        # No need for extra echo here, run_test already logged the failure.
        overall_script_exit_code=1
    fi
}

# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"

# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.

# Capture the exit code of the docker run command
# (must stay the first command after docker run, otherwise $? is overwritten).
DOCKER_RUN_EXIT_CODE=$?

# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
    exit "$DOCKER_RUN_EXIT_CODE"
else
    echo "Docker run command completed successfully."
    exit 0
fi
|
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
|
||||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
#!/bin/bash

# This script builds the XPU docker image and runs offline inference plus a
# subset of the v1 unit tests inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
# Random suffix so that concurrent jobs for the same commit do not collide.
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Setup cleanup.
# The trap is installed BEFORE the build so that a failed or interrupted build
# is cleaned up as well. Every step is best-effort (|| true) because the
# container or image may not exist yet when the trap fires.
remove_docker_container() {
    docker rm -f "${container_name}" || true;
    docker image rm -f "${image_name}" || true;
    docker system prune -f || true;
}
trap remove_docker_container EXIT

# Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu .

# Run the image and test offline inference/tensor parallel
docker run \
    --device /dev/dri:/dev/dri \
    --net=host \
    --ipc=host \
    --privileged \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
    -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
    --name "${container_name}" \
    "${image_name}" \
    bash -c '
    set -e
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
'
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
#!/bin/bash

set -ex

# Re-tags the per-architecture release images of the current commit as nightly
# images and publishes them, plus multi-arch manifests, to DockerHub.
#
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
# otherwise they will be cleaned up together with the main "nightly" tags.

TAG_VARIANT="${1:-}"
if [ -n "$TAG_VARIANT" ]; then
    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
    TAG_NAME="$TAG_VARIANT-nightly"
else
    ORIG_TAG_SUFFIX=""
    TAG_NAME="nightly"
fi

ORIG_TAG_NAME="$BUILDKITE_COMMIT"

# Source (AWS ECR Public) and destination (DockerHub) repositories.
SRC_REGISTRY="public.ecr.aws/q9t5s3a7"
SRC_REPO="$SRC_REGISTRY/vllm-release-repo"
DST_REPO="vllm/vllm-openai"

echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"

# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$SRC_REGISTRY"
docker pull "$SRC_REPO:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX"
docker pull "$SRC_REPO:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX"
# tag arch-dependent images
docker tag "$SRC_REPO:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX" "$DST_REPO:$TAG_NAME-x86_64"
docker tag "$SRC_REPO:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX" "$DST_REPO:$TAG_NAME-aarch64"
# push arch-dependent images to DockerHub
docker push "$DST_REPO:$TAG_NAME-x86_64"
docker push "$DST_REPO:$TAG_NAME-aarch64"
# push arch-independent manifest to DockerHub
# (one under the moving nightly tag, one pinned to this commit)
docker manifest create "$DST_REPO:$TAG_NAME" "$DST_REPO:$TAG_NAME-x86_64" "$DST_REPO:$TAG_NAME-aarch64" --amend
docker manifest create "$DST_REPO:$TAG_NAME-$BUILDKITE_COMMIT" "$DST_REPO:$TAG_NAME-x86_64" "$DST_REPO:$TAG_NAME-aarch64" --amend
docker manifest push "$DST_REPO:$TAG_NAME"
docker manifest push "$DST_REPO:$TAG_NAME-$BUILDKITE_COMMIT"
|
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
#!/bin/bash

# Repeatedly runs one pytest target until it fails; handy for reproducing
# flaky tests.
#
# Usage: ./rerun_test.sh path/to/test.py::test_name

# Print the usage text and stop when no test target was supplied.
if (( $# == 0 )); then
    printf '%s\n' \
        "Usage: $0 path/to/test.py::test_name" \
        "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
fi

target=$1
run_number=1

# Keep going as long as the test passes; after each passing run, announce the
# number of the run that is about to start.
while :; do
    pytest -sv "$target" || break
    ((run_number += 1))
    echo "RUN NUMBER ${run_number}"
done
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
#!/bin/bash

# This script is run by buildkite to run the benchmarks and upload the results to buildkite

set -ex
set -o pipefail

# cd two levels up from the directory containing this script
cd "$(dirname "${BASH_SOURCE[0]}")/../.."

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# Benchmark exit codes are recorded with "|| code=$?" so that, despite
# "set -e" and "pipefail", a failing benchmark does not abort the script
# before the results are written and the failure is reported at the end.
bench_latency_exit_code=0
bench_throughput_exit_code=0
bench_serving_exit_code=0

# run python-based benchmarks and upload the result to buildkite
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt \
    || bench_latency_exit_code=$?

vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt \
    || bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# wait for server to start, timeout after 600 seconds
# (stop the background server before giving up so it is not left running)
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || {
    kill "$server_pid" 2>/dev/null || true
    exit 1
}
vllm bench serve \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --model meta-llama/Llama-2-7b-chat-hf \
    --num-prompts 20 \
    --endpoint /v1/completions \
    --tokenizer meta-llama/Llama-2-7b-chat-hf \
    --save-result \
    2>&1 | tee benchmark_serving.txt \
    || bench_serving_exit_code=$?
# The server may already have exited; that must not abort the reporting below.
kill "$server_pid" || true

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line

echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line

echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
echo '```' >> benchmark_results.md

# first non-zero benchmark exit code (latency, throughput, serving), or 0
first_failure=0
for code in "$bench_latency_exit_code" "$bench_throughput_exit_code" "$bench_serving_exit_code"; do
    if [ "$code" -ne 0 ]; then
        first_failure=$code
        break
    fi
done

# if the agent binary is not found, skip uploading the results;
# still report a benchmark failure through the exit code
if [ ! -f /usr/bin/buildkite-agent ]; then
    exit "$first_failure"
fi

# upload the results to buildkite
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

# exit with the exit code of the benchmarks
if [ "$first_failure" -ne 0 ]; then
    exit "$first_failure"
fi

rm ShareGPT_V3_unfiltered_cleaned_split.json
buildkite-agent artifact upload "*.json"
|
|
||||||
@@ -1,126 +0,0 @@
|
|||||||
#!/bin/bash

# Starts NUM_NODES docker containers on a private docker network, joins them
# into a ray cluster, and runs one command per node (node 0 is the head node).

set -euox pipefail

# To detect ROCm
# Check multiple indicators:
if [ -e /dev/kfd ] || \
   [ -d /opt/rocm ] || \
   command -v rocm-smi &> /dev/null || \
   [ -n "${ROCM_HOME:-}" ]; then
    IS_ROCM=1
else
    IS_ROCM=0
fi

if [[ $# -lt 4 ]]; then
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
fi

WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4

shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
    echo "The number of commands must be equal to the number of nodes."
    echo "Number of nodes: $NUM_NODES"
    echo "Number of commands: ${#COMMANDS[@]}"
    exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
    echo "$command"
done


# Create the docker network shared by all node containers.
start_network() {
    docker network create --subnet=192.168.10.0/24 docker-net
}

# Start one detached container per node with its own slice of GPUs and a fixed
# IP address, then start ray inside it (head on node 0, workers elsewhere).
start_nodes() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
        # Build the device flags for this node: GPUs
        # node*NUM_GPUS .. node*NUM_GPUS+NUM_GPUS-1 as a comma-separated list.
        if [ "$IS_ROCM" -eq 1 ]; then
            GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
        else
            GPU_DEVICES='--gpus "device='
        fi
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
        if [ "$IS_ROCM" -eq 0 ]; then
            GPU_DEVICES+='"'
        fi

        # start the container in detached mode
        # things to note:
        # 1. --shm-size=10.24gb is required. don't use --ipc=host
        # 2. pass HF_TOKEN to the container
        # 3. map the huggingface cache directory to the container
        # 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
        #    starting from 192.168.10.11)
        # $GPU_DEVICES is intentionally unquoted: it holds several arguments
        # that must be split into separate words.
        docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
            --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
            /bin/bash -c "tail -f /dev/null"

        # organize containers into a ray cluster
        if [ "$node" -eq 0 ]; then
            # start the ray head node
            docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
            # wait for the head node to be ready
            sleep 10
        else
            # start the ray worker nodes, and connect them to the head node
            docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
        fi
    done

    # wait for the cluster to be ready
    sleep 10

    # print the cluster status
    docker exec node0 /bin/bash -c "ray status"
}

# Run each node's command inside its container.
run_nodes() {
    # important: iterate in reverse order to start the head node last
    # we start the worker nodes first, in detached mode, and then start the head node
    # in the foreground, so that the output of the head node is visible in the buildkite logs
    for node in $(seq $(($NUM_NODES - 1)) -1 0); do
        # GPU_DEVICES is only used for the log line below.
        GPU_DEVICES='"device='
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
        GPU_DEVICES+='"'
        echo "Running node$node with GPU devices: $GPU_DEVICES"
        if [ "$node" -ne 0 ]; then
            docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        else
            docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        fi
    done
}

# Stop all node containers and remove the network.
# Runs from the EXIT trap while "set -e" is active, so every step is
# best-effort: a container that never started must not prevent the remaining
# containers and the network from being cleaned up.
cleanup() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
        docker stop "node$node" || true
    done
    docker network rm docker-net || true
}
trap cleanup EXIT
start_network
start_nodes
run_nodes
|
|
||||||
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
# and then runs the Prime-RL GPU integration test.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
    exit 0
fi

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
    echo "Removing existing Prime-RL directory..."
    rm -rf "${PRIME_RL_DIR}"
fi

# Install UV if not available
if ! command -v uv &> /dev/null; then
    echo "Installing UV package manager..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    source "$HOME/.local/bin/env"
fi

# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"

echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
# Provide a "python" executable next to the UV environment. Only create the
# link when nothing usable is there yet, so that re-running the script does
# not abort on "File exists" and an existing interpreter is never overwritten.
# (-f only replaces a dangling symlink, for which -e is false.)
if [ ! -e /usr/local/bin/python ]; then
    ln -sf /usr/bin/python3 /usr/local/bin/python
fi

# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml

# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras

# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"

echo "Prime-RL integration test environment setup complete!"

echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu

echo "Prime-RL integration tests completed!"
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
#!/usr/bin/env bash
set -euxo pipefail

# For each all2all backend of the current platform: start a vLLM server with
# expert parallelism and EPLB enabled, run the GSM8K eval against it, and fail
# if the accuracy is below THRESHOLD.
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Block until the server on the given port answers /health, or fail after
# 600 seconds.
wait_for_server() {
    local port=$1
    timeout 600 bash -c '
        until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
            sleep 1
        done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
    # ROCm platform
    BACKENDS=("allgather_reducescatter")
    # Disable MOE padding for ROCm since it is causing eplb to fail
    export VLLM_ROCM_MOE_PADDING=0
else
    # Non-ROCm platform (CUDA/other)
    BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

# Stop the background server if it is still running: send SIGTERM, wait up to
# ~10 seconds (20 x 0.5s) for it to exit, then send SIGKILL.
cleanup() {
    if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
        kill "${SERVER_PID}" 2>/dev/null || true
        for _ in {1..20}; do
            kill -0 "${SERVER_PID}" 2>/dev/null || break
            sleep 0.5
        done
        kill -9 "${SERVER_PID}" 2>/dev/null || true
    fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
    VLLM_DEEP_GEMM_WARMUP=skip \
    VLLM_ALL2ALL_BACKEND=$BACK \
    vllm serve "$MODEL" \
        --enforce-eager \
        --tensor-parallel-size 2 \
        --data-parallel-size 2 \
        --enable-expert-parallel \
        --enable-eplb \
        --trust-remote-code \
        --max-model-len 2048 \
        --port "$PORT" &
    SERVER_PID=$!
    wait_for_server "$PORT"

    # Build a file-name-safe tag from the model name: "/", ":", space and
    # newline become "_". printf (not echo) is used so that no trailing newline
    # is added and then turned into a trailing "_".
    TAG=$(printf '%s' "$MODEL" | tr '/: \n' '____')
    OUT="${OUT_DIR}/${TAG}_${BACK}.json"
    python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
    # The heredoc is unquoted on purpose: the shell substitutes ${OUT},
    # ${MODEL}, ${BACK} and ${THRESHOLD} before python sees the code.
    python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

    # Stop this backend's server before starting the next one on a fresh port.
    cleanup
    SERVER_PID=
    sleep 1
    PORT=$((PORT+1))
done
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user