Compare commits

..

3 Commits

Author SHA1 Message Date
Shengqi Chen
b17039bccc [CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032)
(cherry picked from commit 8e61425ee6)
2026-01-16 21:04:48 -08:00
Cyrus Leung
48b67ba75f [Frontend] Standardize use of create_error_response (#32319)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-16 11:35:10 +00:00
TJian
09f4264a55 [Bugfix] Fix ROCm dockerfiles (#32447)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-16 10:50:00 +08:00
15 changed files with 242 additions and 174 deletions

View File

@@ -1,6 +1,6 @@
steps: steps:
# aarch64 + CUDA builds # aarch64 + CUDA builds
- label: "Build arm64 wheel - CUDA 12.9" - label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~ depends_on: ~
id: build-wheel-arm64-cuda-12-9 id: build-wheel-arm64-cuda-12-9
agents: agents:
@@ -11,11 +11,11 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-nightly-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build arm64 wheel - CUDA 13.0" - label: "Build wheel - aarch64 - CUDA 13.0"
depends_on: ~ depends_on: ~
id: build-wheel-arm64-cuda-13-0 id: build-wheel-arm64-cuda-13-0
agents: agents:
@@ -26,12 +26,12 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
# aarch64 build # aarch64 build
- label: "Build arm64 CPU wheel" - label: "Build wheel - aarch64 - CPU"
depends_on: ~ depends_on: ~
id: build-wheel-arm64-cpu id: build-wheel-arm64-cpu
agents: agents:
@@ -40,39 +40,39 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
# x86 + CUDA builds # x86 + CUDA builds
- label: "Build wheel - CUDA 12.9" - label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~ depends_on: ~
id: build-wheel-cuda-12-9 id: build-wheel-x86-cuda-12-9
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 13.0" - label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~ depends_on: ~
id: build-wheel-cuda-13-0 id: build-wheel-x86-cuda-13-0
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
# x86 CPU wheel build # x86 CPU wheel build
- label: "Build x86 CPU wheel" - label: "Build wheel - x86_64 - CPU"
depends_on: ~ depends_on: ~
id: build-wheel-x86-cpu id: build-wheel-x86-cpu
agents: agents:
@@ -81,12 +81,12 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
# Build release images (12.9) # Build release images (CUDA 12.9)
- label: "Build release image (x86)" - label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~ depends_on: ~
id: build-release-image-x86 id: build-release-image-x86
agents: agents:
@@ -99,7 +99,7 @@ steps:
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image (arm64)" - label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~ depends_on: ~
id: build-release-image-arm64 id: build-release-image-arm64
agents: agents:
@@ -109,34 +109,92 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest - label: "Create multi-arch manifest - CUDA 12.9"
- label: "Create multi-arch manifest"
depends_on: depends_on:
- build-release-image-x86 - build-release-image-x86
- build-release-image-arm64 - build-release-image-arm64
id: create-multi-arch-manifest id: create-multi-arch-manifest
agents: agents:
queue: cpu_queue_postmerge queue: small_cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow" - label: "Annotate release workflow - CUDA 12.9"
depends_on: depends_on:
- create-multi-arch-manifest - create-multi-arch-manifest
id: annotate-release-workflow id: annotate-release-workflow
agents: agents:
queue: cpu_queue_postmerge queue: small_cpu_queue_postmerge
commands: commands:
- "bash .buildkite/scripts/annotate-release.sh" - "bash .buildkite/scripts/annotate-release.sh"
- block: "Build CUDA 13.0 release images"
key: block-release-image-build-cuda-13-0
depends_on: ~
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Build release image - aarch64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- input: "Provide Release version here" - input: "Provide Release version here"
id: input-release-version id: input-release-version
fields: fields:
- text: "What is the release version?" - text: "What is the release version?"
key: release-version key: release-version
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
depends_on:
- input-release-version
- build-wheel-x86-cuda-12-9
- build-wheel-x86-cuda-13-0
- build-wheel-x86-cpu
- build-wheel-arm64-cuda-12-9
- build-wheel-arm64-cuda-13-0
- build-wheel-arm64-cpu
- label: "Upload release wheels to PyPI and GitHub"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels.sh"
- block: "Build CPU release image" - block: "Build CPU release image"
key: block-cpu-release-image-build key: block-cpu-release-image-build
depends_on: ~ depends_on: ~
@@ -192,7 +250,7 @@ steps:
- create-multi-arch-manifest - create-multi-arch-manifest
if: build.env("NIGHTLY") == "1" if: build.env("NIGHTLY") == "1"
agents: agents:
queue: cpu_queue_postmerge queue: small_cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl

View File

@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1 FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main" ARG VLLM_BRANCH="main"
ENV VLLM_REPO=${VLLM_REPO}
ENV VLLM_BRANCH=${VLLM_BRANCH}
ONBUILD RUN git clone ${VLLM_REPO} \ ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \ && cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \ && git fetch -v --prune -- origin ${VLLM_BRANCH} \
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& pip uninstall -y vllm \ && pip uninstall -y vllm \
&& uv pip install --system *.whl && uv pip install --system *.whl
# Install RIXL wheel
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
uv pip install --system /rixl_install/*.whl
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace

View File

@@ -198,92 +198,6 @@ RUN cd mori \
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
###
### RIXL Build
###
FROM build_pytorch AS build_rixl
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO
ENV ROCM_PATH=/opt/rocm
ENV UCX_HOME=/usr/local/ucx
ENV RIXL_HOME=/usr/local/rixl
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
# RIXL build system dependences and RDMA support
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
libcpprest-dev \
libaio-dev \
librdmacm1 \
librdmacm-dev \
libibverbs1 \
libibverbs-dev \
ibverbs-utils \
rdmacm-utils \
ibverbs-providers
RUN pip install meson auditwheel patchelf tomlkit
WORKDIR /workspace
RUN git clone ${ETCD_REPO} && \
cd etcd-cpp-apiv3 && \
git checkout ${ETCD_BRANCH} && \
mkdir build && cd build && \
cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
make -j$(nproc) && \
make install
RUN cd /usr/local/src && \
git clone ${UCX_REPO} && \
cd ucx && \
git checkout ${UCX_BRANCH} && \
./autogen.sh && \
mkdir build && cd build && \
../configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-devel-headers \
--with-rocm=/opt/rocm \
--with-verbs \
--with-dm \
--enable-mt && \
make -j && \
make -j install
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
RUN git clone ${RIXL_REPO} /opt/rixl && \
cd /opt/rixl && \
git checkout ${RIXL_BRANCH} && \
meson setup build --prefix=${RIXL_HOME} \
-Ducx_path=${UCX_HOME} \
-Drocm_path=${ROCM_PATH} && \
cd build && \
ninja && \
ninja install
# Generate RIXL wheel
RUN cd /opt/rixl && mkdir -p /app/install && \
./contrib/build-wheel.sh \
--output-dir /app/install \
--rocm-dir ${ROCM_PATH} \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
### ###
### FlashAttention Build ### FlashAttention Build
### ###
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
cp /install/*.whl /app/debs cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
FROM base AS final FROM base AS final
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \ RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
@@ -385,12 +297,6 @@ ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH ARG AITER_BRANCH
ARG AITER_REPO ARG AITER_REPO
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO
ARG MORI_BRANCH ARG MORI_BRANCH
ARG MORI_REPO ARG MORI_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
&& echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
&& echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
&& echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
&& echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
&& echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
&& echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \ && echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt && echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt

View File

@@ -540,14 +540,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
try: try:
generator = await handler.create_completion(request, raw_request) generator = await handler.create_completion(request, raw_request)
except OverflowError as e:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
) from e
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(

View File

@@ -86,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
) )
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
from vllm.entrypoints.utils import _validate_truncation_size from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import ( from vllm.inputs.parse import (
PromptComponents, PromptComponents,
@@ -760,11 +760,15 @@ class OpenAIServing:
err_type = "BadRequestError" err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST status_code = HTTPStatus.BAD_REQUEST
param = exc.parameter param = exc.parameter
elif isinstance(exc, (ValueError, TypeError, RuntimeError)): elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
# Common validation errors from user input # Common validation errors from user input
err_type = "BadRequestError" err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST status_code = HTTPStatus.BAD_REQUEST
param = None param = None
elif isinstance(exc, NotImplementedError):
err_type = "NotImplementedError"
status_code = HTTPStatus.NOT_IMPLEMENTED
param = None
elif exc.__class__.__name__ == "TemplateError": elif exc.__class__.__name__ == "TemplateError":
# jinja2.TemplateError (avoid importing jinja2) # jinja2.TemplateError (avoid importing jinja2)
err_type = "BadRequestError" err_type = "BadRequestError"
@@ -783,9 +787,10 @@ class OpenAIServing:
traceback.print_exc() traceback.print_exc()
else: else:
traceback.print_stack() traceback.print_stack()
return ErrorResponse( return ErrorResponse(
error=ErrorInfo( error=ErrorInfo(
message=message, message=sanitize_message(message),
type=err_type, type=err_type,
code=status_code.value, code=status_code.value,
param=param, param=param,

View File

@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
ModelPermission, ModelPermission,
UnloadLoRAAdapterRequest, UnloadLoRAAdapterRequest,
) )
from vllm.entrypoints.utils import sanitize_message
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -300,5 +301,9 @@ def create_error_response(
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> ErrorResponse: ) -> ErrorResponse:
return ErrorResponse( return ErrorResponse(
error=ErrorInfo(message=message, type=err_type, code=status_code.value) error=ErrorInfo(
message=sanitize_message(message),
type=err_type,
code=status_code.value,
)
) )

View File

@@ -1,8 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request from fastapi import APIRouter, Depends, Request
from starlette.responses import JSONResponse from starlette.responses import JSONResponse
from typing_extensions import assert_never from typing_extensions import assert_never
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
try: try:
generator = await handler.create_classify(request, raw_request) generator = await handler.create_classify(request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code content=generator.model_dump(), status_code=generator.error.code

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never from typing_extensions import assert_never
@@ -47,9 +47,7 @@ async def create_embedding(
try: try:
generator = await handler.create_embedding(request, raw_request) generator = await handler.create_embedding(request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never from typing_extensions import assert_never
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
try: try:
generator = await handler.create_pooling(request, raw_request) generator = await handler.create_pooling(request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code content=generator.model_dump(), status_code=generator.error.code

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from typing_extensions import assert_never from typing_extensions import assert_never
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
try: try:
generator = await handler.create_score(request, raw_request) generator = await handler.create_score(request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code content=generator.model_dump(), status_code=generator.error.code
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
try: try:
generator = await handler.do_rerank(request, raw_request) generator = await handler.do_rerank(request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code content=generator.model_dump(), status_code=generator.error.code

View File

@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
try: try:
generator = await handler.serve_tokens(request, raw_request) generator = await handler.serve_tokens(request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code content=generator.model_dump(), status_code=generator.error.code

View File

@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
try: try:
generator = await handler.create_tokenize(request, raw_request) generator = await handler.create_tokenize(request, raw_request)
except NotImplementedError as e:
raise HTTPException(
status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
) from e
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(

View File

@@ -7,7 +7,7 @@ import functools
import os import os
from argparse import Namespace from argparse import Namespace
from pathlib import Path from pathlib import Path
from typing import Any from typing import TYPE_CHECKING, Any
import regex as re import regex as re
from fastapi import Request from fastapi import Request
@@ -22,18 +22,25 @@ from vllm.entrypoints.chat_utils import (
resolve_hf_chat_template, resolve_hf_chat_template,
resolve_mistral_chat_template, resolve_mistral_chat_template,
) )
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
StreamOptions,
)
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.tokenizers.mistral import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
if TYPE_CHECKING:
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
StreamOptions,
)
from vllm.entrypoints.openai.serving_models import LoRAModulePath
else:
ChatCompletionRequest = object
CompletionRequest = object
StreamOptions = object
LoRAModulePath = object
logger = init_logger(__name__) logger = init_logger(__name__)
VLLM_SUBCMD_PARSER_EPILOG = ( VLLM_SUBCMD_PARSER_EPILOG = (
@@ -206,7 +213,7 @@ def _validate_truncation_size(
def get_max_tokens( def get_max_tokens(
max_model_len: int, max_model_len: int,
request: ChatCompletionRequest | CompletionRequest, request: "ChatCompletionRequest | CompletionRequest",
input_length: int, input_length: int,
default_sampling_params: dict, default_sampling_params: dict,
) -> int: ) -> int:
@@ -227,6 +234,8 @@ def get_max_tokens(
def log_non_default_args(args: Namespace | EngineArgs): def log_non_default_args(args: Namespace | EngineArgs):
from vllm.entrypoints.openai.cli_args import make_arg_parser
non_default_args = {} non_default_args = {}
# Handle Namespace # Handle Namespace
@@ -255,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
def should_include_usage( def should_include_usage(
stream_options: StreamOptions | None, enable_force_include_usage: bool stream_options: "StreamOptions | None", enable_force_include_usage: bool
) -> tuple[bool, bool]: ) -> tuple[bool, bool]:
if stream_options: if stream_options:
include_usage = stream_options.include_usage or enable_force_include_usage include_usage = stream_options.include_usage or enable_force_include_usage
@@ -270,6 +279,8 @@ def should_include_usage(
def process_lora_modules( def process_lora_modules(
args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
) -> list[LoRAModulePath]: ) -> list[LoRAModulePath]:
from vllm.entrypoints.openai.serving_models import LoRAModulePath
lora_modules = args_lora_modules lora_modules = args_lora_modules
if default_mm_loras: if default_mm_loras:
default_mm_lora_paths = [ default_mm_lora_paths = [