Compare commits
3 Commits
v0.14.0rc2
...
v0.14.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b17039bccc | ||
|
|
48b67ba75f | ||
|
|
09f4264a55 |
@@ -1,6 +1,6 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
- label: "Build wheel - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
@@ -11,11 +11,11 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
- label: "Build wheel - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
@@ -26,12 +26,12 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
- label: "Build wheel - aarch64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
@@ -40,39 +40,39 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
- label: "Build wheel - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
id: build-wheel-x86-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 13.0"
|
||||
- label: "Build wheel - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-13-0
|
||||
id: build-wheel-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
- label: "Build wheel - x86_64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
@@ -81,12 +81,12 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# Build release images (12.9)
|
||||
- label: "Build release image (x86)"
|
||||
# Build release images (CUDA 12.9)
|
||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
@@ -99,7 +99,7 @@ steps:
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image (arm64)"
|
||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
@@ -109,34 +109,92 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
- label: "Create multi-arch manifest"
|
||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow"
|
||||
- label: "Annotate release workflow - CUDA 12.9"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- block: "Build CUDA 13.0 release images"
|
||||
key: block-release-image-build-cuda-13-0
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
||||
depends_on: block-release-image-build-cuda-13-0
|
||||
id: build-release-image-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
||||
depends_on: block-release-image-build-cuda-13-0
|
||||
id: build-release-image-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
||||
depends_on:
|
||||
- build-release-image-x86-cuda-13-0
|
||||
- build-release-image-arm64-cuda-13-0
|
||||
id: create-multi-arch-manifest-cuda-13-0
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- input: "Provide Release version here"
|
||||
id: input-release-version
|
||||
fields:
|
||||
- text: "What is the release version?"
|
||||
key: release-version
|
||||
|
||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||
key: block-upload-release-wheels
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- build-wheel-x86-cuda-12-9
|
||||
- build-wheel-x86-cuda-13-0
|
||||
- build-wheel-x86-cpu
|
||||
- build-wheel-arm64-cuda-12-9
|
||||
- build-wheel-arm64-cuda-13-0
|
||||
- build-wheel-arm64-cpu
|
||||
|
||||
- label: "Upload release wheels to PyPI and GitHub"
|
||||
depends_on:
|
||||
- block-upload-release-wheels
|
||||
id: upload-release-wheels
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/upload-release-wheels.sh"
|
||||
|
||||
- block: "Build CPU release image"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
@@ -192,7 +250,7 @@ steps:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||
|
||||
103
.buildkite/scripts/upload-release-wheels.sh
Normal file
103
.buildkite/scripts/upload-release-wheels.sh
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env bash
#
# Upload the release wheels built for $BUILDKITE_COMMIT to PyPI and create a
# draft GitHub release that carries all of the wheels.
#
# Inputs read from the environment / Buildkite:
#   BUILDKITE_COMMIT   commit whose wheels live under s3://vllm-wheels/<commit>/
#   release-version    Buildkite meta-data key holding the release version
#   PYPI_TOKEN         token exported to twine as TWINE_PASSWORD
#   GITHUB_TOKEN       token exported to the gh CLI as GH_TOKEN
#   FORCE_RELEASE_IGNORE_VERSION_MISMATCH
#                      set to "true" to continue when the Buildkite version
#                      and the git tag disagree
#
# External tools invoked: buildkite-agent, git, curl, tar, aws, python3.

set -e

BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"

RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"

# `git describe --exact-match` exits non-zero when the commit carries no tag.
# Under `set -e` a failing command substitution in a plain assignment aborts
# the script immediately, which would skip the explicit error message below;
# `|| true` keeps control here so the empty result can be reported properly.
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null || true)
if [ -z "$GIT_VERSION" ]; then
    echo "[FATAL] Not on a git tag, cannot create release."
    exit 1
else
    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi

# sanity check for version mismatch (the git tag is expected to be "v<version>")
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
    if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
        echo "[WARNING] Force release and ignore version mismatch"
    else
        echo "[FATAL] Release version from Buildkite does not match Git version."
        exit 1
    fi
fi

# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
    echo "[FATAL] PYPI_TOKEN is not set."
    exit 1
else
    export TWINE_USERNAME="__token__"
    export TWINE_PASSWORD="$PYPI_TOKEN"
fi

# check github token
if [ -z "$GITHUB_TOKEN" ]; then
    echo "[FATAL] GITHUB_TOKEN is not set."
    exit 1
else
    export GH_TOKEN="$GITHUB_TOKEN"
fi

# Command tracing is enabled only from here on, so the token handling above
# is never echoed to the build log.
set -x # avoid printing secrets above

# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
    echo "[FATAL] Failed to get latest gh CLI version from GitHub"
    exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
# NOTE: the tarball name is hard-coded to linux_amd64.
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
# --fail turns an HTTP error into a non-zero exit here instead of saving the
# error page and failing later inside tar.
curl --fail -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
# Locate the binary first and resolve it afterwards: calling realpath with an
# empty operand fails, and under `set -e` that would abort before the error
# message below could be printed.
GH_BIN_RELATIVE=$(find . -name "gh" -type f -executable | head -n 1)
if [ -z "$GH_BIN_RELATIVE" ]; then
    echo "[FATAL] Failed to find gh CLI executable"
    exit 1
fi
GH_BIN=$(realpath "$GH_BIN_RELATIVE")
echo "gh CLI downloaded successfully, version: $("$GH_BIN" --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd

# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version

# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p "$DIST_DIR"
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" "$DIST_DIR"
echo "Wheels copied to local directory"
# generate source tarball
# (written next to the wheels; the upload commands below only pick up *.whl)
git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
ls -la "$DIST_DIR"


# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find "$DIST_DIR" -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
    echo "No default variant wheels found, quitting..."
    exit 1
fi
# $PYPI_WHEEL_FILES is intentionally left unquoted: it holds one path per line
# and relies on word splitting to pass each wheel as a separate argument.
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"

# create release on GitHub with the release version and all wheels
# (-d creates it as a draft; every wheel in DIST_DIR is attached, not only
# the default variant uploaded to PyPI)
command "$GH_BIN" release create "$GIT_VERSION" -d --latest --notes-from-tag --verify-tag "$DIST_DIR"/*.whl
|
||||
@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
|
||||
FROM base AS fetch_vllm_1
|
||||
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
|
||||
ARG VLLM_BRANCH="main"
|
||||
ENV VLLM_REPO=${VLLM_REPO}
|
||||
ENV VLLM_BRANCH=${VLLM_BRANCH}
|
||||
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||
&& cd vllm \
|
||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||
&& pip uninstall -y vllm \
|
||||
&& uv pip install --system *.whl
|
||||
|
||||
# Install RIXL wheel
|
||||
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
|
||||
uv pip install --system /rixl_install/*.whl
|
||||
|
||||
WORKDIR /vllm-workspace
|
||||
ARG COMMON_WORKDIR
|
||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
|
||||
|
||||
@@ -198,92 +198,6 @@ RUN cd mori \
|
||||
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
|
||||
|
||||
|
||||
###
|
||||
### RIXL Build
|
||||
###
|
||||
FROM build_pytorch AS build_rixl
|
||||
ARG RIXL_BRANCH
|
||||
ARG RIXL_REPO
|
||||
ARG ETCD_BRANCH
|
||||
ARG ETCD_REPO
|
||||
ARG UCX_BRANCH
|
||||
ARG UCX_REPO
|
||||
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV UCX_HOME=/usr/local/ucx
|
||||
ENV RIXL_HOME=/usr/local/rixl
|
||||
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
|
||||
|
||||
# RIXL build system dependencies and RDMA support
|
||||
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
|
||||
libgrpc-dev \
|
||||
libgrpc++-dev \
|
||||
libprotobuf-dev \
|
||||
protobuf-compiler-grpc \
|
||||
libcpprest-dev \
|
||||
libaio-dev \
|
||||
librdmacm1 \
|
||||
librdmacm-dev \
|
||||
libibverbs1 \
|
||||
libibverbs-dev \
|
||||
ibverbs-utils \
|
||||
rdmacm-utils \
|
||||
ibverbs-providers
|
||||
|
||||
RUN pip install meson auditwheel patchelf tomlkit
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone ${ETCD_REPO} && \
|
||||
cd etcd-cpp-apiv3 && \
|
||||
git checkout ${ETCD_BRANCH} && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
|
||||
make -j$(nproc) && \
|
||||
make install
|
||||
|
||||
RUN cd /usr/local/src && \
|
||||
git clone ${UCX_REPO} && \
|
||||
cd ucx && \
|
||||
git checkout ${UCX_BRANCH} && \
|
||||
./autogen.sh && \
|
||||
mkdir build && cd build && \
|
||||
../configure \
|
||||
--prefix=/usr/local/ucx \
|
||||
--enable-shared \
|
||||
--disable-static \
|
||||
--disable-doxygen-doc \
|
||||
--enable-optimizations \
|
||||
--enable-devel-headers \
|
||||
--with-rocm=/opt/rocm \
|
||||
--with-verbs \
|
||||
--with-dm \
|
||||
--enable-mt && \
|
||||
make -j && \
|
||||
make -j install
|
||||
|
||||
ENV PATH=/usr/local/ucx/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
|
||||
|
||||
RUN git clone ${RIXL_REPO} /opt/rixl && \
|
||||
cd /opt/rixl && \
|
||||
git checkout ${RIXL_BRANCH} && \
|
||||
meson setup build --prefix=${RIXL_HOME} \
|
||||
-Ducx_path=${UCX_HOME} \
|
||||
-Drocm_path=${ROCM_PATH} && \
|
||||
cd build && \
|
||||
ninja && \
|
||||
ninja install
|
||||
|
||||
# Generate RIXL wheel
|
||||
RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||
./contrib/build-wheel.sh \
|
||||
--output-dir /app/install \
|
||||
--rocm-dir ${ROCM_PATH} \
|
||||
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
||||
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
||||
|
||||
|
||||
###
|
||||
### FlashAttention Build
|
||||
###
|
||||
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
|
||||
@@ -385,12 +297,6 @@ ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
ARG RIXL_BRANCH
|
||||
ARG RIXL_REPO
|
||||
ARG ETCD_BRANCH
|
||||
ARG ETCD_REPO
|
||||
ARG UCX_BRANCH
|
||||
ARG UCX_REPO
|
||||
ARG MORI_BRANCH
|
||||
ARG MORI_REPO
|
||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
|
||||
&& echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
|
||||
&& echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
|
||||
&& echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
|
||||
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
|
||||
|
||||
@@ -540,14 +540,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
|
||||
|
||||
try:
|
||||
generator = await handler.create_completion(request, raw_request)
|
||||
except OverflowError as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
|
||||
) from e
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
|
||||
@@ -86,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
|
||||
construct_input_messages,
|
||||
)
|
||||
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
|
||||
from vllm.inputs.data import PromptType, TokensPrompt
|
||||
from vllm.inputs.parse import (
|
||||
PromptComponents,
|
||||
@@ -760,11 +760,15 @@ class OpenAIServing:
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = exc.parameter
|
||||
elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
|
||||
elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
|
||||
# Common validation errors from user input
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = None
|
||||
elif isinstance(exc, NotImplementedError):
|
||||
err_type = "NotImplementedError"
|
||||
status_code = HTTPStatus.NOT_IMPLEMENTED
|
||||
param = None
|
||||
elif exc.__class__.__name__ == "TemplateError":
|
||||
# jinja2.TemplateError (avoid importing jinja2)
|
||||
err_type = "BadRequestError"
|
||||
@@ -783,9 +787,10 @@ class OpenAIServing:
|
||||
traceback.print_exc()
|
||||
else:
|
||||
traceback.print_stack()
|
||||
|
||||
return ErrorResponse(
|
||||
error=ErrorInfo(
|
||||
message=message,
|
||||
message=sanitize_message(message),
|
||||
type=err_type,
|
||||
code=status_code.value,
|
||||
param=param,
|
||||
|
||||
@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
|
||||
ModelPermission,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
from vllm.entrypoints.utils import sanitize_message
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
@@ -300,5 +301,9 @@ def create_error_response(
|
||||
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
|
||||
) -> ErrorResponse:
|
||||
return ErrorResponse(
|
||||
error=ErrorInfo(message=message, type=err_type, code=status_code.value)
|
||||
error=ErrorInfo(
|
||||
message=sanitize_message(message),
|
||||
type=err_type,
|
||||
code=status_code.value,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from starlette.responses import JSONResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
|
||||
try:
|
||||
generator = await handler.create_classify(request, raw_request)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
content=generator.model_dump(), status_code=generator.error.code
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
@@ -47,9 +47,7 @@ async def create_embedding(
|
||||
try:
|
||||
generator = await handler.create_embedding(request, raw_request)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
|
||||
try:
|
||||
generator = await handler.create_pooling(request, raw_request)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
content=generator.model_dump(), status_code=generator.error.code
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from typing_extensions import assert_never
|
||||
|
||||
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
|
||||
try:
|
||||
generator = await handler.create_score(request, raw_request)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
content=generator.model_dump(), status_code=generator.error.code
|
||||
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
|
||||
try:
|
||||
generator = await handler.do_rerank(request, raw_request)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
content=generator.model_dump(), status_code=generator.error.code
|
||||
|
||||
@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
|
||||
try:
|
||||
generator = await handler.serve_tokens(request, raw_request)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
content=generator.model_dump(), status_code=generator.error.code
|
||||
|
||||
@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
|
||||
|
||||
try:
|
||||
generator = await handler.create_tokenize(request, raw_request)
|
||||
except NotImplementedError as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
|
||||
) from e
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
||||
) from e
|
||||
return handler.create_error_response(e)
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(
|
||||
|
||||
@@ -7,7 +7,7 @@ import functools
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import regex as re
|
||||
from fastapi import Request
|
||||
@@ -22,17 +22,24 @@ from vllm.entrypoints.chat_utils import (
|
||||
resolve_hf_chat_template,
|
||||
resolve_mistral_chat_template,
|
||||
)
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
CompletionRequest,
|
||||
StreamOptions,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
ChatCompletionRequest = object
|
||||
CompletionRequest = object
|
||||
StreamOptions = object
|
||||
LoRAModulePath = object
|
||||
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -206,7 +213,7 @@ def _validate_truncation_size(
|
||||
|
||||
def get_max_tokens(
|
||||
max_model_len: int,
|
||||
request: ChatCompletionRequest | CompletionRequest,
|
||||
request: "ChatCompletionRequest | CompletionRequest",
|
||||
input_length: int,
|
||||
default_sampling_params: dict,
|
||||
) -> int:
|
||||
@@ -227,6 +234,8 @@ def get_max_tokens(
|
||||
|
||||
|
||||
def log_non_default_args(args: Namespace | EngineArgs):
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
|
||||
non_default_args = {}
|
||||
|
||||
# Handle Namespace
|
||||
@@ -255,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
|
||||
|
||||
|
||||
def should_include_usage(
|
||||
stream_options: StreamOptions | None, enable_force_include_usage: bool
|
||||
stream_options: "StreamOptions | None", enable_force_include_usage: bool
|
||||
) -> tuple[bool, bool]:
|
||||
if stream_options:
|
||||
include_usage = stream_options.include_usage or enable_force_include_usage
|
||||
@@ -270,6 +279,8 @@ def should_include_usage(
|
||||
def process_lora_modules(
|
||||
args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
|
||||
) -> list[LoRAModulePath]:
|
||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||
|
||||
lora_modules = args_lora_modules
|
||||
if default_mm_loras:
|
||||
default_mm_lora_paths = [
|
||||
|
||||
Reference in New Issue
Block a user