# syntax=docker/dockerfile:1
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

# Please update any changes made here to
# docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png

ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12

# By parameterizing the base images, we allow third-party to use their own
# base images. One use case is hermetic builds with base images stored in
# private registries that use a different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
# installation of the GPG key of the PPA, as done by add-apt-repository, so we
# also need a URL for the GPG key.
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL

# The PyPA get-pip.py script is a self contained script+zip file, that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environments where a distribution package does not exist.
#
# By parameterizing the URL for get-pip.py installation script, we allow
# third-party to use their own copy of the script stored in a private mirror.
# We set the default value to the PyPA owned get-pip.py script.
#
# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"

# PIP supports fetching the packages from custom indexes, allowing third-party
# to host the packages in private mirrors. The PIP_INDEX_URL and
# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
# default indexes. By leaving them empty by default, PIP will use its default
# indexes if the build process doesn't override the indexes.
#
# Uv uses different variables. We set them by default to the same values as
# PIP, but they can be overridden.
ARG PIP_INDEX_URL
ARG PIP_EXTRA_INDEX_URL
ARG UV_INDEX_URL=${PIP_INDEX_URL}
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}

# PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly

# PIP supports multiple authentication schemes, including keyring
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
# disabled by default, we allow third-party to use keyring authentication for
# their private Python indexes, while not changing the default behavior which
# is no authentication.
#
# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag enables built-in KV-connector dependency libs in the docker images
ARG INSTALL_KV_CONNECTORS=false
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Install Python and other dependencies.
# When a Deadsnakes mirror is configured we install its GPG key and apt source
# manually; otherwise we fall back to add-apt-repository (retried, since the
# PPA endpoint is flaky).
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && if [ ! -z "${DEADSNAKES_MIRROR_URL}" ]; then \
        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ]; then \
            mkdir -p -m 0755 /etc/apt/keyrings ; \
            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
        fi ; \
    else \
        for i in 1 2 3; do \
            add-apt-repository -y ppa:deadsnakes/ppa && break || \
            { echo "Attempt $i failed, retrying in 5s..." ; sleep 5; } ; \
        done ; \
    fi \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# ARGs declared before FROM must be redeclared per stage to be visible here.
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

WORKDIR /workspace

# install build and runtime dependencies

# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            --pre pytorch_triton==3.3.0+gitab727c40 ; \
    fi

COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/cuda.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
# Redeclare so ${CUDA_VERSION} is non-empty in the RUN shells below
# (stage-level ARGs are not inherited across FROM boundaries).
ARG CUDA_VERSION

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL

# install build dependencies
COPY requirements/build.txt requirements/build.txt

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && if [ ! -z "${SCCACHE_ENDPOINT}" ]; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
        && sccache --show-stats; \
    fi

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" != "1" ]; then \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi

# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=400
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
        python3 check-wheel-size.py dist; \
    else \
        echo "Skipping wheel size check." ; \
    fi
#################### EXTENSION Build IMAGE ####################
#################### DEV IMAGE ####################
FROM base AS dev
# Redeclare so ${CUDA_VERSION} is non-empty in the RUN shell below
# (stage-level ARGs are not inherited across FROM boundaries).
ARG CUDA_VERSION

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"

COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

SHELL ["/bin/bash", "-c"]

ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Persist the dot-free python version (e.g. "312") for later RUN heredocs,
# which source /etc/environment.
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && if [ ! -z "${DEADSNAKES_MIRROR_URL}" ]; then \
        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ]; then \
            mkdir -p -m 0755 /etc/apt/keyrings ; \
            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
        fi ; \
    else \
        for i in 1 2 3; do \
            add-apt-repository -y ppa:deadsnakes/ppa && break || \
            { echo "Attempt $i failed, retrying in 5s..." ; sleep 5; } ; \
        done ; \
    fi \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# ARGs declared before FROM must be redeclared per stage to be visible here.
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            --pre pytorch_triton==3.3.0+gitab727c40 ; \
    fi

# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system dist/*.whl --verbose \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout v0.2.6.post1
# $ python -m flashinfer.aot
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl

# Allow specifying a version, Git revision or local .whl file
ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
ARG FLASHINFER_GIT_REF="v0.2.6.post1"
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
    . /etc/environment
    # Heredoc lines are not &&-chained; fail the layer on the first error
    # instead of silently continuing past a broken FlashInfer build.
    set -e
    if [ "$TARGETPLATFORM" != "linux/arm64" ]; then
        # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
        if [[ "$CUDA_VERSION" == 12.8* ]]; then
            uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL}
        else
            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
            git clone ${FLASHINFER_GIT_REPO} --single-branch --branch ${FLASHINFER_GIT_REF} --recursive
            # Needed to build AOT kernels
            (cd flashinfer && \
                python3 -m flashinfer.aot && \
                uv pip install --system --no-build-isolation . \
            )
            rm -rf flashinfer
            # NOTE(review): the clone/build pass below repeats the one above with a
            # narrower arch list — this looks like a merge artifact; confirm which
            # single pass should be kept.
            # Default arches (skipping 10.0a and 12.0 since these need 12.8)
            # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
            TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
            if [[ "${CUDA_VERSION}" == 11.* ]]; then
                TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
            fi
            echo "🏗️ Building FlashInfer for arches: ${TORCH_CUDA_ARCH_LIST}"
            git clone --depth 1 --recursive --shallow-submodules \
                --branch v0.2.6.post1 \
                https://github.com/flashinfer-ai/flashinfer.git flashinfer
            pushd flashinfer
            python3 -m flashinfer.aot
            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" \
                uv pip install --system --no-build-isolation .
            popd
            rm -rf flashinfer
        fi
    fi
BASH

COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .

RUN --mount=type=cache,target=/root/.cache/uv \
    . /etc/environment && \
    uv pip list

# Even when we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

ARG PYTHON_VERSION
# Redeclare so the CUDA_MAJOR guard below sees a non-empty value
# (stage-level ARGs are not inherited across FROM boundaries).
ARG CUDA_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"

# install development dependencies (for testing); dev requirements only
# support CUDA 12+ so skip them on older toolkits
RUN --mount=type=cache,target=/root/.cache/uv \
    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
        uv pip install --system -r requirements/dev.txt; \
    fi

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN cp -r examples test_docs/
RUN mv vllm test_docs/
RUN mv mkdocs.yaml test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY requirements/kv_connectors.txt requirements/kv_connectors.txt

# install additional dependencies for openai api server;
# arm64 needs the older bitsandbytes pin, x86_64 uses the newer one
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi

ENV VLLM_USAGE_SOURCE=production-docker-image

# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker

COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]

FROM vllm-openai-base AS vllm-openai

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################