# syntax=docker/dockerfile:1
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

# Please update any changes made here to
# docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png

ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12

# By parameterizing the base images, we allow third-party to use their own
# base images. One use case is hermetic builds with base images stored in
# private registries that use a different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
# installation of the GPG key of the PPA, as done by add-apt-repository, so we
# also need a URL for the GPG key.
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL

# The PyPA get-pip.py script is a self contained script+zip file, that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environment where a distribution package does not exist.
#
# By parameterizing the URL for get-pip.py installation script, we allow
# third-party to use their own copy of the script stored in a private mirror.
# We set the default value to the PyPA owned get-pip.py script.
#
# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"

# PIP supports fetching the packages from custom indexes, allowing third-party
# to host the packages in private mirrors. The PIP_INDEX_URL and
# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
# default indexes. By letting them empty by default, PIP will use its default
# indexes if the build process doesn't override the indexes.
#
# Uv uses different variables. We set them by default to the same values as
# PIP, but they can be overridden.
ARG PIP_INDEX_URL
ARG PIP_EXTRA_INDEX_URL
ARG UV_INDEX_URL=${PIP_INDEX_URL}
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}

# PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly

# PIP supports multiple authentication schemes, including keyring
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
# disabled by default, we allow third-party to use keyring authentication for
# their private Python indexes, while not changing the default behavior which
# is no authentication.
#
# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag enables built-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Install Python and other dependencies.
# When a Deadsnakes mirror is configured, install its GPG key and apt source
# manually; otherwise fall back to add-apt-repository (retried, since the PPA
# endpoint is occasionally flaky).
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
            mkdir -p -m 0755 /etc/apt/keyrings ; \
            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
        fi ; \
    else \
        for i in 1 2 3; do \
            add-apt-repository -y ppa:deadsnakes/ppa && break || \
            { echo "Attempt $i failed, retrying in 5s..." ; sleep 5; } ; \
        done ; \
    fi \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/cuda.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL

# install build dependencies
COPY requirements/build.txt requirements/build.txt

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED=""

# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
        && sccache --show-stats; \
    fi

ARG vllm_target_device="cuda"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" != "1" ]; then \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi

# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=450
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
        python3 check-wheel-size.py dist; \
    else \
        echo "Skipping wheel size check."; \
    fi
#################### EXTENSION Build IMAGE ####################
#################### DEV IMAGE ####################
FROM base AS dev

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

# Install libnuma-dev, required by fastsafetensors (fixes #20384)
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*

COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

SHELL ["/bin/bash", "-c"]

ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Persist the digits-only python version (e.g. "312") for later RUN steps
# that source /etc/environment.
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
            mkdir -p -m 0755 /etc/apt/keyrings ; \
            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
        fi ; \
    else \
        for i in 1 2 3; do \
            add-apt-repository -y ppa:deadsnakes/ppa && break || \
            { echo "Attempt $i failed, retrying in 5s..." ; sleep 5; } ; \
        done ; \
    fi \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            --pre pytorch_triton==3.3.0+gitab727c40 ; \
    fi

# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system dist/*.whl --verbose \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout v0.2.6.post1
# $ python -m flashinfer.aot
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl

# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.3.1"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
ARG FLASHINFER_AOT_COMPILE=false
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
    --branch ${FLASHINFER_GIT_REF} \
    ${FLASHINFER_GIT_REPO} flashinfer
pushd flashinfer
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
    if [[ "${CUDA_VERSION}" == 11.* ]]; then
        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
    else
        # CUDA 12.8+ supports 10.0a and 12.0
        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
    fi
    echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
    # Build AOT kernels
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        python3 -m flashinfer.aot
    # Install with no-build-isolation since we already built AOT kernels
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        uv pip install --system --no-build-isolation . \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
    # Download pre-compiled cubins
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
else
    echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
    uv pip install --system . \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
fi
popd
rm -rf flashinfer
BASH

COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .

RUN --mount=type=cache,target=/root/.cache/uv \
    . /etc/environment && \
    uv pip list

# Even when we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}

# Install EP kernels(pplx-kernels and DeepEP), NixL
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
COPY tools/install_nixl.sh install_nixl.sh
ENV CUDA_HOME=/usr/local/cuda
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
    && bash install_python_libraries.sh \
    && bash install_nixl.sh --force
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

ARG PYTHON_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
        uv pip install --system -r requirements/dev.txt; \
    fi

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

# Source code is used in the `python_only_compile.sh` test
# We hide it inside `src/` so that this source code
# will not be imported by other tests
RUN mkdir src
RUN mv vllm src/vllm
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY requirements/kv_connectors.txt requirements/kv_connectors.txt

# install additional dependencies for openai api server
# (older bitsandbytes pinned on arm64, where newer releases lack wheels)
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        BITSANDBYTES_VERSION="0.42.0"; \
    else \
        BITSANDBYTES_VERSION="0.46.1"; \
    fi; \
    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]

ENV VLLM_USAGE_SOURCE=production-docker-image
# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker

COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]

FROM vllm-openai-base AS vllm-openai

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################