PyTorch 2.9.0+cu130 was compiled with CUDA 12.8 but container has CUDA 13.0. Skip CUDA extension build to avoid version mismatch.
215 lines
7.9 KiB
Docker
215 lines
7.9 KiB
Docker
# Global build arguments. They are declared before the first FROM so they can
# be used in FROM lines; a stage that needs one of them inside its own
# instructions has to re-declare it with a bare `ARG <name>`.
ARG CUDA_VERSION="12.8.1"
ARG IMAGE_DISTRO="ubuntu24.04"
ARG PYTHON_VERSION="3.12"
|
|
|
|
# ---------- Builder Base ----------
# Every other stage in this file derives (directly or indirectly) from `base`.
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base

# GPU architecture targets shared by all stages.
# The 'a' suffix is not forward compatible but enables all optimizations.
ARG TORCH_CUDA_ARCH_LIST="9.0a"
ARG VLLM_FA_CMAKE_GPU_ARCHES="90a-real"

# Export the architecture lists and the uv torch backend to every later RUN.
ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" \
    VLLM_FA_CMAKE_GPU_ARCHES="${VLLM_FA_CMAKE_GPU_ARCHES}" \
    UV_TORCH_BACKEND="cu128"
|
|
|
|
# Update apt packages and install system dependencies.
#
# Index refresh, upgrade, install and cleanup run in ONE layer:
#   * a separately cached `update` layer can go stale and make a later install
#     fetch packages that no longer exist;
#   * files removed in a later layer still occupy space in the earlier one, so
#     cleanup only shrinks the image when it shares the layer that created them.
# `apt-get` is used instead of `apt`, whose CLI is not meant for scripts.
#
# DEBIAN_FRONTEND is kept as ENV because later RUN instructions in this file
# also install packages and rely on non-interactive behaviour.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
        curl \
        git \
        libibverbs-dev \
        libnuma-dev \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives
|
|
|
|
# Set compiler paths: pin the C/C++ compilers used by every native build.
# QEMU_CPU=max is exported alongside them — presumably so emulated (QEMU)
# builds expose all CPU features; confirm against the build host setup.
ENV CC=/usr/bin/gcc \
    CXX=/usr/bin/g++ \
    QEMU_CPU=max
|
|
|
|
# Install uv into /usr/local/bin.
# The installer is downloaded to a file first instead of being piped into sh:
# without `pipefail`, a failed curl in `curl | sh` is masked by the exit status
# of `sh`, and the build would continue without uv.
RUN curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh \
    && env UV_INSTALL_DIR=/usr/local/bin sh /tmp/uv-install.sh \
    && rm -f /tmp/uv-install.sh
|
|
|
|
# Setup build workspace
WORKDIR /workspace

# Prep build venv.
# PYTHON_VERSION is re-declared here so the value of the global ARG is visible
# inside this stage. `uv venv` without a path creates ./.venv, i.e.
# /workspace/.venv, which VIRTUAL_ENV points at below.
ARG PYTHON_VERSION
RUN uv venv -p ${PYTHON_VERSION} --seed --python-preference only-managed

ENV VIRTUAL_ENV=/workspace/.venv \
    CUDA_HOME=/usr/local/cuda

# Put the CUDA toolkit binaries and the venv on PATH.
# The CUDA entry is ${CUDA_HOME}/bin; the previous value ${CUDA_HOME}/cuda/bin
# expanded to /usr/local/cuda/cuda/bin, which is not where the toolkit
# installs its binaries.
ENV PATH=${CUDA_HOME}/bin:${VIRTUAL_ENV}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Make the CCCL headers shipped under the CUDA include directory visible to
# both the C and C++ compilers.
ENV CPLUS_INCLUDE_PATH=${CUDA_HOME}/include/cccl \
    C_INCLUDE_PATH=${CUDA_HOME}/include/cccl
|
|
|
|
# Install wget. Refresh, install and index cleanup share one layer, and
# `apt-get` is used throughout (the `apt` CLI is not intended for scripts).
RUN apt-get update \
    && apt-get install -y wget \
    && rm -rf /var/lib/apt/lists/*

# Pin numpy ahead of the torch install.
RUN uv pip install numpy==2.0.0

# Install the pinned PyTorch 2.7.0 CUDA 12.8 build (not a nightly) from the
# cu128 wheel index; torchvision and torchaudio are resolved against it.
RUN uv pip install torch==2.7.0+cu128 torchvision torchaudio \
    --extra-index-url https://download.pytorch.org/whl/cu128
|
|
|
|
# Common parent of all build-* stages.
FROM base AS build-base

# Output directory into which each build stage places its wheels.
RUN mkdir /wheels

# Install build deps that aren't in project requirements files.
# setuptools is pinned to <81 for LMCache compatibility (needs >=77.0.3,<81.0.0).
RUN uv pip install -U \
        build \
        cmake \
        ninja \
        pybind11 \
        "setuptools>=77.0.3,<81.0.0" \
        wheel
|
|
|
|
# Use PyPI triton wheel instead of building (QEMU segfaults during triton build).
# Only the binary aarch64 wheel itself is fetched (no dependencies) and stored
# in /wheels for the final stage.
FROM build-base AS build-triton
RUN mkdir -p /wheels \
    && pip download triton==3.6.0 \
        --platform manylinux_2_27_aarch64 \
        --only-binary=:all: \
        --no-deps \
        -d /wheels
|
|
|
|
# Skip xformers - vLLM has built-in FlashAttention kernels
|
|
# xformers requires TORCH_STABLE_ONLY which needs PyTorch headers not in 2.9.0
|
|
# FROM build-base AS build-xformers
|
|
# RUN git clone https://github.com/facebookresearch/xformers.git
|
|
# RUN cd xformers && \
|
|
# git submodule sync && \
|
|
# git submodule update --init --recursive -j 8 && \
|
|
# MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
|
|
|
|
# Build the FlashInfer wheel from a pinned git ref.
FROM build-base AS build-flashinfer

# ARG values are visible as environment variables to the RUN instructions of
# this stage. How the FlashInfer build consumes FLASHINFER_ENABLE_AOT is not
# visible in this file.
ARG FLASHINFER_ENABLE_AOT=1
ARG FLASHINFER_REF=v0.6.6
# NOTE(review): the suffix says cu130 while this file targets CUDA 12.8
# (UV_TORCH_BACKEND=cu128, torch +cu128) — confirm the intended label.
ARG FLASHINFER_BUILD_SUFFIX=cu130
ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}

RUN git clone https://github.com/flashinfer-ai/flashinfer.git

# Check out the requested ref, sync submodules, then build the wheel against
# the already-installed torch (no build isolation) into /wheels.
RUN cd flashinfer \
    && git checkout ${FLASHINFER_REF} \
    && git submodule sync \
    && git submodule update --init --recursive -j 8 \
    && uv build --wheel --no-build-isolation -o /wheels
|
|
|
|
# Build the LMCache wheel.
FROM build-base AS build-lmcache

# Bleeding edge: build from dev branch (v0.4.2+).
# The PyTorch version in LMCache's pyproject.toml is dated, so every line
# mentioning torch is deleted and the torch already in the venv is used.
# NO_CUDA_EXT=1 is set for the wheel build; what it switches off is decided by
# LMCache's build scripts, which are not visible in this file.
RUN git clone https://github.com/LMCache/LMCache.git \
    && cd LMCache \
    && git checkout dev \
    && sed -i '/torch/d' pyproject.toml \
    && uv pip install setuptools_scm \
    && NO_CUDA_EXT=1 MAX_JOBS=8 python -m build --wheel --no-isolation \
    && cp dist/*.whl /wheels/
|
|
|
|
|
|
# Build the flash-attention wheel from the repository's `hopper` subdirectory.
FROM build-base AS build-flash-attention

# The parallelism limits and FLASH_ATTENTION_* switches are exported once so
# the `pip wheel` invocation inherits all of them. The wheel is built against
# the installed torch (no build isolation, no dependency wheels) and copied
# to /wheels.
RUN apt-get update \
    && apt-get install -y build-essential cmake gcc \
    && git clone https://github.com/Dao-AILab/flash-attention flash-attention \
    && cd flash-attention/hopper \
    && mkdir wheels \
    && export MAX_JOBS=8 NVCC_THREADS=4 \
    && export CMAKE_BUILD_PARALLEL_LEVEL="${MAX_JOBS}" \
    && export FLASH_ATTENTION_FORCE_BUILD="TRUE" \
              FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
              FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \
    && pip wheel . -v --no-deps --no-build-isolation -w ./wheels/ \
    && cp wheels/*.whl /wheels/
|
|
|
|
# ==============================================================================
# NOTE: vLLM is built natively from source in this stage.
# For QEMU testing (where cmake fails during try_compile), comment out this
# stage and enable the commented-out PyPI wheel download stage instead.
# ==============================================================================
FROM build-base AS build-vllm

# Bleeding edge: build from main branch
ARG VLLM_REF=main

# Install ccache for faster compilation
RUN apt-get update && apt-get install -y ccache

RUN git clone https://github.com/vllm-project/vllm.git

# Source patches applied before building:
#   * point the vllm_flash_attn external project at `main` instead of a pinned
#     40-hex commit hash;
#   * remove the `fail_idx` variable, its use as a call argument, and its
#     mention in the cuMemcpyBatchAsync error message in csrc/cache_kernels.cu
#     (presumably to match the CUDA toolkit's API signature — confirm).
# use_existing_torch.py is then run so the build uses the torch already in the
# venv, the build requirements are installed, and the wheel lands in /wheels.
RUN cd vllm \
    && git checkout ${VLLM_REF} \
    && git submodule sync \
    && git submodule update --init --recursive -j 8 \
    && sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake \
    && sed -i '/size_t fail_idx = 0;/d' csrc/cache_kernels.cu \
    && sed -i 's/, \&fail_idx,/,/' csrc/cache_kernels.cu \
    && sed -i 's/"cuMemcpyBatchAsync failed at index ",\s*fail_idx, " with error "/"cuMemcpyBatchAsync failed with error "/' csrc/cache_kernels.cu \
    && export MAX_JOBS=8 \
    && export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
    && python use_existing_torch.py \
    && uv pip install -r requirements/build.txt \
    && CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels
|
|
|
|
# Use PyPI vLLM wheel (QEMU cmake fails during try_compile)
|
|
# FROM build-base AS build-vllm
|
|
# ARG VLLM_VERSION=0.18.1
|
|
# RUN mkdir -p /wheels && \
|
|
# pip download vllm==${VLLM_VERSION} --platform manylinux_2_31_aarch64 --only-binary=:all: --no-deps -d /wheels
|
|
|
|
# Build infinistore after vllm to avoid cache invalidation
FROM build-base AS build-infinistore

# Install additional dependencies needed for building infinistore on aarch64.
# `apt-get` replaces `apt`, whose command-line interface is not meant for
# scripted use.
RUN apt-get update && apt-get install -y \
        cmake \
        libboost-all-dev \
        libboost-dev \
        libspdlog-dev \
        libuv1-dev \
        meson \
        pybind11-dev \
        python3-dev

# Build flatbuffers from source with proper CMake version.
# The job count is bounded by the CPU count: a bare `-j` hands the native
# build tool an unlimited job count, which can exhaust memory on the builder.
RUN git clone -b v1.12.0 https://github.com/google/flatbuffers.git \
    && cd flatbuffers \
    && cmake -B build -DFLATBUFFERS_BUILD_TESTS=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
    && cmake --build build -j "$(nproc)" \
    && cmake --install build

# Build InfiniStore from source as a Python package.
# The editable install followed by an uninstall is kept exactly as before
# (presumably it produces build artefacts the wheel build needs — confirm);
# the wheel is then built without isolation and copied to /wheels.
RUN git clone https://github.com/bytedance/InfiniStore \
    && cd InfiniStore \
    && uv pip install meson \
    && uv pip install --no-deps --no-build-isolation -e . \
    && uv pip uninstall infinistore \
    && python -m build --wheel --no-isolation \
    && cp dist/*.whl /wheels/
|
|
|
|
# ---------- Final image ----------
FROM base AS vllm-openai

# Install the wheels produced by the build stages (infinistore is now built as
# a wheel too).
#
# Each stage's /wheels directory is bind-mounted for the duration of this RUN
# only. Copying the wheels in with COPY and deleting them in a later RUN does
# not shrink the image: the COPY layers keep every wheel forever. With bind
# mounts the wheel files never enter an image layer. Requires BuildKit.
RUN --mount=type=bind,from=build-flash-attention,source=/wheels,target=/mnt/wheels/flash-attention \
    --mount=type=bind,from=build-flashinfer,source=/wheels,target=/mnt/wheels/flashinfer \
    --mount=type=bind,from=build-triton,source=/wheels,target=/mnt/wheels/triton \
    --mount=type=bind,from=build-vllm,source=/wheels,target=/mnt/wheels/vllm \
    --mount=type=bind,from=build-lmcache,source=/wheels,target=/mnt/wheels/lmcache \
    --mount=type=bind,from=build-infinistore,source=/wheels,target=/mnt/wheels/infinistore \
    uv pip install /mnt/wheels/*/*.whl
|
|
|
|
# Install pynvml and pandas
RUN uv pip install pynvml pandas

# Add additional packages for vLLM OpenAI
# Bleeding edge: latest transformers
# The extras requirement is quoted: unquoted, `[s3]` is a shell glob pattern
# and would be rewritten if a matching file name existed in the working dir.
RUN uv pip install --upgrade \
        accelerate \
        hf_transfer \
        modelscope \
        bitsandbytes \
        timm \
        boto3 \
        runai-model-streamer \
        "runai-model-streamer[s3]" \
        tensorizer \
        transformers

# Clean uv cache (`uv cache clean` is the documented command)
RUN uv cache clean

# Install build tools and dependencies
RUN uv pip install -U build cmake ninja pybind11 setuptools==79.0.1 wheel

# Enable hf-transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1
RUN uv pip install datasets aiohttp
|
|
|
|
# Install nsys for profiling, plus tmux and cmake.
# NSYS_PKG names an arm64 .deb; override both ARGs for a different release.
ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_5/
ARG NSYS_PKG=nsight-systems-cli-2025.5.1_2025.5.1.121-1_arm64.deb

# Everything that needs the apt index runs in the same layer as
# `apt-get update`, so a cached index can never be stale relative to an
# install, and the index is removed before the layer is committed.
RUN apt-get update \
    && apt-get install -y wget libglib2.0-0 \
    && wget "${NSYS_URL}${NSYS_PKG}" \
    && dpkg -i "${NSYS_PKG}" \
    && rm "${NSYS_PKG}" \
    && apt-get install -y --no-install-recommends tmux cmake \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Deprecated cleanup: remove the pynvml package and install nvidia-ml-py in
# its place.
RUN uv pip uninstall pynvml \
    && uv pip install nvidia-ml-py

# API server entrypoint (currently disabled; the container starts a shell).
# ENTRYPOINT ["vllm", "serve"]
CMD ["/bin/bash"]
|