Updated for CUDA 13
This commit is contained in:
18
README.md
18
README.md
@@ -2,13 +2,31 @@
|
||||
|
||||
Currently, prebuilt wheels for `vLLM` and `LMCache` are not available for `aarch64`. This can make setup tedious when working on modern `aarch64` platforms such as the NVIDIA GH200.
|
||||
|
||||
Furthermore, NVIDIA does not currently publish the `Dockerfile` behind its NGC containers, which makes replacing individual components (for example, upgrading to a newer version of vLLM) tedious.
|
||||
|
||||
This repository provides a Dockerfile that builds a container with vLLM and all of its dependencies pre-installed, so you can experiment with features such as KV offloading right away.
|
||||
|
||||
If you prefer not to build the image yourself, you can pull the ready-to-use image directly from Docker Hub:
|
||||
|
||||
```bash
|
||||
docker run --rm -it --gpus all -v "$PWD":"$PWD" -w "$PWD" rajesh550/gh200-vllm:0.11.0 bash
|
||||
|
||||
# CUDA 13
|
||||
docker run --rm -it --gpus all -v "$PWD":"$PWD" -w "$PWD" rajesh550/gh200-vllm:0.11.1rc1 bash
|
||||
```
|
||||
|
||||
👉 [Docker Hub](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm/general)
|
||||
|
||||
Version info:
|
||||
|
||||
```bash
|
||||
CUDA: 13.0.1
|
||||
Ubuntu: 24.04
|
||||
Python: 3.12
|
||||
PyTorch: 2.9.0+cu130
|
||||
Triton: 3.5.x
|
||||
xformers: 0.0.32.post2+
|
||||
flashinfer: 0.4.0
|
||||
LMCache: 0.3.7
|
||||
vLLM: 0.11.1rc1
|
||||
```
|
||||
1
vllm/.gitignore
vendored
Normal file
1
vllm/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
build.log
|
||||
@@ -9,7 +9,7 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
|
||||
# 'a' suffix is not forward compatible but enables all optimizations
|
||||
ARG TORCH_CUDA_ARCH_LIST="9.0a"
|
||||
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||
# ENV UV_TORCH_BACKEND=cu130
|
||||
ENV UV_TORCH_BACKEND=cu130
|
||||
ARG VLLM_FA_CMAKE_GPU_ARCHES="90a-real"
|
||||
ENV VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES}
|
||||
|
||||
@@ -64,7 +64,7 @@ RUN mkdir /wheels
|
||||
RUN uv pip install -U build cmake ninja pybind11 setuptools wheel
|
||||
|
||||
FROM build-base AS build-triton
|
||||
ARG TRITON_REF=release/3.4.x
|
||||
ARG TRITON_REF=release/3.5.x
|
||||
ARG TRITON_BUILD_SUFFIX=+cu130
|
||||
ENV TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-}
|
||||
RUN git clone https://github.com/triton-lang/triton.git
|
||||
@@ -86,10 +86,9 @@ RUN cd xformers && \
|
||||
git submodule update --init --recursive -j 8 && \
|
||||
MAX_JOBS=6 uv build --wheel --no-build-isolation -o /wheels
|
||||
|
||||
# Currently not supported on CUDA 12.8
|
||||
FROM build-base AS build-flashinfer
|
||||
ARG FLASHINFER_ENABLE_AOT=1
|
||||
ARG FLASHINFER_REF=v0.4.1
|
||||
ARG FLASHINFER_REF=v0.4.0
|
||||
ARG FLASHINFER_BUILD_SUFFIX=cu130
|
||||
ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
|
||||
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
|
||||
@@ -109,45 +108,38 @@ RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \
|
||||
python -m build --wheel --no-isolation && \
|
||||
cp dist/*.whl /wheels/
|
||||
|
||||
# Build Flash Attention with the proven working approach
|
||||
FROM build-base AS build-flash-attention
|
||||
RUN apt-get update && apt-get install -y build-essential cmake gcc && \
|
||||
git clone --depth=1 https://github.com/Dao-AILab/flash-attention flash-attention && \
|
||||
cd flash-attention && \
|
||||
mkdir wheels && \
|
||||
export MAX_JOBS=8 && \
|
||||
export NVCC_THREADS=1 && \
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
|
||||
MAX_JOBS=$MAX_JOBS \
|
||||
CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
|
||||
TORCH_CUDA_ARCH_LIST="9.0a" \
|
||||
FLASH_ATTENTION_FORCE_BUILD="TRUE" \
|
||||
FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
|
||||
FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \
|
||||
pip3 wheel . -v --no-deps -w ./wheels/ && \
|
||||
cp wheels/*.whl /wheels/
|
||||
|
||||
# FROM build-base AS build-flash-attention
|
||||
# RUN apt-get update && apt-get install -y build-essential cmake gcc && \
|
||||
# git clone --depth=1 https://github.com/Dao-AILab/flash-attention flash-attention && \
|
||||
# cd flash-attention/hopper && \
|
||||
# mkdir wheels && \
|
||||
# export MAX_JOBS=8 && \
|
||||
# export NVCC_THREADS=1 && \
|
||||
# export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
|
||||
# MAX_JOBS=$MAX_JOBS \
|
||||
# CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
|
||||
# FLASH_ATTENTION_FORCE_BUILD="TRUE" \
|
||||
# FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
|
||||
# FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \
|
||||
# pip3 wheel . -v --no-deps -w ./wheels/ && \
|
||||
# cp wheels/*.whl /wheels/
|
||||
|
||||
FROM build-base AS build-vllm
|
||||
ARG VLLM_REF=v0.11.0
|
||||
ARG VLLM_REF=v0.11.1rc1
|
||||
# Install ccache for faster compilation
|
||||
RUN apt-get update && apt-get install -y ccache
|
||||
# Copy Flash Attention wheel to use during vLLM build
|
||||
COPY --from=build-flash-attention /wheels/* /tmp/fa-wheels/
|
||||
RUN git clone https://github.com/vllm-project/vllm.git
|
||||
RUN uv pip install /tmp/fa-wheels/flash_attn*.whl
|
||||
RUN cd vllm && \
|
||||
git checkout ${VLLM_REF} && \
|
||||
git submodule sync && \
|
||||
git submodule update --init --recursive -j 8 && \
|
||||
export MAX_JOBS=16 && \
|
||||
sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \
|
||||
export MAX_JOBS=8 && \
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
|
||||
python use_existing_torch.py && \
|
||||
pip install -r requirements/build.txt && \
|
||||
MAX_JOBS=$MAX_JOBS \
|
||||
CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
|
||||
TORCH_CUDA_ARCH_LIST="9.0a" \
|
||||
FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
|
||||
pip install -e . --no-build-isolation
|
||||
uv pip install -r requirements/build.txt && \
|
||||
CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels
|
||||
|
||||
# Build infinistore after vllm to avoid cache invalidation
|
||||
FROM build-base AS build-infinistore
|
||||
|
||||
@@ -6,6 +6,6 @@ Hosted [here](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm)
|
||||
docker login
|
||||
# Alternative
|
||||
# docker buildx build --platform linux/arm64 --memory=600g -t rajesh550/gh200-vllm:0.9.0.1 .
|
||||
docker build --memory=450g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.0 .
|
||||
docker push rajesh550/gh200-vllm:0.11.0
|
||||
docker build --memory=450g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.1rc1 . 2>&1 | tee build.log
|
||||
docker push rajesh550/gh200-vllm:0.11.1rc1
|
||||
```
|
||||
Reference in New Issue
Block a user