Updates for PyTorch 2.9, CUDA 13

This commit is contained in:
Rajesh Shashi Kumar
2025-10-20 20:16:06 +00:00
parent 02430037ea
commit ebcdb4ab50
2 changed files with 49 additions and 18 deletions

View File

@@ -1,4 +1,4 @@
ARG CUDA_VERSION=12.9.0
ARG CUDA_VERSION=13.0.1
ARG IMAGE_DISTRO=ubuntu24.04
ARG PYTHON_VERSION=3.12
@@ -9,7 +9,7 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
# 'a' suffix is not forward compatible but enables all optimizations
ARG TORCH_CUDA_ARCH_LIST="9.0a"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
ENV UV_TORCH_BACKEND=cu129
# ENV UV_TORCH_BACKEND=cu130
ARG VLLM_FA_CMAKE_GPU_ARCHES="90a-real"
ENV VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES}
@@ -46,12 +46,15 @@ ENV VIRTUAL_ENV=/workspace/.venv
ENV PATH=${VIRTUAL_ENV}/bin:${PATH}
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV CPLUS_INCLUDE_PATH=${CUDA_HOME}/include/cccl
ENV C_INCLUDE_PATH=${CUDA_HOME}/include/cccl
ENV PATH=${CUDA_HOME}/cuda/bin:${PATH}
RUN apt-get update && apt install -y wget
RUN uv pip install numpy==2.0.0
# Install pytorch nightly
RUN uv pip install torch==2.8.0+cu129 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129 --torch-backend=cu129
RUN uv pip install torch==2.9.0+cu130 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu130
FROM base AS build-base
RUN mkdir /wheels
@@ -62,7 +65,7 @@ RUN uv pip install -U build cmake ninja pybind11 setuptools wheel
FROM build-base AS build-triton
ARG TRITON_REF=release/3.4.x
ARG TRITON_BUILD_SUFFIX=+cu129
ARG TRITON_BUILD_SUFFIX=+cu130
ENV TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-}
RUN git clone https://github.com/triton-lang/triton.git
RUN cd triton && \
@@ -71,23 +74,23 @@ RUN cd triton && \
git submodule update --init --recursive -j 8 && \
uv build --wheel --no-build-isolation -o /wheels
RUN export MAX_JOBS=6
FROM build-base AS build-xformers
ARG XFORMERS_REF=v0.0.32.post2
ARG XFORMERS_BUILD_VERSION=0.0.30+cu129
ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
RUN git clone https://github.com/facebookresearch/xformers.git
#ARG XFORMERS_REF=v0.0.32.post2
#ARG XFORMERS_BUILD_VERSION=0.0.30+cu130
#ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
RUN git clone https://github.com/johnnynunez/xformers.git
# https://github.com/facebookresearch/xformers.git
RUN cd xformers && \
git checkout ${XFORMERS_REF} && \
# git checkout ${XFORMERS_REF} && \
git submodule sync && \
git submodule update --init --recursive -j 8 && \
uv build --wheel --no-build-isolation -o /wheels
MAX_JOBS=6 uv build --wheel --no-build-isolation -o /wheels
# Currently not supported on CUDA 12.8
FROM build-base AS build-flashinfer
ARG FLASHINFER_ENABLE_AOT=1
ARG FLASHINFER_REF=v0.4.1
ARG FLASHINFER_BUILD_SUFFIX=cu129
ARG FLASHINFER_BUILD_SUFFIX=cu130
ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
RUN cd flashinfer && \
@@ -100,20 +103,51 @@ FROM build-base AS build-lmcache
ARG LMCACHE_REF=v0.3.7
RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \
cd LMCache && \
# PyTorch version is dated in LMCache
sed -i '/torch/d' pyproject.toml && \
uv pip install setuptools_scm && \
python -m build --wheel --no-isolation && \
cp dist/*.whl /wheels/
# Build Flash Attention with the proven working approach
FROM build-base AS build-flash-attention
RUN apt-get update && apt-get install -y build-essential cmake gcc && \
git clone --depth=1 https://github.com/Dao-AILab/flash-attention flash-attention && \
cd flash-attention && \
mkdir wheels && \
export MAX_JOBS=8 && \
export NVCC_THREADS=1 && \
export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
MAX_JOBS=$MAX_JOBS \
CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
TORCH_CUDA_ARCH_LIST="9.0a" \
FLASH_ATTENTION_FORCE_BUILD="TRUE" \
FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \
pip3 wheel . -v --no-deps -w ./wheels/ && \
cp wheels/*.whl /wheels/
FROM build-base AS build-vllm
ARG VLLM_REF=v0.11.0
# Install ccache for faster compilation
RUN apt-get update && apt-get install -y ccache
# Copy Flash Attention wheel to use during vLLM build
COPY --from=build-flash-attention /wheels/* /tmp/fa-wheels/
RUN git clone https://github.com/vllm-project/vllm.git
RUN uv pip install /tmp/fa-wheels/flash_attn*.whl
RUN cd vllm && \
git checkout ${VLLM_REF} && \
git submodule sync && \
git submodule update --init --recursive -j 8 && \
export MAX_JOBS=16 && \
export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
python use_existing_torch.py && \
uv pip install -r requirements/build.txt && \
MAX_JOBS=16 uv build --wheel --no-build-isolation -o /wheels
pip install -r requirements/build.txt && \
MAX_JOBS=$MAX_JOBS \
CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
TORCH_CUDA_ARCH_LIST="9.0a" \
FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
pip install -e . --no-build-isolation
# Build infinistore after vllm to avoid cache invalidation
FROM build-base AS build-infinistore
@@ -157,9 +191,6 @@ RUN uv pip install accelerate hf_transfer modelscope bitsandbytes timm boto3 run
# Clean uv cache
RUN uv clean
# python3-config https://github.com/astral-sh/uv/issues/10263
RUN export PATH="$(dirname $(realpath .venv/bin/python)):$PATH"
# Install build tools and dependencies
RUN uv pip install -U build cmake ninja pybind11 setuptools==79.0.1 wheel

View File

@@ -6,6 +6,6 @@ Hosted [here](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm)
docker login
# Alternative
# docker buildx build --platform linux/arm64 --memory=600g -t rajesh550/gh200-vllm:0.9.0.1 .
docker build --memory=300g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.0 .
docker build --memory=450g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.0 .
docker push rajesh550/gh200-vllm:0.11.0
```