From 3c4796ed55a9ed2768248d32e9837fd039da80c7 Mon Sep 17 00:00:00 2001 From: Rajesh Shashi Kumar <35628747+rajesh-s@users.noreply.github.com> Date: Tue, 21 Oct 2025 19:21:13 +0000 Subject: [PATCH] Updated for CUDA 13 --- README.md | 18 ++++++++++++++++ vllm/.gitignore | 1 + vllm/Dockerfile | 56 +++++++++++++++++++++---------------------------- vllm/README.md | 4 ++-- 4 files changed, 45 insertions(+), 34 deletions(-) create mode 100644 vllm/.gitignore diff --git a/README.md b/README.md index d5f1e93..41ba236 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,31 @@ Currently, prebuilt wheels for `vLLM` and `LMcache` are not available for `aarch64`. This can make setup tedious when working on modern `aarch64` platforms such as NVIDIA GH200. +Further, NVIDIA does not currently provide the `Dockerfile` associated with its NGC containers, which makes replacing some of the components (for example, upgrading to a newer version of vLLM) tedious. + This repository provides a Dockerfile to build a container with vLLM and all its dependencies pre-installed to try out various things such as KV offloading. 
If you prefer not to build the image yourself, you can pull the ready-to-use image directly from Docker Hub: ```bash docker run --rm -it --gpus all -v "$PWD":"$PWD" -w "$PWD" rajesh550/gh200-vllm:0.11.0 bash + +# CUDA 13 +docker run --rm -it --gpus all -v "$PWD":"$PWD" -w "$PWD" rajesh550/gh200-vllm:0.11.1rc1 bash ``` 👉 [Docker Hub](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm/general) +Version info: + +```bash +CUDA: 13.0.1 +Ubuntu: 24.04 +Python: 3.12 +PyTorch: 2.9.0+cu130 +Triton: 3.5.x +xformers: 0.32.post2+ +flashinfer: 0.4.0 +LMCache: 0.3.7 +vLLM: 0.11.1rc1 +``` \ No newline at end of file diff --git a/vllm/.gitignore b/vllm/.gitignore new file mode 100644 index 0000000..1b06175 --- /dev/null +++ b/vllm/.gitignore @@ -0,0 +1 @@ +build.log \ No newline at end of file diff --git a/vllm/Dockerfile b/vllm/Dockerfile index 0957bfd..c03a64c 100644 --- a/vllm/Dockerfile +++ b/vllm/Dockerfile @@ -9,7 +9,7 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base # 'a' suffix is not forward compatible but enables all optimizations ARG TORCH_CUDA_ARCH_LIST="9.0a" ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} -# ENV UV_TORCH_BACKEND=cu130 +ENV UV_TORCH_BACKEND=cu130 ARG VLLM_FA_CMAKE_GPU_ARCHES="90a-real" ENV VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES} @@ -64,7 +64,7 @@ RUN mkdir /wheels RUN uv pip install -U build cmake ninja pybind11 setuptools wheel FROM build-base AS build-triton -ARG TRITON_REF=release/3.4.x +ARG TRITON_REF=release/3.5.x ARG TRITON_BUILD_SUFFIX=+cu130 ENV TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-} RUN git clone https://github.com/triton-lang/triton.git @@ -86,10 +86,9 @@ RUN cd xformers && \ git submodule update --init --recursive -j 8 && \ MAX_JOBS=6 uv build --wheel --no-build-isolation -o /wheels -# Currently not supported on CUDA 12.8 FROM build-base AS build-flashinfer ARG FLASHINFER_ENABLE_AOT=1 -ARG FLASHINFER_REF=v0.4.1 +ARG FLASHINFER_REF=v0.4.0 ARG FLASHINFER_BUILD_SUFFIX=cu130 ENV 
FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-} RUN git clone https://github.com/flashinfer-ai/flashinfer.git @@ -109,45 +108,38 @@ RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \ python -m build --wheel --no-isolation && \ cp dist/*.whl /wheels/ -# Build Flash Attention with the proven working approach -FROM build-base AS build-flash-attention -RUN apt-get update && apt-get install -y build-essential cmake gcc && \ - git clone --depth=1 https://github.com/Dao-AILab/flash-attention flash-attention && \ - cd flash-attention && \ - mkdir wheels && \ - export MAX_JOBS=8 && \ - export NVCC_THREADS=1 && \ - export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \ - MAX_JOBS=$MAX_JOBS \ - CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \ - TORCH_CUDA_ARCH_LIST="9.0a" \ - FLASH_ATTENTION_FORCE_BUILD="TRUE" \ - FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \ - FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \ - pip3 wheel . -v --no-deps -w ./wheels/ && \ - cp wheels/*.whl /wheels/ + +# FROM build-base AS build-flash-attention +# RUN apt-get update && apt-get install -y build-essential cmake gcc && \ +# git clone --depth=1 https://github.com/Dao-AILab/flash-attention flash-attention && \ +# cd flash-attention/hopper && \ +# mkdir wheels && \ +# export MAX_JOBS=8 && \ +# export NVCC_THREADS=1 && \ +# export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \ +# MAX_JOBS=$MAX_JOBS \ +# CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \ +# FLASH_ATTENTION_FORCE_BUILD="TRUE" \ +# FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \ +# FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \ +# pip3 wheel . 
-v --no-deps -w ./wheels/ && \ +# cp wheels/*.whl /wheels/ FROM build-base AS build-vllm -ARG VLLM_REF=v0.11.0 +ARG VLLM_REF=v0.11.1rc1 # Install ccache for faster compilation RUN apt-get update && apt-get install -y ccache -# Copy Flash Attention wheel to use during vLLM build -COPY --from=build-flash-attention /wheels/* /tmp/fa-wheels/ RUN git clone https://github.com/vllm-project/vllm.git -RUN uv pip install /tmp/fa-wheels/flash_attn*.whl RUN cd vllm && \ git checkout ${VLLM_REF} && \ git submodule sync && \ git submodule update --init --recursive -j 8 && \ - export MAX_JOBS=16 && \ + sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \ + export MAX_JOBS=8 && \ export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \ python use_existing_torch.py && \ - pip install -r requirements/build.txt && \ - MAX_JOBS=$MAX_JOBS \ - CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \ - TORCH_CUDA_ARCH_LIST="9.0a" \ - FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \ - pip install -e . --no-build-isolation + uv pip install -r requirements/build.txt && \ + CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels # Build infinistore after vllm to avoid cache invalidation FROM build-base AS build-infinistore diff --git a/vllm/README.md b/vllm/README.md index 460db26..7b9d65a 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -6,6 +6,6 @@ Hosted [here](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm) docker login # Alternative # docker buildx build --platform linux/arm64 --memory=600g -t rajesh550/gh200-vllm:0.9.0.1 . - docker build --memory=450g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.0 . - docker push rajesh550/gh200-vllm:0.11.0 + docker build --memory=450g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.1rc1 . 2>&1 | tee build.log + docker push rajesh550/gh200-vllm:0.11.1rc1 ``` \ No newline at end of file