From 3c4796ed55a9ed2768248d32e9837fd039da80c7 Mon Sep 17 00:00:00 2001
From: Rajesh Shashi Kumar <35628747+rajesh-s@users.noreply.github.com>
Date: Tue, 21 Oct 2025 19:21:13 +0000
Subject: [PATCH] Updated for CUDA 13

---
 README.md       | 18 ++++++++++++++++
 vllm/.gitignore |  1 +
 vllm/Dockerfile | 56 +++++++++++++++++++++----------------------------
 vllm/README.md  |  4 ++--
 4 files changed, 45 insertions(+), 34 deletions(-)
 create mode 100644 vllm/.gitignore

diff --git a/README.md b/README.md
index d5f1e93..41ba236 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,31 @@
 
 Currently, prebuilt wheels for `vLLM` and `LMcache` are not available for `aarch64`. This can make setup tedious when working on modern `aarch64` platforms such as NVIDIA GH200.
 
+Further, NVIDIA does not currently publish the `Dockerfile` behind its NGC containers, which makes replacing individual components (such as moving to a newer version of vLLM) tedious.
+
 This repository provides a Dockerfile to build a container with vLLM and all its dependencies pre-installed to try out various things such as KV offloading.
 
 If you prefer not to build the image yourself, you can pull the ready-to-use image directly from Docker Hub:
 
 ```bash
 docker run --rm -it --gpus all -v "$PWD":"$PWD" -w "$PWD" rajesh550/gh200-vllm:0.11.0 bash
+
+# CUDA 13
+docker run --rm -it --gpus all -v "$PWD":"$PWD" -w "$PWD" rajesh550/gh200-vllm:0.11.1rc1 bash
 ```
 
 👉 [Docker Hub](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm/general)
 
+Version info:
+
+```bash
+CUDA: 13.0.1
+Ubuntu: 24.04
+Python: 3.12
+PyTorch: 2.9.0+cu130
+Triton: 3.5.x
+xformers: 0.0.32.post2+
+flashinfer: 0.4.0
+LMCache: 0.3.7
+vLLM: 0.11.1rc1
+```
\ No newline at end of file
diff --git a/vllm/.gitignore b/vllm/.gitignore
new file mode 100644
index 0000000..1b06175
--- /dev/null
+++ b/vllm/.gitignore
@@ -0,0 +1 @@
+build.log
\ No newline at end of file
diff --git a/vllm/Dockerfile b/vllm/Dockerfile
index 0957bfd..c03a64c 100644
--- a/vllm/Dockerfile
+++ b/vllm/Dockerfile
@@ -9,7 +9,7 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
 # 'a' suffix is not forward compatible but enables all optimizations
 ARG TORCH_CUDA_ARCH_LIST="9.0a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
-# ENV UV_TORCH_BACKEND=cu130
+ENV UV_TORCH_BACKEND=cu130
 ARG VLLM_FA_CMAKE_GPU_ARCHES="90a-real"
 ENV VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES}
 
@@ -64,7 +64,7 @@ RUN mkdir /wheels
 RUN uv pip install -U build cmake ninja pybind11 setuptools wheel
 
 FROM build-base AS build-triton
-ARG TRITON_REF=release/3.4.x
+ARG TRITON_REF=release/3.5.x
 ARG TRITON_BUILD_SUFFIX=+cu130
 ENV TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-}
 RUN git clone https://github.com/triton-lang/triton.git
@@ -86,10 +86,9 @@ RUN cd xformers && \
     git submodule update --init --recursive -j 8 && \
     MAX_JOBS=6 uv build --wheel --no-build-isolation -o /wheels
 
-# Currently not supported on CUDA 12.8
 FROM build-base AS build-flashinfer
 ARG FLASHINFER_ENABLE_AOT=1
-ARG FLASHINFER_REF=v0.4.1
+ARG FLASHINFER_REF=v0.4.0
 ARG FLASHINFER_BUILD_SUFFIX=cu130
 ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
 RUN git clone https://github.com/flashinfer-ai/flashinfer.git
@@ -109,45 +108,38 @@ RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \
     python -m build --wheel --no-isolation && \
     cp dist/*.whl /wheels/
 
-# Build Flash Attention with the proven working approach
-FROM build-base AS build-flash-attention
-RUN apt-get update && apt-get install -y build-essential cmake gcc && \
-    git clone --depth=1 https://github.com/Dao-AILab/flash-attention flash-attention && \
-    cd flash-attention && \
-    mkdir wheels && \
-    export MAX_JOBS=8 && \
-    export NVCC_THREADS=1 && \
-    export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
-    MAX_JOBS=$MAX_JOBS \
-    CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
-    TORCH_CUDA_ARCH_LIST="9.0a" \
-    FLASH_ATTENTION_FORCE_BUILD="TRUE" \
-    FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
-    FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \
-    pip3 wheel . -v --no-deps -w ./wheels/ && \
-    cp wheels/*.whl /wheels/
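+# Standalone flash-attention stage, kept for reference but disabled: build-vllm no longer copies or installs its wheel.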
+# FROM build-base AS build-flash-attention
+# RUN apt-get update && apt-get install -y build-essential cmake gcc && \
+#     git clone --depth=1 https://github.com/Dao-AILab/flash-attention flash-attention && \
+#     cd flash-attention/hopper && \
+#     mkdir wheels && \
+#     export MAX_JOBS=8 && \
+#     export NVCC_THREADS=1 && \
+#     export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
+#     MAX_JOBS=$MAX_JOBS \
+#     CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
+#     FLASH_ATTENTION_FORCE_BUILD="TRUE" \
+#     FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
+#     FLASH_ATTENTION_SKIP_CUDA_BUILD="FALSE" \
+#     pip3 wheel . -v --no-deps -w ./wheels/ && \
+#     cp wheels/*.whl /wheels/
 
 FROM build-base AS build-vllm
-ARG VLLM_REF=v0.11.0
+ARG VLLM_REF=v0.11.1rc1
 # Install ccache for faster compilation
 RUN apt-get update && apt-get install -y ccache
-# Copy Flash Attention wheel to use during vLLM build
-COPY --from=build-flash-attention /wheels/* /tmp/fa-wheels/
 RUN git clone https://github.com/vllm-project/vllm.git
-RUN uv pip install /tmp/fa-wheels/flash_attn*.whl
 RUN cd vllm && \
     git checkout ${VLLM_REF} && \
     git submodule sync && \
     git submodule update --init --recursive -j 8 && \
-    export MAX_JOBS=16 && \
+    sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \
+    export MAX_JOBS=8 && \
     export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
     python use_existing_torch.py && \
-    pip install -r requirements/build.txt && \
-    MAX_JOBS=$MAX_JOBS \
-    CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
-    TORCH_CUDA_ARCH_LIST="9.0a" \
-    FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE" \
-    pip install -e . --no-build-isolation
+    uv pip install -r requirements/build.txt && \
+    CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels
 
 # Build infinistore after vllm to avoid cache invalidation
 FROM build-base AS build-infinistore
diff --git a/vllm/README.md b/vllm/README.md
index 460db26..7b9d65a 100644
--- a/vllm/README.md
+++ b/vllm/README.md
@@ -6,6 +6,6 @@ Hosted [here](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm)
  docker login
 # Alternative
 # docker buildx build --platform linux/arm64 --memory=600g -t rajesh550/gh200-vllm:0.9.0.1 .
- docker build --memory=450g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.0 .  
- docker push rajesh550/gh200-vllm:0.11.0
+ docker build --memory=450g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.1rc1 . 2>&1 | tee build.log
+ docker push rajesh550/gh200-vllm:0.11.1rc1
 ```
\ No newline at end of file