Skip xformers - vLLM has built-in FlashAttention kernels

The xformers build requires TORCH_STABLE_ONLY, which depends on torch/csrc/stable/ headers
that are not present in PyTorch 2.9.0. vLLM 0.18.1 includes its own FA2/FA3 kernels, so the separate wheel is unnecessary.
2026-04-03 05:50:02 +00:00
parent 45b6109ee1
commit 9d88d4c7d8
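
With the xformers wheel gone, one way to make the attention choice explicit at runtime is to pin vLLM to its bundled FlashAttention backend. The sketch below assumes a runtime stage that exists later in this Dockerfile but is not shown in this diff; VLLM_ATTENTION_BACKEND is a standard vLLM environment variable, and its use here is illustrative only.

# Hypothetical addition to the runtime stage (not part of this commit):
# pin vLLM to its bundled FlashAttention kernels so it never probes for xformers.
ENV VLLM_ATTENTION_BACKEND=FLASH_ATTN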


@@ -69,15 +69,14 @@ FROM build-base AS build-triton
 RUN mkdir -p /wheels && \
     pip download triton==3.6.0 --platform manylinux_2_27_aarch64 --only-binary=:all: --no-deps -d /wheels
 
-FROM build-base AS build-xformers
-#ARG XFORMERS_REF=v0.0.32.post2
-#ARG XFORMERS_BUILD_VERSION=0.0.30+cu130
-#ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
-RUN git clone https://github.com/facebookresearch/xformers.git
-RUN cd xformers && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
-    MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
+# Skip xformers - vLLM has built-in FlashAttention kernels
+# xformers requires TORCH_STABLE_ONLY which needs PyTorch headers not in 2.9.0
+# FROM build-base AS build-xformers
+# RUN git clone https://github.com/facebookresearch/xformers.git
+# RUN cd xformers && \
+#     git submodule sync && \
+#     git submodule update --init --recursive -j 8 && \
+#     MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-flashinfer
 ARG FLASHINFER_ENABLE_AOT=1
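
A rough sanity check for this removal would be a smoke test in the vLLM build or test stage confirming the bundled kernels import cleanly. The module path vllm.vllm_flash_attn below is an assumption about how vLLM 0.18.1 vendors its FA2/FA3 kernels; treat it as a sketch, not part of this commit.

# Hypothetical smoke test (assumed, not in this diff): after installing the vLLM
# wheel, verify its vendored FlashAttention entry point imports, which is what
# makes a separate xformers wheel unnecessary.
RUN python3 -c "from vllm.vllm_flash_attn import flash_attn_varlen_func; print('bundled FlashAttention OK')"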
@@ -173,7 +172,6 @@ COPY --from=build-flash-attention /wheels/* wheels/
 COPY --from=build-flashinfer /wheels/* wheels/
 COPY --from=build-triton /wheels/* wheels/
 COPY --from=build-vllm /wheels/* wheels/
-COPY --from=build-xformers /wheels/* wheels/
 COPY --from=build-lmcache /wheels/* wheels/
 COPY --from=build-infinistore /wheels/* wheels/
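
For context, the wheels staged under wheels/ above are typically installed in a single pass in the final image; after this commit no xformers wheel is among them, so nothing else has to change. The install line below is an assumed sketch using uv's --system flag and is not shown in this diff.

# Hypothetical install step in the final stage (assumed, not shown in this diff):
RUN uv pip install --system wheels/*.whl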