Skip xformers - vLLM has built-in FlashAttention kernels

The xformers build requires TORCH_STABLE_ONLY, which depends on torch/csrc/stable/ headers
that are not present in PyTorch 2.9.0. vLLM 0.18.1 includes its own FA2/FA3 kernels, so the separate wheel is unnecessary.
2026-04-03 05:50:02 +00:00
parent 45b6109ee1
commit 9d88d4c7d8
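
With the xformers wheel gone, one way to make the attention choice explicit at runtime is to pin vLLM to its bundled FlashAttention backend. The sketch below assumes a runtime stage that exists later in this Dockerfile but is not shown in this diff; VLLM_ATTENTION_BACKEND is a standard vLLM environment variable, and its use here is illustrative only.

# Hypothetical addition to the runtime stage (not part of this commit):
# pin vLLM to its bundled FlashAttention kernels so it never probes for xformers.
ENV VLLM_ATTENTION_BACKEND=FLASH_ATTN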


@@ -69,15 +69,14 @@ FROM build-base AS build-triton
 RUN mkdir -p /wheels && \
     pip download triton==3.6.0 --platform manylinux_2_27_aarch64 --only-binary=:all: --no-deps -d /wheels
 
-FROM build-base AS build-xformers
-#ARG XFORMERS_REF=v0.0.32.post2
-#ARG XFORMERS_BUILD_VERSION=0.0.30+cu130
-#ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
-RUN git clone https://github.com/facebookresearch/xformers.git
-RUN cd xformers && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
-    MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
+# Skip xformers - vLLM has built-in FlashAttention kernels
+# xformers requires TORCH_STABLE_ONLY which needs PyTorch headers not in 2.9.0
+# FROM build-base AS build-xformers
+# RUN git clone https://github.com/facebookresearch/xformers.git
+# RUN cd xformers && \
+#     git submodule sync && \
+#     git submodule update --init --recursive -j 8 && \
+#     MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-flashinfer
 ARG FLASHINFER_ENABLE_AOT=1
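
A rough sanity check for this removal would be a smoke test in the vLLM build or test stage confirming the bundled kernels import cleanly. The module path vllm.vllm_flash_attn below is an assumption about how vLLM 0.18.1 vendors its FA2/FA3 kernels; treat it as a sketch, not part of this commit.

# Hypothetical smoke test (assumed, not in this diff): after installing the vLLM
# wheel, verify its vendored FlashAttention entry point imports, which is what
# makes a separate xformers wheel unnecessary.
RUN python3 -c "from vllm.vllm_flash_attn import flash_attn_varlen_func; print('bundled FlashAttention OK')"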
@@ -173,7 +172,6 @@ COPY --from=build-flash-attention /wheels/* wheels/
 COPY --from=build-flashinfer /wheels/* wheels/
 COPY --from=build-triton /wheels/* wheels/
 COPY --from=build-vllm /wheels/* wheels/
-COPY --from=build-xformers /wheels/* wheels/
 COPY --from=build-lmcache /wheels/* wheels/
 COPY --from=build-infinistore /wheels/* wheels/
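
For context, the wheels staged under wheels/ above are typically installed in a single pass in the final image; after this commit no xformers wheel is among them, so nothing else has to change. The install line below is an assumed sketch using uv's --system flag and is not shown in this diff.

# Hypothetical install step in the final stage (assumed, not shown in this diff):
RUN uv pip install --system wheels/*.whl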