Skip xformers - vLLM has built-in FlashAttention kernels
xformers requires TORCH_STABLE_ONLY which needs torch/csrc/stable/ headers not present in PyTorch 2.9.0. vLLM 0.18.1 includes its own FA2/FA3 kernels.
This commit is contained in:
@@ -69,15 +69,14 @@ FROM build-base AS build-triton
 RUN mkdir -p /wheels && \
     pip download triton==3.6.0 --platform manylinux_2_27_aarch64 --only-binary=:all: --no-deps -d /wheels
 
-FROM build-base AS build-xformers
-#ARG XFORMERS_REF=v0.0.32.post2
-#ARG XFORMERS_BUILD_VERSION=0.0.30+cu130
-#ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
-RUN git clone https://github.com/facebookresearch/xformers.git
-RUN cd xformers && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
-    MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
+# Skip xformers - vLLM has built-in FlashAttention kernels
+# xformers requires TORCH_STABLE_ONLY which needs PyTorch headers not in 2.9.0
+# FROM build-base AS build-xformers
+# RUN git clone https://github.com/facebookresearch/xformers.git
+# RUN cd xformers && \
+#     git submodule sync && \
+#     git submodule update --init --recursive -j 8 && \
+#     MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-flashinfer
 ARG FLASHINFER_ENABLE_AOT=1
@@ -173,7 +172,6 @@ COPY --from=build-flash-attention /wheels/* wheels/
 COPY --from=build-flashinfer /wheels/* wheels/
 COPY --from=build-triton /wheels/* wheels/
 COPY --from=build-vllm /wheels/* wheels/
-COPY --from=build-xformers /wheels/* wheels/
 COPY --from=build-lmcache /wheels/* wheels/
 COPY --from=build-infinistore /wheels/* wheels/
 
Reference in New Issue
Block a user