Fix xformers TORCH_STABLE_ONLY issue + ramp up MAX_JOBS for native GH200

- Switch to official facebookresearch/xformers (the johnnynunez fork sets TORCH_STABLE_ONLY, which requires PyTorch headers not present in 2.9.0)
- Increase MAX_JOBS from 2-4 to 8 for all builds (native GH200 has 97 GB of HBM3)
- Increase NVCC_THREADS from 1 to 4 for flash-attention
This commit is contained in:
2026-04-03 05:46:11 +00:00
parent b223c051de
commit 45b6109ee1

View File

@@ -73,13 +73,11 @@ FROM build-base AS build-xformers
#ARG XFORMERS_REF=v0.0.32.post2 #ARG XFORMERS_REF=v0.0.32.post2
#ARG XFORMERS_BUILD_VERSION=0.0.30+cu130 #ARG XFORMERS_BUILD_VERSION=0.0.30+cu130
#ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}} #ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
RUN git clone https://github.com/johnnynunez/xformers.git RUN git clone https://github.com/facebookresearch/xformers.git
# https://github.com/facebookresearch/xformers.git
RUN cd xformers && \ RUN cd xformers && \
# git checkout ${XFORMERS_REF} && \
git submodule sync && \ git submodule sync && \
git submodule update --init --recursive -j 8 && \ git submodule update --init --recursive -j 8 && \
MAX_JOBS=3 uv build --wheel --no-build-isolation -o /wheels MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
FROM build-base AS build-flashinfer FROM build-base AS build-flashinfer
ARG FLASHINFER_ENABLE_AOT=1 ARG FLASHINFER_ENABLE_AOT=1
@@ -101,7 +99,7 @@ RUN git clone https://github.com/LMCache/LMCache.git && \
# PyTorch version is dated in LMCache # PyTorch version is dated in LMCache
sed -i '/torch/d' pyproject.toml && \ sed -i '/torch/d' pyproject.toml && \
uv pip install setuptools_scm && \ uv pip install setuptools_scm && \
MAX_JOBS=2 python -m build --wheel --no-isolation && \ MAX_JOBS=8 python -m build --wheel --no-isolation && \
cp dist/*.whl /wheels/ cp dist/*.whl /wheels/
@@ -110,8 +108,8 @@ RUN apt-get update && apt-get install -y build-essential cmake gcc && \
git clone https://github.com/Dao-AILab/flash-attention flash-attention && \ git clone https://github.com/Dao-AILab/flash-attention flash-attention && \
cd flash-attention/hopper && \ cd flash-attention/hopper && \
mkdir wheels && \ mkdir wheels && \
export MAX_JOBS=4 && \ export MAX_JOBS=8 && \
export NVCC_THREADS=1 && \ export NVCC_THREADS=4 && \
export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \ export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
MAX_JOBS=$MAX_JOBS \ MAX_JOBS=$MAX_JOBS \
CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \ CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS \
@@ -137,7 +135,7 @@ RUN cd vllm && \
git submodule sync && \ git submodule sync && \
git submodule update --init --recursive -j 8 && \ git submodule update --init --recursive -j 8 && \
sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \ sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \
export MAX_JOBS=4 && \ export MAX_JOBS=8 && \
export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \ export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
python use_existing_torch.py && \ python use_existing_torch.py && \
uv pip install -r requirements/build.txt && \ uv pip install -r requirements/build.txt && \