From e5445512aa0b7b04fb77c8c0d9547e00eba9c7cc Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Thu, 2 Apr 2026 23:44:11 +0000
Subject: [PATCH] Reduce MAX_JOBS by half to reduce QEMU memory pressure

- xformers: 6 -> 3
- flash-attention: 8 -> 4
- vllm: 8 -> 4

Testing if lower parallelism helps avoid segfaults under emulation
---
 vllm/Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/Dockerfile b/vllm/Dockerfile
index 20effe3..b49a74c 100644
--- a/vllm/Dockerfile
+++ b/vllm/Dockerfile
@@ -85,7 +85,7 @@ RUN cd xformers && \
     # git checkout ${XFORMERS_REF} && \
     git submodule sync && \
     git submodule update --init --recursive -j 8 && \
-    MAX_JOBS=6 uv build --wheel --no-build-isolation -o /wheels
+    MAX_JOBS=3 uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-flashinfer
 ARG FLASHINFER_ENABLE_AOT=1
@@ -115,7 +115,7 @@ RUN apt-get update && apt-get install -y build-essential cmake gcc && \
     git clone https://github.com/Dao-AILab/flash-attention flash-attention && \
     cd flash-attention/hopper && \
     mkdir wheels && \
-    export MAX_JOBS=8 && \
+    export MAX_JOBS=4 && \
     export NVCC_THREADS=1 && \
     export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
     MAX_JOBS=$MAX_JOBS \
@@ -136,7 +136,7 @@ RUN cd vllm && \
     git submodule sync && \
     git submodule update --init --recursive -j 8 && \
     sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \
-    export MAX_JOBS=8 && \
+    export MAX_JOBS=4 && \
     export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
     python use_existing_torch.py && \
     uv pip install -r requirements/build.txt && \