From a399fbc8c6ea982e1e7cc4d7183b681a165ed84b Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 3 Apr 2026 02:49:43 +0000 Subject: [PATCH] Add MAX_JOBS=2 for LMCache, restore vLLM build from source - LMCache: reduced parallelism to avoid memory pressure - vLLM: restored build from source (was using PyPI wheel) - Will test with docker --memory=24g limit --- vllm/Dockerfile | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/vllm/Dockerfile b/vllm/Dockerfile index 4592cdc..0f825af 100644 --- a/vllm/Dockerfile +++ b/vllm/Dockerfile @@ -100,7 +100,7 @@ RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \ # PyTorch version is dated in LMCache sed -i '/torch/d' pyproject.toml && \ uv pip install setuptools_scm && \ - python -m build --wheel --no-isolation && \ + MAX_JOBS=2 python -m build --wheel --no-isolation && \ cp dist/*.whl /wheels/ @@ -121,33 +121,31 @@ RUN apt-get update && apt-get install -y build-essential cmake gcc && \ cp wheels/*.whl /wheels/ # ============================================================================== -# NOTE: Using PyPI vLLM wheel instead of building from source -# Reason: QEMU cmake try_compile fails during compiler ABI detection -# PyPI wheel v0.18.1 includes pre-compiled FA2, FA3, MoE kernels for aarch64 +# NOTE: vLLM is now built from source (restored below); the PyPI wheel fallback is commented out # To restore native build on GH200, uncomment the block below and comment out # the PyPI download section. 
# ============================================================================== -# FROM build-base AS build-vllm -# ARG VLLM_REF=v0.11.1rc2 -# # Install ccache for faster compilation -# RUN apt-get update && apt-get install -y ccache -# RUN git clone https://github.com/vllm-project/vllm.git -# RUN cd vllm && \ -# git checkout ${VLLM_REF} && \ -# git submodule sync && \ -# git submodule update --init --recursive -j 8 && \ -# sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \ -# export MAX_JOBS=4 && \ -# export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \ -# python use_existing_torch.py && \ -# uv pip install -r requirements/build.txt && \ -# CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels +FROM build-base AS build-vllm +ARG VLLM_REF=v0.18.1 +# Install ccache for faster compilation +RUN apt-get update && apt-get install -y ccache +RUN git clone https://github.com/vllm-project/vllm.git +RUN cd vllm && \ + git checkout ${VLLM_REF} && \ + git submodule sync && \ + git submodule update --init --recursive -j 8 && \ + sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \ + export MAX_JOBS=4 && \ + export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \ + python use_existing_torch.py && \ + uv pip install -r requirements/build.txt && \ + CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels # Use PyPI vLLM wheel (QEMU cmake fails during try_compile) -FROM build-base AS build-vllm -ARG VLLM_VERSION=0.18.1 -RUN mkdir -p /wheels && \ - pip download vllm==${VLLM_VERSION} --platform manylinux_2_31_aarch64 --only-binary=:all: --no-deps -d /wheels +# FROM build-base AS build-vllm +# ARG VLLM_VERSION=0.18.1 +# RUN mkdir -p /wheels && \ +# pip download vllm==${VLLM_VERSION} --platform manylinux_2_31_aarch64 --only-binary=:all: --no-deps -d /wheels # Build infinistore after vllm to avoid cache invalidation FROM build-base AS build-infinistore