Use PyPI vLLM wheel instead of building (QEMU cmake try_compile fails)

- vLLM 0.18.1 aarch64 wheel includes pre-compiled FA2, FA3, and MoE kernels
- Original build-from-source code is commented out so it can be restored on GH200
- CMake compiler ABI detection fails under QEMU emulation
2026-04-03 00:05:56 +00:00
parent 436214bb72
commit f8a9d372e5
1 changed files with 27 additions and 14 deletions
--- a/vllm/Dockerfile
+++ b/vllm/Dockerfile
@@ -120,21 +120,34 @@ RUN apt-get update && apt-get install -y build-essential cmake gcc && \
    pip wheel . -v --no-deps --no-build-isolation -w ./wheels/ && \
    cp wheels/*.whl /wheels/
+# ==============================================================================
+# NOTE: Using PyPI vLLM wheel instead of building from source
+# Reason: QEMU cmake try_compile fails during compiler ABI detection
+# PyPI wheel v0.18.1 includes pre-compiled FA2, FA3, MoE kernels for aarch64
+# To restore native build on GH200, uncomment the block below and comment out
+# the PyPI download section.
+# ==============================================================================
+# FROM build-base AS build-vllm
+# ARG VLLM_REF=v0.11.1rc2
+# # Install ccache for faster compilation
+# RUN apt-get update && apt-get install -y ccache
+# RUN git clone https://github.com/vllm-project/vllm.git
+# RUN cd vllm && \
+#     git checkout ${VLLM_REF} && \
+#     git submodule sync && \
+#     git submodule update --init --recursive -j 8 && \
+#     sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \
+#     export MAX_JOBS=4 && \
+#     export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
+#     python use_existing_torch.py && \
+#     uv pip install -r requirements/build.txt && \
+#     CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels
+# Use PyPI vLLM wheel (QEMU cmake fails during try_compile)
 FROM build-base AS build-vllm
-ARG VLLM_REF=v0.11.1rc2
+ARG VLLM_VERSION=0.18.1
-# Install ccache for faster compilation
+RUN mkdir -p /wheels && \
-RUN apt-get update && apt-get install -y ccache
+    pip download vllm==${VLLM_VERSION} --platform manylinux_2_31_aarch64 --only-binary=:all: --no-deps -d /wheels
-RUN git clone https://github.com/vllm-project/vllm.git
-RUN cd vllm && \
-    git checkout ${VLLM_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
-    sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \
-    export MAX_JOBS=4 && \
-    export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
-    python use_existing_torch.py && \
-    uv pip install -r requirements/build.txt && \
-    CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels
 # Build infinistore after vllm to avoid cache invalidation
 FROM build-base AS build-infinistore