From c92c4ec68a99c1bb51b6b9a7543df8e6f3a38abf Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Fri, 3 Apr 2026 08:44:36 +0000
Subject: [PATCH] Switch to NVIDIA NGC PyTorch 26.03 base image (PyTorch 2.11.0a0, CUDA 13.2.0, ARM SBSA support)

Replace the nvcr.io/nvidia/cuda devel base image and the uv-managed
virtualenv with nvcr.io/nvidia/pytorch:26.03 in both lmcache/Dockerfile
and vllm/Dockerfile, and install Python packages with plain `pip`
instead of `uv pip`. The separate numpy and PyTorch nightly installs are
dropped.

Notes on the uv -> pip conversion:
- pip has no `build` subcommand, so the former `uv build` steps now run
  `python -m build --wheel --no-isolation -o /wheels`, matching the
  LMCache and InfiniStore stages.
- `pip uninstall` prompts for confirmation (unlike `uv pip uninstall`),
  so it is called with `-y` to work in a non-interactive image build.
---
 lmcache/Dockerfile | 42 +++++++++-----------
 vllm/Dockerfile    | 97 ++++++++++++++++++++--------------------------
 2 files changed, 60 insertions(+), 79 deletions(-)

diff --git a/lmcache/Dockerfile b/lmcache/Dockerfile
index 24ba7d9..1701b23 100644
--- a/lmcache/Dockerfile
+++ b/lmcache/Dockerfile
@@ -1,10 +1,12 @@
-ARG CUDA_VERSION=12.9.0
-ARG IMAGE_DISTRO=ubuntu24.04
-ARG PYTHON_VERSION=3.12
-
 # ---------- Builder Base ----------
-FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
+# Using NVIDIA NGC PyTorch container (26.03) with:
+# - PyTorch 2.11.0a0 (bleeding edge)
+# - CUDA 13.2.0
+# - cuDNN 9.20, NCCL 2.29.7, TensorRT 10.16, TransformerEngine 2.13
+# - Multi-arch: x86 + ARM SBSA (GH200 support)
+FROM nvcr.io/nvidia/pytorch:26.03 AS base
 
+# Set arch lists for all targets
 ARG TORCH_CUDA_ARCH_LIST="9.0a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 ENV DEBIAN_FRONTEND=noninteractive
@@ -26,37 +28,29 @@ ENV CXX=/usr/bin/g++
 # Install uv
 RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh
 
-# Set up workspace and virtualenv
+# Set up workspace
 WORKDIR /workspace
-ARG PYTHON_VERSION
-RUN uv venv -p ${PYTHON_VERSION} --seed --python-preference only-managed
 
-# Activate uv venv
-ENV VIRTUAL_ENV=/workspace/.venv
-ENV PATH=${VIRTUAL_ENV}/bin:${PATH}
+# Environment setup (PyTorch container already has CUDA paths set)
 ENV CUDA_HOME=/usr/local/cuda
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
 
-# Install Python deps in venv
-RUN uv pip install numpy==2.0.0
-# Install PyTorch nightly with CUDA 13.0 (bleeding edge)
-RUN uv pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130
-
 FROM base AS build-base
 
 # Install build tools and dependencies
-RUN uv pip install -U build cmake ninja pybind11 setuptools==79.0.1 wheel
+RUN pip install -U build cmake ninja pybind11 "setuptools>=77.0.3,<81.0.0" wheel
 
 # Clone the repo, apply the patch, and build
-RUN git clone https://github.com/LMCache/LMCache.git -b v0.3.3 && \
+RUN git clone https://github.com/LMCache/LMCache.git && \
     cd LMCache && \
-    uv pip install setuptools_scm && \
-    python -m build --wheel --no-isolation && \
-    cp dist/*.whl /workspace/
-CMD ["/bin/bash"]">>> COMMIT: $(git rev-parse HEAD)" && \
+    git checkout dev && \
+    echo "\n\n========================================" && \
+    echo ">>> BUILDING LMCACHE FROM:" && \
+    echo ">>> BRANCH: $(git rev-parse --abbrev-ref HEAD)" && \
+    echo ">>> COMMIT: $(git rev-parse HEAD)" && \
     echo ">>> DATE: $(git log -1 --format=%cd --date=short)" && \
     echo "========================================\n\n" && \
-    uv pip install setuptools_scm && \
+    pip install setuptools_scm && \
     python -m build --wheel --no-isolation && \
     cp dist/*.whl /workspace/
-CMD ["/bin/bash"]
\ No newline at end of file
+CMD ["/bin/bash"]
diff --git a/vllm/Dockerfile b/vllm/Dockerfile
index 95e57b8..73f3a83 100644
--- a/vllm/Dockerfile
+++ b/vllm/Dockerfile
@@ -1,68 +1,56 @@
-ARG CUDA_VERSION=12.8.1
-ARG IMAGE_DISTRO=ubuntu24.04
-ARG PYTHON_VERSION=3.12
-
 # ---------- Builder Base ----------
-FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
+# Using NVIDIA NGC PyTorch container (26.03) with:
+# - PyTorch 2.11.0a0 (bleeding edge)
+# - CUDA 13.2.0
+# - cuDNN 9.20, NCCL 2.29.7, TensorRT 10.16, TransformerEngine 2.13
+# - Multi-arch: x86 + ARM SBSA (GH200 support)
+FROM nvcr.io/nvidia/pytorch:26.03 AS base
 
 # Set arch lists for all targets
 # 'a' suffix is not forward compatible but enables all optimizations
 ARG TORCH_CUDA_ARCH_LIST="9.0a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
-ENV UV_TORCH_BACKEND=cu128
 ARG VLLM_FA_CMAKE_GPU_ARCHES="90a-real"
 ENV VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES}
 
-# Update apt packages and install dependencies
+# Install additional build dependencies
 ENV DEBIAN_FRONTEND=noninteractive
-RUN apt update
-RUN apt upgrade -y
-RUN apt install -y --no-install-recommends \
+RUN apt update && apt install -y --no-install-recommends \
     curl \
     git \
     libibverbs-dev \
     zlib1g-dev \
-    libnuma-dev
-
-# Clean apt cache
-RUN apt clean
-RUN rm -rf /var/lib/apt/lists/*
-RUN rm -rf /var/cache/apt/archives
+    libnuma-dev \
+    wget \
+    && apt clean \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives
 
 # Set compiler paths
 ENV CC=/usr/bin/gcc
 ENV CXX=/usr/bin/g++
-ENV QEMU_CPU=max
 
-# Install uv
+# Install uv for faster package management
 RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh
 
 # Setup build workspace
 WORKDIR /workspace
 
-# Prep build venv
-ARG PYTHON_VERSION
-RUN uv venv -p ${PYTHON_VERSION} --seed --python-preference only-managed
-ENV VIRTUAL_ENV=/workspace/.venv
-ENV PATH=${VIRTUAL_ENV}/bin:${PATH}
+# Environment setup (PyTorch container already has CUDA paths set)
 ENV CUDA_HOME=/usr/local/cuda
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
 ENV CPLUS_INCLUDE_PATH=${CUDA_HOME}/include/cccl
 ENV C_INCLUDE_PATH=${CUDA_HOME}/include/cccl
 ENV PATH=${CUDA_HOME}/cuda/bin:${PATH}
 
-RUN apt-get update && apt install -y wget
-
-RUN uv pip install numpy==2.0.0
-# Install PyTorch nightly with CUDA 13.0 (bleeding edge)
-RUN uv pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130
+# Use the Python environment from the container
+# The NGC container already has a working Python/PyTorch setup
 
 FROM base AS build-base
 RUN mkdir /wheels
 
 # Install build deps that aren't in project requirements files
 # Pin setuptools to <81 for LMCache compatibility (needs >=77.0.3,<81.0.0)
-RUN uv pip install -U build cmake ninja pybind11 "setuptools>=77.0.3,<81.0.0" wheel
+RUN pip install -U build cmake ninja pybind11 "setuptools>=77.0.3,<81.0.0" wheel
 
 # Use PyPI triton wheel instead of building (QEMU segfaults during triton build)
 FROM build-base AS build-triton
@@ -76,19 +64,19 @@ RUN mkdir -p /wheels && \
 # RUN cd xformers && \
 #     git submodule sync && \
 #     git submodule update --init --recursive -j 8 && \
-#     MAX_JOBS=8 uv build --wheel --no-build-isolation -o /wheels
+#     MAX_JOBS=8 python -m build --wheel --no-isolation -o /wheels
 
 FROM build-base AS build-flashinfer
 ARG FLASHINFER_ENABLE_AOT=1
 ARG FLASHINFER_REF=v0.6.6
-ARG FLASHINFER_BUILD_SUFFIX=cu130
+ARG FLASHINFER_BUILD_SUFFIX=cu132
 ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
 RUN git clone https://github.com/flashinfer-ai/flashinfer.git
 RUN cd flashinfer && \
     git checkout ${FLASHINFER_REF} && \
     git submodule sync && \
     git submodule update --init --recursive -j 8 && \
-    uv build --wheel --no-build-isolation -o /wheels
+    python -m build --wheel --no-isolation -o /wheels
 
 FROM build-base AS build-lmcache
 # Bleeding edge: build from dev branch (v0.4.2+)
@@ -102,7 +90,7 @@ RUN git clone https://github.com/LMCache/LMCache.git && \
     echo ">>> DATE: $(git log -1 --format=%cd --date=short)" && \
     echo "========================================\n\n" && \
     sed -i '/torch/d' pyproject.toml && \
-    uv pip install setuptools_scm && \
+    pip install setuptools_scm && \
     MAX_JOBS=8 python -m build --wheel --no-isolation && \
     cp dist/*.whl /wheels/
 
@@ -124,9 +112,7 @@ RUN apt-get update && apt-get install -y build-essential cmake gcc && \
     cp wheels/*.whl /wheels/
 
 # ==============================================================================
-# NOTE: Temporarily using PyPI vLLM wheel for QEMU testing
-# To restore native build on GH200, uncomment the block below and comment out
-# the PyPI download section.
+# Build vLLM from source
 # ==============================================================================
 FROM build-base AS build-vllm
 # Bleeding edge: build from main branch
@@ -136,20 +122,21 @@ RUN apt-get update && apt-get install -y ccache
 RUN git clone https://github.com/vllm-project/vllm.git
 RUN cd vllm && \
     git checkout ${VLLM_REF} && \
+    echo "\n\n========================================" && \
+    echo ">>> BUILDING VLLM FROM:" && \
+    echo ">>> BRANCH: $(git rev-parse --abbrev-ref HEAD)" && \
+    echo ">>> COMMIT: $(git rev-parse HEAD)" && \
+    echo ">>> DATE: $(git log -1 --format=%cd --date=short)" && \
+    echo ">>> TAG: $(git describe --tags --always 2>/dev/null || echo 'no tag')" && \
+    echo "========================================\n\n" && \
     git submodule sync && \
     git submodule update --init --recursive -j 8 && \
     sed -i 's/GIT_TAG [a-f0-9]\{40\}/GIT_TAG main/' cmake/external_projects/vllm_flash_attn.cmake && \
     export MAX_JOBS=8 && \
     export CMAKE_BUILD_PARALLEL_LEVEL=$MAX_JOBS && \
     python use_existing_torch.py && \
-    uv pip install -r requirements/build.txt && \
-    CCACHE_NOHASHDIR="true" uv build --wheel --no-build-isolation -o /wheels
-
-# Use PyPI vLLM wheel (QEMU cmake fails during try_compile)
-# FROM build-base AS build-vllm
-# ARG VLLM_VERSION=0.18.1
-# RUN mkdir -p /wheels && \
-#     pip download vllm==${VLLM_VERSION} --platform manylinux_2_31_aarch64 --only-binary=:all: --no-deps -d /wheels
+    pip install -r requirements/build.txt && \
+    CCACHE_NOHASHDIR="true" python -m build --wheel --no-isolation -o /wheels
 
 # Build infinistore after vllm to avoid cache invalidation
 FROM build-base AS build-infinistore
@@ -166,9 +153,9 @@ RUN git clone -b v1.12.0 https://github.com/google/flatbuffers.git && \
 # Build InfiniStore from source as a Python package
 RUN git clone https://github.com/bytedance/InfiniStore && \
     cd InfiniStore && \
-    uv pip install meson && \
-    uv pip install --no-deps --no-build-isolation -e . && \
-    uv pip uninstall infinistore && \
+    pip install meson && \
+    pip install --no-deps --no-build-isolation -e . && \
+    pip uninstall -y infinistore && \
     python -m build --wheel --no-isolation && \
     cp dist/*.whl /wheels/
 
@@ -181,25 +168,25 @@ COPY --from=build-lmcache /wheels/* wheels/
 COPY --from=build-infinistore /wheels/* wheels/
 
 # Install wheels (infinistore is now built as a wheel)
-RUN uv pip install wheels/*
+RUN pip install wheels/*
 RUN rm -r wheels
 
 # Install pynvml
-RUN uv pip install pynvml pandas
+RUN pip install pynvml pandas
 
 # Add additional packages for vLLM OpenAI
 # Bleeding edge: latest transformers
-RUN uv pip install accelerate hf_transfer modelscope bitsandbytes timm boto3 runai-model-streamer runai-model-streamer[s3] tensorizer transformers --upgrade
+RUN pip install accelerate hf_transfer modelscope bitsandbytes timm boto3 runai-model-streamer runai-model-streamer[s3] tensorizer transformers --upgrade
 
-# Clean uv cache
-RUN uv clean
+# Clean pip cache
+RUN pip cache purge || true
 
 # Install build tools and dependencies
-RUN uv pip install -U build cmake ninja pybind11 setuptools==79.0.1 wheel
+RUN pip install -U build cmake ninja pybind11 setuptools==79.0.1 wheel
 
 # Enable hf-transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
-RUN uv pip install datasets aiohttp
+RUN pip install datasets aiohttp
 
 # Install nsys for profiling
 ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_5/
@@ -209,7 +196,7 @@ RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG
 RUN apt install -y --no-install-recommends tmux cmake
 
 # Deprecated cleanup
-RUN uv pip uninstall pynvml && uv pip install nvidia-ml-py
+RUN pip uninstall -y pynvml && pip install nvidia-ml-py
 
 # API server entrypoint
 # ENTRYPOINT ["vllm", "serve"]