diff --git a/lmcache/Dockerfile b/lmcache/Dockerfile index 425225c..5f2860b 100644 --- a/lmcache/Dockerfile +++ b/lmcache/Dockerfile @@ -1,4 +1,4 @@ -ARG CUDA_VERSION=12.8.1 +ARG CUDA_VERSION=12.9.0 ARG IMAGE_DISTRO=ubuntu24.04 ARG PYTHON_VERSION=3.12 @@ -39,14 +39,13 @@ ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} # Install Python deps in venv RUN uv pip install numpy==2.0.0 -RUN uv pip install torch==2.7.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128 +RUN uv pip install torch==2.8.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129 FROM base AS build-base # Install build tools and dependencies RUN uv pip install -U build cmake ninja pybind11 setuptools==79.0.1 wheel -RUN echo "Hello" # Clone the repo, apply the patch, and build RUN git clone https://github.com/LMCache/LMCache.git -b v0.3.3 && \ cd LMCache && \ diff --git a/vllm/Dockerfile b/vllm/Dockerfile index c3cd5e6..8b60e5e 100644 --- a/vllm/Dockerfile +++ b/vllm/Dockerfile @@ -1,4 +1,4 @@ -ARG CUDA_VERSION=12.8.1 +ARG CUDA_VERSION=12.9.0 ARG IMAGE_DISTRO=ubuntu24.04 ARG PYTHON_VERSION=3.12 @@ -51,10 +51,7 @@ RUN apt-get update && apt install -y wget RUN uv pip install numpy==2.0.0 # Install pytorch nightly -RUN uv pip install torch==2.7.1+cu128 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128 --torch-backend=cu128 - -# Install from the wheel -# RUN uv pip install ./torch-2.7.0.dev20250310+cu128-cp312-cp312-linux_aarch64.whl +RUN uv pip install torch==2.8.0+cu129 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129 --torch-backend=cu129 FROM base AS build-base RUN mkdir /wheels @@ -64,20 +61,20 @@ RUN mkdir /wheels RUN uv pip install -U build cmake ninja pybind11 setuptools wheel FROM build-base AS build-triton -ARG TRITON_REF=release/3.3.x -ARG TRITON_BUILD_SUFFIX=+cu128 +ARG TRITON_REF=release/3.4.x +ARG TRITON_BUILD_SUFFIX=+cu129 ENV 
TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-} RUN git clone https://github.com/triton-lang/triton.git RUN cd triton && \ git checkout ${TRITON_REF} && \ git submodule sync && \ git submodule update --init --recursive -j 8 && \ - uv build python --wheel --no-build-isolation -o /wheels + uv build --wheel --no-build-isolation -o /wheels RUN export MAX_JOBS=6 FROM build-base AS build-xformers -ARG XFORMERS_REF=v0.0.30 -ARG XFORMERS_BUILD_VERSION=0.0.30+cu128 +ARG XFORMERS_REF=v0.0.32 +ARG XFORMERS_BUILD_VERSION=0.0.32+cu129 ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}} RUN git clone https://github.com/facebookresearch/xformers.git RUN cd xformers && \ @@ -87,26 +84,28 @@ RUN cd xformers && \ uv build --wheel --no-build-isolation -o /wheels # Currently not supported on CUDA 12.8 -# FROM build-base AS build-flashinfer -# ARG FLASHINFER_ENABLE_AOT=1 -# ARG FLASHINFER_REF=v0.2.2.post1 -# ARG FLASHINFER_BUILD_SUFFIX=cu126 -# ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-} -# RUN git clone https://github.com/flashinfer-ai/flashinfer.git -# RUN cd flashinfer && \ -# git checkout ${FLASHINFER_REF} && \ -# git submodule sync && \ -# git submodule update --init --recursive -j 8 && \ -# uv build --wheel --no-build-isolation -o /wheels -RUN git clone https://github.com/flashinfer-ai/flashinfer.git --recursive && \ - cd flashinfer && git checkout v0.2.8rc1 && \ - uv pip install ninja && \ - uv pip install --no-build-isolation --verbose . 
+FROM build-base AS build-flashinfer +ARG FLASHINFER_ENABLE_AOT=1 +ARG FLASHINFER_REF=v0.3.1 +ARG FLASHINFER_BUILD_SUFFIX=cu129 +ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-} +RUN git clone https://github.com/flashinfer-ai/flashinfer.git +RUN cd flashinfer && \ + git checkout ${FLASHINFER_REF} && \ + git submodule sync && \ + git submodule update --init --recursive -j 8 && \ + uv build --wheel --no-build-isolation -o /wheels +FROM build-base AS build-lmcache +ARG LMCACHE_REF=v0.3.3 +RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \ + cd LMCache && \ + uv pip install setuptools_scm && \ + python -m build --wheel --no-isolation && \ + cp dist/*.whl /wheels/ FROM build-base AS build-vllm -ARG VLLM_REF=v0.10.0 +ARG VLLM_REF=v0.10.2 RUN git clone https://github.com/vllm-project/vllm.git RUN cd vllm && \ git checkout ${VLLM_REF} && \ @@ -116,14 +115,36 @@ RUN cd vllm && \ uv pip install -r requirements/build.txt && \ MAX_JOBS=16 uv build --wheel --no-build-isolation -o /wheels +# Build infinistore after vllm to avoid cache invalidation +FROM build-base AS build-infinistore +# Install additional dependencies needed for building infinistore on aarch64 +RUN apt update && apt install -y cmake pybind11-dev python3-dev libuv1-dev libspdlog-dev libboost-dev libboost-all-dev meson + +# Build flatbuffers from source with proper CMake version +RUN git clone -b v1.12.0 https://github.com/google/flatbuffers.git && \ + cd flatbuffers && \ + cmake -B build -DFLATBUFFERS_BUILD_TESTS=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \ + cmake --build build -j && \ + cmake --install build + +# Build InfiniStore from source as a Python package +RUN git clone https://github.com/bytedance/InfiniStore && \ + cd InfiniStore && \ + uv pip install meson && \ + uv pip install --no-deps --no-build-isolation -e . 
&& \ + uv pip uninstall infinistore && \ + python -m build --wheel --no-isolation && \ + cp dist/*.whl /wheels/ FROM base AS vllm-openai -# COPY --from=build-flashinfer /wheels/* wheels/ +COPY --from=build-flashinfer /wheels/* wheels/ COPY --from=build-triton /wheels/* wheels/ COPY --from=build-vllm /wheels/* wheels/ COPY --from=build-xformers /wheels/* wheels/ +COPY --from=build-lmcache /wheels/* wheels/ +COPY --from=build-infinistore /wheels/* wheels/ -# Install and cleanup wheels +# Install wheels (infinistore is now built as a wheel) RUN uv pip install wheels/* RUN rm -r wheels @@ -142,27 +163,17 @@ RUN export PATH="$(dirname $(realpath .venv/bin/python)):$PATH" # Install build tools and dependencies RUN uv pip install -U build cmake ninja pybind11 setuptools==79.0.1 wheel -# Clone and build LMCache wheel without Infinistore that is broken on aarch64 -# Copy the wheel from host to container -COPY lmcache-0.3.3-cp312-cp312-linux_aarch64.whl /tmp/ -RUN uv pip install /tmp/lmcache-0.3.3-cp312-cp312-linux_aarch64.whl --no-deps - # Enable hf-transfer ENV HF_HUB_ENABLE_HF_TRANSFER=1 RUN uv pip install datasets aiohttp # Install nsys for profiling -ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/ -ARG NSYS_PKG=nsight-systems-cli-2025.3.1_2025.3.1.90-1_arm64.deb - +ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_5/ +ARG NSYS_PKG=nsight-systems-cli-2025.5.1_2025.5.1.121-1_arm64.deb RUN apt-get update && apt install -y wget libglib2.0-0 RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG RUN apt install -y --no-install-recommends tmux cmake -# Install required build tool -RUN uv pip install ninja - - # API server entrypoint # ENTRYPOINT ["vllm", "serve"] CMD ["/bin/bash"] diff --git a/vllm/README.md b/vllm/README.md index 8187ab9..a28da72 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -3,12 +3,9 @@ Hosted [here](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm) ```bash -docker login -docker 
buildx build --platform linux/arm64 --memory=600g -t rajesh550/gh200-vllm:0.9.0.1 . - +sudo docker login # Alternative -sudo docker build --memory=300g --platform linux/arm64 -t rajesh550/gh200-vllm:0.10.1 . - -docker build --memory=300g -t rajesh550/gh200-vllm:0.9.0.1 . -docker push rajesh550/gh200-vllm:0.9.0.1 -``` +# docker buildx build --platform linux/arm64 --memory=600g -t rajesh550/gh200-vllm:0.9.0.1 . +sudo docker build --memory=300g --platform linux/arm64 -t rajesh550/gh200-vllm:0.10.2 . +sudo docker push rajesh550/gh200-vllm:0.10.2 +``` \ No newline at end of file diff --git a/vllm/lmcache-0.3.1.dev1-cp312-cp312-linux_aarch64.whl b/vllm/lmcache-0.3.1.dev1-cp312-cp312-linux_aarch64.whl deleted file mode 100644 index 03bb8de..0000000 Binary files a/vllm/lmcache-0.3.1.dev1-cp312-cp312-linux_aarch64.whl and /dev/null differ diff --git a/vllm/lmcache-0.3.3-cp312-cp312-linux_aarch64.whl b/vllm/lmcache-0.3.3-cp312-cp312-linux_aarch64.whl deleted file mode 100644 index 6726ac1..0000000 Binary files a/vllm/lmcache-0.3.3-cp312-cp312-linux_aarch64.whl and /dev/null differ diff --git a/vllm/native_build.sh b/vllm/native_build.sh deleted file mode 100644 index ad80247..0000000 --- a/vllm/native_build.sh +++ /dev/null @@ -1,80 +0,0 @@ -curl -LsSf https://astral.sh/uv/install.sh | export UV_INSTALL_DIR=/usr/local/bin sh -mkdir vllm-build && cd vllm-build -uv vexport -p ${PYTHON_VERSION} --seed --python-preference only-managed -source .vexport/bin/activate - -export CUDA_VERSION=12.6.3 -export IMAGE_DISTRO=ubuntu24.04 -export PYTHON_VERSION=3.12 -export TORCH_CUDA_ARCH_LIST="9.0a" -export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} -export VLLM_FA_CMAKE_GPU_ARCHES="90a-real" -export VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES} - -uv pip install numpy==2.0.0 -uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126 -uv pip install build cmake ninja pybind11 setuptools wheel - -mkdir wheels - -# triton-lang -export 
TRITON_REF=release/3.2.x -export TRITON_BUILD_SUFFIX=+cu126 -export TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-} -git clone https://github.com/triton-lang/triton.git -cd triton && \ - git checkout ${TRITON_REF} && \ - git submodule sync && \ - git submodule update --init --recursive -j 8 && \ - uv build python --wheel --no-build-isolation -o ../wheels -cd .. - -# xformers -export MAX_JOBS=10 -export XFORMERS_REF=v0.0.29.post2 -export XFORMERS_BUILD_VERSION=0.0.29.post2+cu126 -export BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}} -git clone https://github.com/facebookresearch/xformers.git -cd xformers && \ - git checkout ${XFORMERS_REF} && \ - git submodule sync && \ - git submodule update --init --recursive -j 8 && \ - uv build --wheel --no-build-isolation -o ../wheels -cd .. - -# flashinfer -export FLASHINFER_ENABLE_AOT=1 -export FLASHINFER_REF=v0.2.2.post1 -export FLASHINFER_BUILD_SUFFIX=cu126 -export FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-} -uv pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1 -git clone https://github.com/flashinfer-ai/flashinfer.git -cd flashinfer && \ - git checkout ${FLASHINFER_REF} && \ - git submodule sync && \ - git submodule update --init --recursive -j 8 && \ - uv build --wheel --no-build-isolation -o ../wheels && cd .. 
- -uv pip install wheels/* -uv pip install pynvml pandas accelerate hf_transfer modelscope bitsandbytes timm boto3 runai-model-streamer runai-model-streamer[s3] tensorizer - -git clone https://github.com/Dao-AILab/flash-attention.git && cd flash-attention -FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" MAX_JOBS=10 python setup.py install -cd hopper && FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" MAX_JOBS=5 python setup.py install - - - -# vllm -export VLLM_REF=v0.8.1 -export MAX_JOBS=4 -export CUDACXX=/home1/apps/nvidia/Linux_aarch64/24.9/cuda/12.6/bin/nvcc -git clone https://github.com/vllm-project/vllm.git -cd vllm && \ - git checkout ${VLLM_REF} && \ - git submodule sync && \ - git submodule update --init --recursive -j 8 && \ - uv pip install -r requirements/build.txt && \ - uv build --wheel --no-build-isolation -o ../wheels && cd .. - - -