diff --git a/README.md b/README.md
index 1afb0d2..d5f1e93 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,9 @@ This repository provides a Dockerfile to build a container with vLLM and all its
 
 If you prefer not to build the image yourself, you can pull the ready-to-use image directly from Docker Hub:
 
-`docker pull rajesh550/gh200-vllm:0.10.2`
+```bash
+docker run --rm -it --gpus all -v "$PWD":"$PWD" -w "$PWD" rajesh550/gh200-vllm:0.11.0 bash
+```
 
 👉 [Docker Hub](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm/general)
 
diff --git a/vllm/Dockerfile b/vllm/Dockerfile
index 78db96b..0f1428e 100644
--- a/vllm/Dockerfile
+++ b/vllm/Dockerfile
@@ -9,7 +9,7 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
 # 'a' suffix is not forward compatible but enables all optimizations
 ARG TORCH_CUDA_ARCH_LIST="9.0a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
-ENV UV_TORCH_BACKEND=cu128
+ENV UV_TORCH_BACKEND=cu129
 
 ARG VLLM_FA_CMAKE_GPU_ARCHES="90a-real"
 ENV VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES}
@@ -73,7 +73,7 @@ RUN cd triton && \
 RUN export MAX_JOBS=6
 
 FROM build-base AS build-xformers
-ARG XFORMERS_REF=v0.0.32
+ARG XFORMERS_REF=v0.0.32.post2
 ARG XFORMERS_BUILD_VERSION=0.0.30+cu129
 ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
 RUN git clone https://github.com/facebookresearch/xformers.git
@@ -86,7 +86,7 @@ RUN cd xformers && \
 # Currently not supported on CUDA 12.8
 FROM build-base AS build-flashinfer
 ARG FLASHINFER_ENABLE_AOT=1
-ARG FLASHINFER_REF=v0.3.1
+ARG FLASHINFER_REF=v0.4.1
 ARG FLASHINFER_BUILD_SUFFIX=cu129
 ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
 RUN git clone https://github.com/flashinfer-ai/flashinfer.git
@@ -97,7 +97,7 @@ RUN cd flashinfer && \
     uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-lmcache
-ARG LMCACHE_REF=v0.3.3
+ARG LMCACHE_REF=v0.3.7
 RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \
     cd LMCache && \
     uv pip install setuptools_scm && \
@@ -105,7 +105,7 @@ RUN git clone https://github.com/LMCache/LMCache.git -b ${LMCACHE_REF} && \
     cp dist/*.whl /wheels/
 
 FROM build-base AS build-vllm
-ARG VLLM_REF=v0.10.2
+ARG VLLM_REF=v0.11.0
 RUN git clone https://github.com/vllm-project/vllm.git
 RUN cd vllm && \
     git checkout ${VLLM_REF} && \
@@ -137,7 +137,7 @@ RUN git clone https://github.com/bytedance/InfiniStore && \
     cp dist/*.whl /wheels/
 
 FROM base AS vllm-openai
-# COPY --from=build-flashinfer /wheels/* wheels/
+COPY --from=build-flashinfer /wheels/* wheels/
 COPY --from=build-triton /wheels/* wheels/
 COPY --from=build-vllm /wheels/* wheels/
 COPY --from=build-xformers /wheels/* wheels/
diff --git a/vllm/README.md b/vllm/README.md
index a28da72..150f1cc 100644
--- a/vllm/README.md
+++ b/vllm/README.md
@@ -3,9 +3,9 @@ Hosted [here](https://hub.docker.com/repository/docker/rajesh550/gh200-vllm)
 
 ```bash
-sudo docker login
+docker login
 
 # Alternative
 # docker buildx build --platform linux/arm64 --memory=600g -t rajesh550/gh200-vllm:0.9.0.1 .
-sudo docker build --memory=300g --platform linux/arm64 -t rajesh550/gh200-vllm:0.10.2 .
-sudo docker push rajesh550/gh200-vllm:0.10.2
+docker build --memory=300g --platform linux/arm64 -t rajesh550/gh200-vllm:0.11.0 .
+docker push rajesh550/gh200-vllm:0.11.0
 ```
\ No newline at end of file