diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 30dd4449f..0eb64e566 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1453,8 +1453,8 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh ##### multi gpus test ##### ##### A100 test ##### diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index c5e94ee1f..b37f76faf 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -12,6 +12,17 @@ ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG AITER_BRANCH="6af8b687" ARG AITER_REPO="https://github.com/ROCm/aiter.git" +#TODO: When patch has been upstreamed, switch to the main repo/branch +# ARG RIXL_BRANCH="" +# ARG RIXL_REPO="https://github.com/ROCm/RIXL.git" +ARG RIXL_BRANCH="50d63d94" +ARG RIXL_REPO="https://github.com/vcave/RIXL.git" +# Needed by RIXL +ARG ETCD_BRANCH="7c6e714f" +ARG ETCD_REPO="https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git" +ARG UCX_BRANCH="da3fac2a" +ARG UCX_REPO="https://github.com/ROCm/ucx.git" + FROM ${BASE_IMAGE} AS base ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin @@ -50,6 +61,10 @@ RUN apt-get update -y \ RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/* + +### +### Triton Build +### FROM base AS build_triton ARG TRITON_BRANCH ARG TRITON_REPO @@ -62,11 +77,19 @@ RUN cd triton \ RUN if [ -d triton/python/triton_kernels ]; then 
pip install build && cd triton/python/triton_kernels \ && python3 -m build --wheel && cp dist/*.whl /app/install; fi + +### +### AMD SMI Build +### FROM base AS build_amdsmi RUN cd /opt/rocm/share/amd_smi \ && pip wheel . --wheel-dir=dist RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install + +### +### Pytorch build +### FROM base AS build_pytorch ARG PYTORCH_BRANCH ARG PYTORCH_VISION_BRANCH @@ -95,6 +118,96 @@ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \ && cp /app/audio/dist/*.whl /app/install + +### +### RIXL Build +### +FROM build_pytorch AS build_rixl +ARG RIXL_BRANCH +ARG RIXL_REPO +ARG ETCD_BRANCH +ARG ETCD_REPO +ARG UCX_BRANCH +ARG UCX_REPO + +ENV ROCM_PATH=/opt/rocm +ENV UCX_HOME=/usr/local/ucx +ENV RIXL_HOME=/usr/local/rixl +ENV RIXL_BENCH_HOME=/usr/local/rixl_bench + +# RIXL build system dependences and RDMA support +RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \ + libgrpc-dev \ + libgrpc++-dev \ + libprotobuf-dev \ + protobuf-compiler-grpc \ + libcpprest-dev \ + libaio-dev \ + librdmacm1 \ + librdmacm-dev \ + libibverbs1 \ + libibverbs-dev \ + ibverbs-utils \ + rdmacm-utils \ + ibverbs-providers + +RUN pip install meson auditwheel patchelf tomlkit + +WORKDIR /workspace + +RUN git clone ${ETCD_REPO} && \ + cd etcd-cpp-apiv3 && \ + git checkout ${ETCD_BRANCH} && \ + mkdir build && cd build && \ + cmake .. 
-DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \ + make -j$(nproc) && \ + make install + +RUN cd /usr/local/src && \ + git clone ${UCX_REPO} && \ + cd ucx && \ + git checkout ${UCX_BRANCH} && \ + ./autogen.sh && \ + mkdir build && cd build && \ + ../configure \ + --prefix=/usr/local/ucx \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-devel-headers \ + --with-rocm=/opt/rocm \ + --with-verbs \ + --with-dm \ + --enable-mt && \ + make -j && \ + make -j install + +ENV PATH=/usr/local/ucx/bin:$PATH +ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH} + +RUN git clone ${RIXL_REPO} /opt/rixl && \ + cd /opt/rixl && \ + git checkout ${RIXL_BRANCH} && \ + meson setup build --prefix=${RIXL_HOME} \ + -Ducx_path=${UCX_HOME} \ + -Drocm_path=${ROCM_PATH} && \ + cd build && \ + ninja && \ + ninja install + +# Generate RIXL wheel +RUN cd /opt/rixl && mkdir -p /app/install && \ + ./contrib/build-wheel.sh \ + --output-dir /app/install \ + --rocm-dir ${ROCM_PATH} \ + --ucx-plugins-dir ${UCX_HOME}/lib/ucx \ + --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins + + +### +### FlashAttention Build +### FROM base AS build_fa ARG FA_BRANCH ARG FA_REPO @@ -107,6 +220,10 @@ RUN cd flash-attention \ && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install + +### +### AITER Build +### FROM base AS build_aiter ARG AITER_BRANCH ARG AITER_REPO @@ -120,6 +237,10 @@ RUN cd aiter \ RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install + +### +### Final Build +### FROM base AS debs RUN mkdir /app/debs RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ @@ -132,6 +253,8 @@ RUN 
--mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ cp /install/*.whl /app/debs RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs FROM base AS final RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \ @@ -150,6 +273,12 @@ ARG FA_BRANCH ARG FA_REPO ARG AITER_BRANCH ARG AITER_REPO +ARG RIXL_BRANCH +ARG RIXL_REPO +ARG ETCD_BRANCH +ARG ETCD_REPO +ARG UCX_BRANCH +ARG UCX_REPO RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ @@ -162,4 +291,10 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ + && echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \ + && echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \ + && echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \ + && echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \ + && echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \ + && echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 601205e1e..8aa23b24a 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -6,11 +6,17 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag ### Installation -Install the NIXL library: `uv pip install nixl`, as a quick start. +Install the NIXL library: `uv pip install nixl`, as a quick start on NVIDIA platforms.
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions - The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files +For the ROCm platform, the [base ROCm Dockerfile](../../docker/Dockerfile.rocm_base) already includes RIXL and UCX. + +- Refer to [RIXL official repository](https://github.com/rocm/rixl) for more information +- The supporting libraries for RIXL are listed in [requirements/kv_connectors_rocm.txt](../../requirements/kv_connectors_rocm.txt) +- In the future, RIXL may be removed from the Docker image, and users will be able to install it from pre-compiled binary packages + For non-cuda platform, please install nixl with ucx build from source, instructed as below. ```bash diff --git a/requirements/kv_connectors_rocm.txt b/requirements/kv_connectors_rocm.txt new file mode 100644 index 000000000..604b96ec5 --- /dev/null +++ b/requirements/kv_connectors_rocm.txt @@ -0,0 +1,2 @@ +tblib +lm_eval[api] \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 757ca41e9..baeafc08d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -84,8 +84,12 @@ logger = init_logger(__name__) # Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used try: - from nixl._api import nixl_agent as NixlWrapper - from nixl._bindings import nixlXferTelemetry + if not current_platform.is_rocm(): + from nixl._api import nixl_agent as NixlWrapper + from nixl._bindings import nixlXferTelemetry + else: + from rixl._api import nixl_agent as NixlWrapper + from rixl._bindings import nixlXferTelemetry logger.info("NIXL is available") except ImportError: @@ -95,7 +99,10 @@ except ImportError: try: - from nixl._api import
nixl_agent_config + if not current_platform.is_rocm(): + from nixl._api import nixl_agent_config + else: + from rixl._api import nixl_agent_config except ImportError: nixl_agent_config = None logger.warning("NIXL agent config is not available")