[XPU][NIXL] Add GPUDirect RDMA support for XPU (#35270)

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-03 08:42:49 +08:00
parent c8b678e53e
commit 9dd656f0ea
3 changed files with 62 additions and 5 deletions
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -115,9 +115,57 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # install development dependencies (for testing)
 RUN uv pip install -e tests/vllm_test_utils

-# install nixl from source code
-ENV NIXL_VERSION=0.7.0
-RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+# install NIXL and UCX from source code
+ARG UCX_VERSION=e5d98879705239d254ede40b4a52891850cb5349
+ARG NIXL_VERSION=0.7.0
+
+RUN apt-get update && apt-get install -y \
+    pciutils \
+    net-tools \
+    iproute2 \
+    hwloc \
+    numactl \
+    wget \
+    curl \
+    git \
+    build-essential \
+    autoconf \
+    automake \
+    libtool \
+    pkg-config \
+    rdma-core \
+    libibverbs-dev \
+    ibverbs-utils \
+    libibverbs1 \
+    librdmacm-dev \
+    librdmacm1 \
+    libibumad-dev \
+    libibumad3 \
+    libibmad-dev \
+    libibmad5 \
+    infiniband-diags \
+    perftest \
+    ibutils \
+    libmlx5-1 \
+    libmlx4-1 \
+    ibverbs-providers \
+    librdmacm1t64
+
+ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:${PKG_CONFIG_PATH}
+ENV LD_LIBRARY_PATH=/tmp/ucx_install/lib:${LD_LIBRARY_PATH}
+RUN --mount=type=cache,target=/root/.cache/uv \
+    git clone https://github.com/openucx/ucx /tmp/ucx_source && \
+    cd /tmp/ucx_source && git checkout "${UCX_VERSION}" && \
+    bash autogen.sh && \
+    ./configure --prefix=/tmp/ucx_install --with-ze=yes --enable-examples --enable-mt && \
+    make CFLAGS="-Wno-error=incompatible-pointer-types" -j8 && make install && \
+    git clone https://github.com/ai-dynamo/nixl /tmp/nixl_source && \
+    cd /tmp/nixl_source && git checkout "${NIXL_VERSION}" && \
+    cd /tmp/nixl_source && \
+    uv pip install --upgrade meson pybind11 patchelf && \
+    uv pip install -r requirements.txt && \
+    uv pip install . && \
+    rm -rf /tmp/ucx_source /tmp/nixl_source

 # FIX triton
 RUN --mount=type=cache,target=/root/.cache/uv \
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -135,7 +135,10 @@ _NIXL_SUPPORTED_DEVICE = {
        "cpu",
    ),
    "tpu": ("cpu",),
-    "xpu": ("cpu",),
+    "xpu": (
+        "cpu",
+        "xpu",
+    ),
    "cpu": ("cpu",),
 }
 # support for oot platform by providing mapping in current_platform
@@ -945,7 +948,7 @@ class NixlConnectorWorker:
        # type based on kv_buffer_device
        nixl_memory_type = current_platform.get_nixl_memory_type()
        if nixl_memory_type is None:
-            if self.kv_buffer_device == "cuda":
+            if self.kv_buffer_device in ["cuda", "xpu"]:
                nixl_memory_type = "VRAM"
            elif self.kv_buffer_device == "cpu":
                nixl_memory_type = "DRAM"
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -221,6 +221,12 @@ class XPUPlatform(Platform):
                vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )

+        # In some cases, the internal memory type cache can misdetect GPU
+        # memory as host memory, also leading to invalid memory access.
+        # This cache can be disabled by setting UCX_MEMTYPE_CACHE=n.
+        # ref. https://openucx.readthedocs.io/en/master/faq.html
+        os.environ["UCX_MEMTYPE_CACHE"] = "n"
+
    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        return True