[XPU][NIXL] Add GPUDirect RDMA support for XPU (#35270)

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
liuzhenwei
2026-03-03 08:42:49 +08:00
committed by GitHub
parent c8b678e53e
commit 9dd656f0ea
3 changed files with 62 additions and 5 deletions

View File

@@ -115,9 +115,57 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# install development dependencies (for testing)
RUN uv pip install -e tests/vllm_test_utils
# install nixl from source code
ENV NIXL_VERSION=0.7.0
RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
# install NIXL and UCX from source code
ARG UCX_VERSION=e5d98879705239d254ede40b4a52891850cb5349
ARG NIXL_VERSION=0.7.0
RUN apt-get update && apt-get install -y \
pciutils \
net-tools \
iproute2 \
hwloc \
numactl \
wget \
curl \
git \
build-essential \
autoconf \
automake \
libtool \
pkg-config \
rdma-core \
libibverbs-dev \
ibverbs-utils \
libibverbs1 \
librdmacm-dev \
librdmacm1 \
libibumad-dev \
libibumad3 \
libibmad-dev \
libibmad5 \
infiniband-diags \
perftest \
ibutils \
libmlx5-1 \
libmlx4-1 \
ibverbs-providers \
librdmacm1t64
ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:${PKG_CONFIG_PATH}
ENV LD_LIBRARY_PATH=/tmp/ucx_install/lib:${LD_LIBRARY_PATH}
RUN --mount=type=cache,target=/root/.cache/uv \
git clone https://github.com/openucx/ucx /tmp/ucx_source && \
cd /tmp/ucx_source && git checkout "${UCX_VERSION}" && \
bash autogen.sh && \
./configure --prefix=/tmp/ucx_install --with-ze=yes --enable-examples --enable-mt && \
make CFLAGS="-Wno-error=incompatible-pointer-types" -j8 && make install && \
git clone https://github.com/ai-dynamo/nixl /tmp/nixl_source && \
cd /tmp/nixl_source && git checkout "${NIXL_VERSION}" && \
cd /tmp/nixl_source && \
uv pip install --upgrade meson pybind11 patchelf && \
uv pip install -r requirements.txt && \
uv pip install . && \
rm -rf /tmp/ucx_source /tmp/nixl_source
# FIX triton
RUN --mount=type=cache,target=/root/.cache/uv \

View File

@@ -135,7 +135,10 @@ _NIXL_SUPPORTED_DEVICE = {
"cpu",
),
"tpu": ("cpu",),
"xpu": ("cpu",),
"xpu": (
"cpu",
"xpu",
),
"cpu": ("cpu",),
}
# support for oot platform by providing mapping in current_platform
@@ -945,7 +948,7 @@ class NixlConnectorWorker:
# type based on kv_buffer_device
nixl_memory_type = current_platform.get_nixl_memory_type()
if nixl_memory_type is None:
if self.kv_buffer_device == "cuda":
if self.kv_buffer_device in ["cuda", "xpu"]:
nixl_memory_type = "VRAM"
elif self.kv_buffer_device == "cpu":
nixl_memory_type = "DRAM"

View File

@@ -221,6 +221,12 @@ class XPUPlatform(Platform):
vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
)
# UCX's internal memory type cache can, in some cases, misdetect GPU
# memory as host memory, which leads to invalid memory access.
# Disable this cache by setting UCX_MEMTYPE_CACHE=n.
# ref. https://openucx.readthedocs.io/en/master/faq.html
os.environ["UCX_MEMTYPE_CACHE"] = "n"
@classmethod
def support_hybrid_kv_cache(cls) -> bool:
return True