[AMD][Build] Add DeepEP to ROCm Dockerfile (#36086)

Signed-off-by: Ryan Rock <ryan.rock@amd.com>
This commit is contained in:
Ryan Rock
2026-03-12 16:33:32 -05:00
committed by GitHub
parent cc8f1f4764
commit a79c1c2c80
2 changed files with 49 additions and 0 deletions

View File

@@ -2071,6 +2071,14 @@ steps:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels FP8 MoE Test
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_moe.py
- label: Kernels Mamba Test # 31min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental, amdproduction]
@@ -3801,6 +3809,14 @@ steps:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels FP8 MoE Test
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi355_2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_moe.py
- label: Kernels Mamba Test # 31min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental, amdproduction]

View File

@@ -184,6 +184,34 @@ RUN cd /opt/rixl && mkdir -p /app/install && \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
# DeepEP build stage
FROM base AS build_deep
ARG ROCSHMEM_BRANCH="ba0bf0f3"
ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
ARG DEEPEP_BRANCH="e84464ec"
ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
ARG DEEPEP_NIC="cx7"
ENV ROCSHMEM_DIR=/opt/rocshmem
RUN git clone ${ROCSHMEM_REPO} \
&& cd rocm-systems \
&& git checkout ${ROCSHMEM_BRANCH} \
&& mkdir -p projects/rocshmem/build \
&& cd projects/rocshmem/build \
&& cmake .. \
-DCMAKE_INSTALL_PREFIX="${ROCSHMEM_DIR}" \
-DROCM_PATH=/opt/rocm \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DUSE_EXTERNAL_MPI=OFF \
&& make -j \
&& make install
# Build DeepEP wheel.
# DeepEP looks for rocshmem at ROCSHMEM_DIR.
RUN git clone ${DEEPEP_REPO} \
&& cd DeepEP \
&& git checkout ${DEEPEP_BRANCH} \
&& python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
# -----------------------
# vLLM wheel release build stage (for building distributable wheels)
@@ -305,6 +333,11 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
uv pip install --system /rixl_install/*.whl
# Install DeepEP wheel
RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \
uv pip install --system /deep_install/*.whl
COPY --from=build_deep /opt/rocshmem /opt/rocshmem
# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
RUN apt-get update -q -y && apt-get install -q -y \
librdmacm1 \