[AMD][Build] Add DeepEP to ROCm Dockerfile (#36086)
Signed-off-by: Ryan Rock <ryan.rock@amd.com>
This commit is contained in:
@@ -2071,6 +2071,14 @@ steps:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels FP8 MoE Test
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
|
||||
- label: Kernels Mamba Test # 31min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@@ -3801,6 +3809,14 @@ steps:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels FP8 MoE Test
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi355_2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
|
||||
- label: Kernels Mamba Test # 31min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
|
||||
@@ -184,6 +184,34 @@ RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
||||
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
||||
|
||||
# DeepEP build stage
|
||||
FROM base AS build_deep
|
||||
ARG ROCSHMEM_BRANCH="ba0bf0f3"
|
||||
ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
|
||||
ARG DEEPEP_BRANCH="e84464ec"
|
||||
ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
|
||||
ARG DEEPEP_NIC="cx7"
|
||||
ENV ROCSHMEM_DIR=/opt/rocshmem
|
||||
|
||||
RUN git clone ${ROCSHMEM_REPO} \
|
||||
&& cd rocm-systems \
|
||||
&& git checkout ${ROCSHMEM_BRANCH} \
|
||||
&& mkdir -p projects/rocshmem/build \
|
||||
&& cd projects/rocshmem/build \
|
||||
&& cmake .. \
|
||||
-DCMAKE_INSTALL_PREFIX="${ROCSHMEM_DIR}" \
|
||||
-DROCM_PATH=/opt/rocm \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DUSE_EXTERNAL_MPI=OFF \
|
||||
&& make -j \
|
||||
&& make install
|
||||
|
||||
# Build DeepEP wheel.
|
||||
# DeepEP looks for rocshmem at ROCSHMEM_DIR.
|
||||
RUN git clone ${DEEPEP_REPO} \
|
||||
&& cd DeepEP \
|
||||
&& git checkout ${DEEPEP_BRANCH} \
|
||||
&& python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
|
||||
|
||||
# -----------------------
|
||||
# vLLM wheel release build stage (for building distributable wheels)
|
||||
@@ -305,6 +333,11 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
|
||||
uv pip install --system /rixl_install/*.whl
|
||||
|
||||
# Install DeepEP wheel
|
||||
RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \
|
||||
uv pip install --system /deep_install/*.whl
|
||||
COPY --from=build_deep /opt/rocshmem /opt/rocshmem
|
||||
|
||||
# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
|
||||
RUN apt-get update -q -y && apt-get install -q -y \
|
||||
librdmacm1 \
|
||||
|
||||
Reference in New Issue
Block a user