[ROCm][CI] Add rocm support for run-multi-node-test.sh (#31922)

Signed-off-by: charlifu <charlifu@amd.com>
Signed-off-by: Charlie Fu <Charlie.Fu@amd.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Charlie Fu
2026-01-07 22:36:39 -06:00
committed by GitHub
parent 087a138963
commit cddbc2b4b2

View File

@@ -2,6 +2,17 @@
set -euox pipefail
# To detect ROCm
# Check multiple indicators:
if [ -e /dev/kfd ] || \
[ -d /opt/rocm ] || \
command -v rocm-smi &> /dev/null || \
[ -n "$ROCM_HOME" ]; then
IS_ROCM=1
else
IS_ROCM=0
fi
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
@@ -26,13 +37,18 @@ for command in "${COMMANDS[@]}"; do
echo "$command"
done
start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}
start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
if [ "$IS_ROCM" -eq 1 ]; then
GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
else
GPU_DEVICES='--gpus "device='
fi
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
@@ -40,7 +56,9 @@ start_nodes() {
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
if [ "$IS_ROCM" -eq 0 ]; then
GPU_DEVICES+='"'
fi
# start the container in detached mode
# things to note:
@@ -49,7 +67,7 @@ start_nodes() {
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
/bin/bash -c "tail -f /dev/null"