Enabling "2 node" distributed tests in the AMD CI pipeline. (#32719)
Signed-off-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com>
Co-authored-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
This commit is contained in:
committed by
GitHub
parent
1cbccb6dba
commit
3c3c547ce0
@@ -44,6 +44,17 @@ cleanup_docker() {
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Tear down the multi-node test environment.
#
# Stops every container named node0 .. node<NUM_NODES-1> that
# `docker ps -a` reports, then removes the `docker-net` network if
# `docker network ls` lists it.
#
# Globals:   NUM_NODES (read) - number of node containers to look for;
#                               treated as 0 when unset or empty.
# Arguments: none
# Outputs:   whatever the docker commands print (including the matching
#            `docker network ls` line).
#######################################
cleanup_network() {
  local node
  local num_nodes=${NUM_NODES:-0}

  for ((node = 0; node < num_nodes; node++)); do
    # `docker ps -a -q -f name=...` prints one ID per matching container;
    # `grep -q .` succeeds only when at least one ID was printed.
    # (The previous `docker pr` was a typo: it is not a docker subcommand,
    # so the check always failed and no container was ever stopped.)
    if docker ps -a -q -f name="node${node}" | grep -q .; then
      docker stop "node${node}"
    fi
  done

  if docker network ls | grep docker-net; then
    docker network rm docker-net
  fi
}
|
||||
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
@@ -224,6 +235,35 @@ if [[ $commands == *"--shard-id="* ]]; then
|
||||
echo "All shards reported no tests collected. Failing the build."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
|
||||
|
||||
# Docker version string extracted from `docker --version`.
# NOTE(review): not used in the visible code; presumably read by a child
# script via the environment - confirm before changing.
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')

# Expected shape of $commands:  <prefix>[<node0 cmds>] && [<node1 cmds>]
# where each bracketed list is comma-separated; entry i of the first list
# is paired with entry i of the second list.
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
  # Everything before the first '[' with semicolons removed; only logged.
  prefix=${BASH_REMATCH[1]//;/}
  echo "PREFIX: ${prefix}"
  export composite_command="(command rocm-smi || true)"

  # Split on commas with IFS scoped to each `read`, so the global IFS is
  # never modified. The here-strings are quoted so the comma-separated
  # text reaches `read` intact on every bash version.
  IFS=',' read -ra node0 <<< "${BASH_REMATCH[2]}"
  IFS=',' read -ra node1 <<< "${BASH_REMATCH[3]}"

  if (( ${#node0[@]} != ${#node1[@]} )); then
    echo "Warning: node0 has ${#node0[@]} command(s) but node1 has ${#node1[@]}." >&2
  fi

  for i in "${!node0[@]}"; do
    # Drop every double quote, then let `read` trim leading/trailing
    # whitespace. No unquoted expansion, so characters such as '*' in a
    # test command are never glob-expanded here.
    IFS=$' \t\n' read -r command_node_0 <<< "${node0[i]//\"/}"
    IFS=$' \t\n' read -r command_node_1 <<< "${node1[i]//\"/}"

    # The per-node commands are wrapped in single quotes inside a string
    # that is later handed to `bash -c`; a command that itself contains a
    # single quote would break this quoting.
    export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
    echo "COMMANDS: ${commands}"
    composite_command="${composite_command} && ${commands}"
  done

  # Capture the status of the test run before cleaning up; otherwise the
  # status of cleanup_network would replace it and a failed run could go
  # unnoticed.
  multi_node_rc=0
  /bin/bash -c "${composite_command}" || multi_node_rc=$?
  cleanup_network
  if (( multi_node_rc != 0 )); then
    echo "Multi-node tests failed with exit code ${multi_node_rc}." >&2
    exit "${multi_node_rc}"
  fi
else
  echo "Failed to parse node commands! Exiting."
  cleanup_network
  exit 111
fi
|
||||
else
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
|
||||
Reference in New Issue
Block a user