diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 484167f46..6f4a0decf 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -44,6 +44,17 @@ cleanup_docker() {
   fi
 }
 
+cleanup_network() {
+  for node in $(seq 0 $((NUM_NODES-1))); do
+    if docker ps -a -q -f name="node${node}" | grep -q .; then
+      docker stop "node${node}"
+    fi
+  done
+  if docker network ls | grep docker-net; then
+    docker network rm docker-net
+  fi
+}
+
 # Call the cleanup docker function
 cleanup_docker
 
@@ -224,6 +235,35 @@ if [[ $commands == *"--shard-id="* ]]; then
     echo "All shards reported no tests collected. Failing the build."
     exit 1
   fi
+
+elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+
+  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
+
+  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
+    prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
+    echo "PREFIX: ${prefix}"
+    export composite_command="(command rocm-smi || true)"
+    myIFS=$IFS
+    IFS=','
+    read -ra node0 <<< ${BASH_REMATCH[2]}
+    read -ra node1 <<< ${BASH_REMATCH[3]}
+    IFS=$myIFS
+    for i in "${!node0[@]}";do
+      command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
+      command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
+
+      export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
+      echo "COMMANDS: ${commands}"
+      composite_command=$(echo "${composite_command} && ${commands}")
+    done
+    /bin/bash -c "${composite_command}"
+    cleanup_network
+  else
+    echo "Failed to parse node commands! Exiting."
+    cleanup_network
+    exit 111
+  fi
 else
   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index b4d8c7b86..6638e576a 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1278,7 +1278,7 @@ steps:
 
 - label: 2 Node Tests (4 GPUs in total) # 16min
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdmultinode]
   agent_pool: mi325_4
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -1292,15 +1292,15 @@ steps:
   - tests/distributed/
   - tests/examples/offline_inference/data_parallel.py
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
   - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
   - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 68min