diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 6f4a0decf..f36909396 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -87,7 +87,7 @@ mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" commands=$@ -echo "Commands:$commands" +echo "Raw commands: $commands" commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"} @@ -169,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_prompt_validation.py "} fi +commands=$(echo "$commands" | sed 's/ \\ / /g') +echo "Final commands: $commands" + # --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py @@ -176,7 +179,6 @@ fi # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 -PARALLEL_JOB_COUNT=8 MYPYTHONPATH=".." # Test that we're launching on the machine that has @@ -187,56 +189,7 @@ if [[ -z "$render_gid" ]]; then exit 1 fi -# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. -if [[ $commands == *"--shard-id="* ]]; then - # assign job count as the number of shards used - commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g') - for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do - # assign shard-id for each shard - commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g') - echo "Shard ${GPU} commands:$commands_gpu" - echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" - docker run \ - --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ - --network=host \ - --shm-size=16gb \ - --group-add "$render_gid" \ - --rm \ - -e HIP_VISIBLE_DEVICES="${GPU}" \ - -e HF_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -v "${HF_CACHE}:${HF_MOUNT}" \ - -e "HF_HOME=${HF_MOUNT}" \ - -e "PYTHONPATH=${MYPYTHONPATH}" \ - --name "${container_name}_${GPU}" \ - "${image_name}" \ - /bin/bash -c "${commands_gpu}" \ - |& while read -r line; do echo ">>Shard $GPU: $line"; done & - PIDS+=($!) - done - #wait for all processes to finish and collect exit codes - for pid in "${PIDS[@]}"; do - wait "${pid}" - STATUS+=($?) - done - at_least_one_shard_with_tests=0 - for st in "${STATUS[@]}"; do - if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then - echo "One of the processes failed with $st" - exit "${st}" - elif [[ ${st} -eq 5 ]]; then - echo "Shard exited with status 5 (no tests collected) - treating as success" - else # This means st is 0 - at_least_one_shard_with_tests=1 - fi - done - if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then - echo "All shards reported no tests collected. Failing the build." - exit 1 - fi - -elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then +if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/') diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index a14dcd030..ee7c6ab0a 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -542,7 +542,7 @@ steps: - label: LoRA Test %N # 20min each timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - vllm/lora @@ -636,7 +636,7 @@ steps: - label: Kernels Attention Test %N # 23min timeout_in_minutes: 35 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/attention/ @@ -651,7 +651,7 @@ steps: - label: Kernels Quantization Test %N # 64min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/quantization/ @@ -664,7 +664,7 @@ steps: - label: Kernels MoE Test %N # 40min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ @@ -742,7 +742,7 @@ steps: - label: Benchmarks # 11min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: @@ -753,7 +753,7 @@ steps: - label: Benchmarks CLI Test # 7min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - vllm/ @@ -827,7 +827,7 @@ steps: - label: Basic Models Tests (Extra Initialization) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking torch_nightly: true source_file_dependencies: @@ -888,7 +888,7 @@ steps: - label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking torch_nightly: true source_file_dependencies: @@ -909,7 +909,7 @@ steps: - label: Language Models Tests (Hybrid) %N timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking torch_nightly: true source_file_dependencies: