#!/bin/bash
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail

# Export Python path so tests resolve modules relative to the repo root.
# (No spaces around '=' — 'export PYTHONPATH = ".."' would fail with
# "export: `=': not a valid identifier".)
export PYTHONPATH=".."

# Wait for the host tooling to report a clean GPU state before starting.
# /opt/amdgpu/etc/gpu_state is maintained by the node's GPU management agent.
echo "--- Confirming Clean Initial State"
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

# Print ROCm version / device information for the build log.
echo "--- ROCm info"
rocminfo
# cleanup older docker images when the Docker partition is getting full
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Disk usage (percent, numeric) of the filesystem holding Docker's root
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Cleanup triggers when usage exceeds this percentage
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"

# Ask the host agent to reset the GPUs, then poll until it reports "clean".
echo "reset" > /opt/amdgpu/etc/gpu_state

while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Pulling container"

# CI image tagged with the commit under test; container name gets a random
# suffix so concurrent jobs on the same host cannot collide.
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

docker pull "${image_name}"

# Best-effort cleanup of the container (and image, if the container is already
# gone) on any exit path; '|| true' keeps the trap from failing the script.
remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
echo "--- Running container"

# Host-side HuggingFace cache, bind-mounted into the container at HF_MOUNT
# so model downloads persist across runs.
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# Test command line passed through from the CI step ("$*" joins all args
# with spaces — same result as the unquoted $@ assignment, but lint-clean).
commands="$*"
echo "Commands:$commands"
# The rewrites below adapt the incoming pytest command line for ROCm:
# deselect or ignore tests that are not currently supported on this platform.

# No-op placeholder kept so a ROCm-specific override can be re-added easily.
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi

# No-op placeholder kept so a ROCm-specific override can be re-added easily.
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}

if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi

#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
  --ignore=kernels/core/test_fused_quant_layernorm.py \
  --ignore=kernels/core/test_permute_cols.py"
fi

if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
  --ignore=kernels/attention/test_flash_attn.py \
  --ignore=kernels/attention/test_flashinfer.py \
  --ignore=kernels/attention/test_prefix_prefill.py \
  --ignore=kernels/attention/test_cascade_flash_attn.py \
  --ignore=kernels/attention/test_mha_attn.py \
  --ignore=kernels/attention/test_lightning_attn.py \
  --ignore=kernels/attention/test_attention.py"
fi

if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
  --ignore=kernels/quantization/test_int8_quant.py \
  --ignore=kernels/quantization/test_machete_mm.py \
  --ignore=kernels/quantization/test_block_fp8.py \
  --ignore=kernels/quantization/test_block_int8.py \
  --ignore=kernels/quantization/test_marlin_gemm.py \
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
  --ignore=kernels/quantization/test_int8_kernel.py"
fi

if [[ $commands == *" kernels/mamba"* ]]; then
  commands="${commands} \
  --ignore=kernels/mamba/test_mamba_mixer2.py \
  --ignore=kernels/mamba/test_causal_conv1d.py \
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi

if [[ $commands == *" kernels/moe"* ]]; then
  commands="${commands} \
  --ignore=kernels/moe/test_moe.py \
  --ignore=kernels/moe/test_cutlass_moe.py \
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi

#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
  --ignore=entrypoints/openai/test_root_path.py \
  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
fi

#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
fi

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
# Number of parallel shards to launch when the command line is sharded.
PARALLEL_JOB_COUNT=8
# PYTHONPATH exported inside the container (repo root relative to the tests dir).
MYPYTHONPATH=".."

# Test that we're launching on the machine that has
# proper access to GPUs: containers need membership in the 'render'
# group to open the DRM render nodes.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT}/g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
    # assign shard-id for each shard, one shard per GPU
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU}/g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    # Run each shard in the background; output lines are prefixed with the
    # shard id so the interleaved logs remain readable.
    # shellcheck disable=SC2086 — RENDER_DEVICES intentionally word-splits into multiple --device flags
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      -e "PYTHONPATH=${MYPYTHONPATH}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  # pytest exit code 5 means "no tests collected": treat it as success per
  # shard, but fail the build if EVERY shard collected nothing.
  at_least_one_shard_with_tests=0
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    elif [[ ${st} -eq 5 ]]; then
      echo "Shard exited with status 5 (no tests collected) - treating as success"
    else # This means st is 0
      at_least_one_shard_with_tests=1
    fi
  done
  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
    echo "All shards reported no tests collected. Failing the build."
    exit 1
  fi
else
  # Unsharded command: run once with all GPUs visible.
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  # shellcheck disable=SC2086 — RENDER_DEVICES intentionally word-splits into multiple --device flags
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi