#!/bin/bash

# This script runs test inside the corresponding ROCm docker container.
set -o pipefail

# Export Python path
export PYTHONPATH=".."

# Block until the GPU state file contains "clean", polling every 3 seconds.
# NOTE(review): nothing in this script writes "clean" itself; presumably a
# host-side agent maintains /opt/amdgpu/etc/gpu_state -- confirm.
echo "--- Confirming Clean Initial State"
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

# Print ROCm information
echo "--- ROCm info"
rocminfo
# cleanup older docker images
#######################################
# Prune Docker images and volumes when the filesystem that holds Docker's
# root directory is more than 70% full.
# Globals:   none
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 normally; 1 if the disk usage value cannot be parsed.
#            Exits the whole script with status 1 if Docker does not report
#            a root directory.
#######################################
cleanup_docker() {
  local docker_root disk_usage
  # Usage percentage above which the cleanup is triggered.
  local -r threshold=70

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [[ -z "$docker_root" ]]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Check disk usage of the filesystem where Docker's root directory is
  # located. -P keeps every filesystem on a single output line, so the fifth
  # column of the last line is the usage percentage.
  disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    # Without this guard a non-numeric value would be reported as
    # "below threshold" after a test error.
    echo "Failed to determine disk usage for $docker_root."
    return 1
  fi

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above ${threshold}%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below ${threshold}%. No cleanup needed."
  fi
}
# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

# Write "reset" to the GPU state file, then poll every 3 seconds until the
# file contains "clean" again.
# NOTE(review): presumably a host-side agent reacts to "reset" and rewrites
# the file once the GPUs are ready -- confirm.
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Pulling container"
# The image tag is the commit under test; the container name gets a random
# 10-character alphanumeric suffix.
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
#######################################
# Best-effort cleanup run when the script exits.
# Force-removes the test container; only if that removal fails does it try
# to remove the image instead. Never fails.
# Globals:   container_name (read), image_name (read)
# Arguments: none
# Returns:   0 always
#######################################
remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
echo "--- Running container"

# Host directory for the Hugging Face cache and the path it is mounted at
# inside the container.
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# All script arguments joined into one string; it is later handed to
# `bash -c` inside the container.
commands="$*"
echo "Commands: $commands"
#######################################
# Rewrite selected pytest invocations inside the global `commands` string so
# that they run with extra environment variables or extra `-k 'not ...'`
# expressions. Invocations that do not occur in `commands` are left alone
# (a pattern substitution without a match is a no-op).
# Globals:   commands (read and rewritten in place)
# Arguments: none
#######################################
apply_rocm_test_overrides() {
  local target

  target="pytest -v -s basic_correctness/test_basic_correctness.py"
  commands=${commands//"$target"/"VLLM_USE_TRITON_FLASH_ATTN=0 $target"}

  target="pytest -v -s models/test_registry.py"
  commands=${commands//"$target"/"$target -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}

  target="VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"
  commands=${commands//"$target"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}

  target="pytest -v -s compile/test_basic_correctness.py"
  commands=${commands//"$target"/"VLLM_USE_TRITON_FLASH_ATTN=0 $target"}

  target="pytest -v -s lora"
  commands=${commands//"$target"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 $target"}
}

apply_rocm_test_overrides
#######################################
# Append `--ignore=<dir>/<file>` for every given file to the END of the
# global `commands` string, but only when `commands` contains " <dir>".
# Globals:   commands (read and modified)
# Arguments: $1 - test directory (e.g. kernels/core)
#            $2.. - test file names inside that directory
#######################################
_append_ignores_if_selected() {
  local suite_dir=$1
  shift
  local test_file

  [[ $commands == *" ${suite_dir}"* ]] || return 0
  for test_file in "$@"; do
    commands+=" --ignore=${suite_dir}/${test_file}"
  done
}

#######################################
# ignore certain kernels tests
# Globals:   commands (read and modified)
# Arguments: none
#######################################
append_kernel_ignores() {
  _append_ignores_if_selected kernels/core \
    test_fused_quant_layernorm.py \
    test_permute_cols.py

  _append_ignores_if_selected kernels/attention \
    test_attention_selector.py \
    test_encoder_decoder_attn.py \
    test_flash_attn.py \
    test_flashinfer.py \
    test_prefix_prefill.py \
    test_cascade_flash_attn.py \
    test_mha_attn.py \
    test_lightning_attn.py \
    test_attention.py

  _append_ignores_if_selected kernels/quantization \
    test_int8_quant.py \
    test_machete_mm.py \
    test_block_fp8.py \
    test_block_int8.py \
    test_marlin_gemm.py \
    test_cutlass_scaled_mm.py \
    test_int8_kernel.py

  _append_ignores_if_selected kernels/mamba \
    test_mamba_mixer2.py \
    test_causal_conv1d.py \
    test_mamba_ssm_ssd.py

  _append_ignores_if_selected kernels/moe \
    test_moe.py \
    test_cutlass_moe.py \
    test_triton_moe_ptpc_fp8.py
}

append_kernel_ignores
#######################################
# Insert `--ignore=<dir>/<file>` flags directly after every occurrence of
# " <dir> " (with a space on both sides) in the global `commands` string.
# An occurrence without a trailing space, e.g. at the very end of the
# string, does not match and is left untouched.
# Globals:   commands (read and rewritten in place)
# Arguments: $1 - test directory (e.g. entrypoints/openai)
#            $2.. - test file names inside that directory
#######################################
_insert_ignores_after() {
  local suite_dir=$1
  shift
  local test_file ignores=""

  for test_file in "$@"; do
    ignores+=" --ignore=${suite_dir}/${test_file}"
  done
  commands=${commands//" ${suite_dir} "/" ${suite_dir}${ignores} "}
}

#######################################
# ignore certain Entrypoints/openai and Entrypoints/llm tests
# Globals:   commands (read and rewritten in place)
# Arguments: none
#######################################
insert_entrypoint_ignores() {
  #ignore certain Entrypoints/openai tests
  _insert_ignores_after entrypoints/openai \
    test_audio.py \
    test_shutdown.py \
    test_completion.py \
    test_sleep.py \
    test_models.py \
    test_lora_adapters.py \
    test_return_tokens_as_ids.py \
    test_root_path.py \
    test_tokenization.py \
    test_prompt_validation.py

  #ignore certain Entrypoints/llm tests
  _insert_ignores_after entrypoints/llm \
    test_chat.py \
    test_accuracy.py \
    test_init.py \
    test_prompt_validation.py
}

insert_entrypoint_ignores

#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
PARALLEL_JOB_COUNT=8
# PYTHONPATH value passed into the container.
MYPYTHONPATH=".."

# Options shared by every `docker run` below.
# BUILDKITE_AGENT_META_DATA_RENDER_DEVICES is expanded unquoted on purpose so
# that it can contribute several words to the command line.
# NOTE(review): presumably it holds extra `--device ...` flags -- confirm.
# shellcheck disable=SC2206
docker_run_args=(
  --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES
  --network=host
  --shm-size=16gb
  --rm
  -e HF_TOKEN
  -e AWS_ACCESS_KEY_ID
  -e AWS_SECRET_ACCESS_KEY
  -v "${HF_CACHE}:${HF_MOUNT}"
  -e "HF_HOME=${HF_MOUNT}"
  -e "PYTHONPATH=${MYPYTHONPATH}"
)

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}

  PIDS=()
  for ((GPU = 0; GPU < PARALLEL_JOB_COUNT; GPU++)); do
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands: $commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    # One container per shard, each with its own HIP_VISIBLE_DEVICES value;
    # its combined stdout/stderr is prefixed with the shard number. The
    # whole pipeline runs in the background.
    docker run \
      "${docker_run_args[@]}" \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while IFS= read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done

  #wait for all processes to finish and collect exit codes
  STATUS=()
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done

  # Exit with the first non-zero shard status, if any.
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    "${docker_run_args[@]}" \
    -e HIP_VISIBLE_DEVICES=0 \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi