#!/bin/bash

# This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
#
###############################################################################
# QUOTING / COMMAND PASSING
#
# Passing commands as positional arguments ($*) is fragile when the command
# string itself contains double quotes, e.g.:
#
#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
#
# The outer shell resolves the nested quotes *before* this script runs, so
# the script receives mangled input it cannot fully recover.
#
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
#
#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
#   bash run-amd-test.sh
#
# Single-quoted assignment preserves all inner double quotes verbatim.
# The $* path is kept for backward compatibility, but callers should migrate.
###############################################################################

# A pipeline fails if any stage fails, not only the last one.
set -o pipefail

# Export Python path
export PYTHONPATH=".."

###############################################################################
# Helper Functions
###############################################################################
#######################################
# Block until /opt/amdgpu/etc/gpu_state contains the word "clean".
# Arguments: $1 - timeout in seconds (default: 300)
# Outputs:   progress messages on stdout, timeout error on stderr
# Returns:   0 once the state is clean; EXITS the script with status 1
#            if the timeout elapses first.
#######################################
wait_for_clean_gpus() {
  local timeout=${1:-300}
  # SECONDS is bash's built-in elapsed-time counter; remember where we began.
  local start=$SECONDS

  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
  while true; do
    if grep -q clean /opt/amdgpu/etc/gpu_state; then
      echo "GPUs state is \"clean\""
      return 0
    fi
    if (( SECONDS - start >= timeout )); then
      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
      exit 1
    fi
    sleep 3
  done
}

#######################################
# Prune Docker images/volumes when the filesystem that holds Docker's root
# directory is more than 70% full.
# Arguments: none
# Outputs:   status messages on stdout, warnings on stderr
# Returns:   0 normally; EXITS the script with status 1 if the Docker root
#            directory cannot be determined.
#######################################
cleanup_docker() {
  local docker_root disk_usage
  local -r threshold=70

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [[ -z "$docker_root" ]]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # -P forces the POSIX one-line-per-filesystem format so that long device
  # names cannot wrap and shift the capacity column out of field 5.
  disk_usage=$(df -P "$docker_root" \
    | awk 'NR > 1 { use = $5 } END { sub(/%/, "", use); print use }')
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    echo "Warning: could not determine disk usage for $docker_root; skipping cleanup." >&2
    return 0
  fi

  if (( disk_usage > threshold )); then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    docker image prune -f
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}

#######################################
# Stop the per-node containers (node0, node1, ...) and remove the
# "docker-net" network used by multi-node runs. Best effort: failures of
# `docker stop` / `docker network rm` are deliberately ignored.
# Globals:   NUM_NODES (read) - number of node containers, default 2
# Arguments: none
# Returns:   0
#######################################
cleanup_network() {
  local max_nodes=${NUM_NODES:-2}
  local node

  for (( node = 0; node < max_nodes; node++ )); do
    # Only stop containers that actually exist (running or not).
    if docker ps -a -q -f name="node${node}" | grep -q .; then
      docker stop "node${node}" || true
    fi
  done

  if docker network ls | grep -q docker-net; then
    docker network rm docker-net || true
  fi
}

#######################################
# Decide whether a command string describes a multi-node job.
# Globals:   NUM_NODES (read) - treated as 1 when unset/empty
# Arguments: $1 - the full test command string
# Returns:   0 if multi-node, 1 otherwise
#######################################
is_multi_node() {
  local cmds="$1"
  # Kept in a variable so the regex is not subject to shell quoting rules.
  # Pattern: [...] && [...] (per-node command arrays)
  local bracket_re='\[.*\].*&&.*\[.*\]'

  # Primary signal: NUM_NODES environment variable set by the pipeline
  if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
    return 0
  fi

  # Fallback: detect the bracket syntax structurally
  if [[ "$cmds" =~ $bracket_re ]]; then
    return 0
  fi

  return 1
}
###############################################################################
# Pytest marker/keyword re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around multi-word pytest -m/-k expressions get stripped:
#   pytest -v -s -m 'not cpu_test' v1/core
# becomes:
#   pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
#
# This function detects unquoted expressions after -m/-k and re-quotes them
# by collecting tokens until a recognizable boundary is reached:
#   - test path (contains '/')
#   - test file (ends with '.py')
#   - another pytest flag (--xxx or -x single-char flags)
#   - command separator (&& || ; |)
#   - environment variable assignment (FOO=bar)
#
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
# unquoted since they have no spaces and work fine.
#
# Already-quoted expressions (first token contains a literal single OR double
# quote) are passed through untouched. This avoids double-quoting values
# injected by apply_rocm_test_overrides as well as expressions such as
# -m "not slow" that arrive intact through VLLM_TEST_COMMANDS.
#
# "-m" directly after a python interpreter (python -m pip install ...) is
# the interpreter's module flag, not a pytest marker, and is left alone.
#
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
# double-quotes stripped by the calling shell (see header comment).
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
###############################################################################

# Print a collected marker expression, wrapped in single quotes when it
# contains a space or "(" and therefore needs quoting to stay one argument.
_format_marker_expr() {
  local expr="$1"
  if [[ "$expr" == *" "* || "$expr" == *"("* ]]; then
    printf "'%s'" "$expr"
  else
    printf '%s' "$expr"
  fi
}

#######################################
# Re-quote multi-word pytest -m/-k expressions in a command string.
# Arguments: $1 - command string (may span several lines)
# Outputs:   the rewritten command on stdout, tokens joined by single spaces
# Returns:   0
#######################################
re_quote_pytest_markers() {
  local input="$1"
  local output=""
  local marker_buf=""
  local prev_word=""
  local collecting=false
  local is_boundary=false
  local had_noglob=false
  local word
  local -a words=()

  # Flatten newlines for consistent tokenization
  local flat="${input//$'\n'/ }"

  # Disable globbing while tokenizing so *.py etc. cannot expand, then put
  # the option back the way the caller had it.
  [[ $- == *f* ]] && had_noglob=true
  set -o noglob
  read -ra words <<< "$flat"
  $had_noglob || set +o noglob

  for word in "${words[@]}"; do
    if $collecting; then
      if [[ "$word" == *"'"* || "$word" == *'"'* ]]; then
        # The expression was already quoted upstream: emit what we have
        # verbatim and stop collecting.
        if [[ -n "$marker_buf" ]]; then
          # Should not normally happen (partial buf + quote), flush raw
          output+="${marker_buf} "
          marker_buf=""
        fi
        output+="${word} "
        collecting=false
      else
        is_boundary=false
        case "$word" in
          # Command separators
          '&&' | '||' | ';' | '|') is_boundary=true ;;
          # Long flags (--ignore, --shard-id, etc.)
          --*) is_boundary=true ;;
          # Short flags (-v, -s, -x, ...); includes -m/-k, handled below
          -[a-zA-Z]) is_boundary=true ;;
          # Test path (contains /)
          */*) is_boundary=true ;;
          # Test file (ends with .py, possibly with ::method)
          *.py | *.py::*) is_boundary=true ;;
          # Environment variable assignment preceding a command (FOO=bar).
          # Lower-case names such as num_gpus=2 stay part of the expression.
          *=*)
            if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
              is_boundary=true
            fi
            ;;
        esac

        if $is_boundary; then
          # Flush the collected marker expression (if any)
          if [[ -n "$marker_buf" ]]; then
            output+="$(_format_marker_expr "$marker_buf") "
          fi
          marker_buf=""
          output+="${word} "
          # The boundary word may itself start a new -m/-k expression
          if [[ "$word" == "-m" || "$word" == "-k" ]]; then
            collecting=true
          else
            collecting=false
          fi
        else
          # Accumulate into marker buffer
          marker_buf+="${marker_buf:+ }${word}"
        fi
      fi
    elif [[ "$word" == "-k" ]] \
      || [[ "$word" == "-m" && "${prev_word##*/}" != python* ]]; then
      output+="${word} "
      collecting=true
      marker_buf=""
    else
      output+="${word} "
    fi
    prev_word="$word"
  done

  # Flush any trailing marker expression (marker at end of command)
  if $collecting && [[ -n "$marker_buf" ]]; then
    output+="$(_format_marker_expr "$marker_buf")"
  fi

  # Every token was appended with a trailing space; drop the last one.
  printf '%s\n' "${output% }"
}
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
# Print " --ignore=<dir>/<file>" for every file name given after the
# directory, each preceded by a single space so the result can be appended.
_rocm_ignore_flags() {
  local dir="$1"
  shift
  local flags="" file
  for file in "$@"; do
    flags+=" --ignore=${dir}/${file}"
  done
  printf '%s' "$flags"
}

#######################################
# Rewrite a test command string with ROCm-specific exclusions/overrides.
# Arguments: $1 - command string
# Outputs:   the rewritten command string on stdout
# Returns:   0
# Notes:
#   - Kernel ignores are appended to the END of the whole string, so they
#     attach to the last command when several are chained.
#   - Entrypoint ignores are inserted only where the directory is followed
#     by a space; a command that ENDS with the directory is left unchanged.
#######################################
apply_rocm_test_overrides() {
  local cmds="$1"
  local target replacement flags

  # --- Model registry filter ---
  target="pytest -v -s models/test_registry.py"
  if [[ "$cmds" == *"$target"* ]]; then
    replacement="${target} -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"
    cmds=${cmds//"$target"/"$replacement"}
  fi

  # --- LoRA: disable custom paged attention ---
  target="pytest -v -s lora"
  if [[ "$cmds" == *"$target"* ]]; then
    replacement="VLLM_ROCM_CUSTOM_PAGED_ATTN=0 ${target}"
    cmds=${cmds//"$target"/"$replacement"}
  fi

  # --- Kernel ignores ---
  if [[ "$cmds" == *" kernels/core"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/core \
      test_fused_quant_layernorm.py \
      test_permute_cols.py)
  fi

  if [[ "$cmds" == *" kernels/attention"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/attention \
      test_attention_selector.py \
      test_encoder_decoder_attn.py \
      test_flash_attn.py \
      test_flashinfer.py \
      test_prefix_prefill.py \
      test_cascade_flash_attn.py \
      test_mha_attn.py \
      test_lightning_attn.py \
      test_attention.py)
  fi

  if [[ "$cmds" == *" kernels/quantization"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/quantization \
      test_int8_quant.py \
      test_machete_mm.py \
      test_block_fp8.py \
      test_block_int8.py \
      test_marlin_gemm.py \
      test_cutlass_scaled_mm.py \
      test_int8_kernel.py)
  fi

  if [[ "$cmds" == *" kernels/mamba"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/mamba \
      test_mamba_mixer2.py \
      test_causal_conv1d.py \
      test_mamba_ssm_ssd.py)
  fi

  if [[ "$cmds" == *" kernels/moe"* ]]; then
    cmds+=$(_rocm_ignore_flags kernels/moe \
      test_moe.py \
      test_cutlass_moe.py \
      test_triton_moe_ptpc_fp8.py)
  fi

  # --- Entrypoint ignores ---
  target=" entrypoints/openai "
  if [[ "$cmds" == *"$target"* ]]; then
    flags=$(_rocm_ignore_flags entrypoints/openai \
      test_audio.py \
      test_shutdown.py \
      test_completion.py \
      test_models.py \
      test_lora_adapters.py \
      test_return_tokens_as_ids.py \
      test_root_path.py \
      test_tokenization.py \
      test_prompt_validation.py)
    replacement=" entrypoints/openai${flags} "
    cmds=${cmds//"$target"/"$replacement"}
  fi

  target=" entrypoints/llm "
  if [[ "$cmds" == *"$target"* ]]; then
    flags=$(_rocm_ignore_flags entrypoints/llm \
      test_chat.py \
      test_accuracy.py \
      test_init.py \
      test_prompt_validation.py)
    replacement=" entrypoints/llm${flags} "
    cmds=${cmds//"$target"/"$replacement"}
  fi

  # Collapse stray " \ " line-continuation residue into a single space.
  cmds=${cmds//' \ '/' '}

  printf '%s\n' "$cmds"
}
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus

echo "--- ROCm info"
rocminfo

# --- Docker housekeeping ---
cleanup_docker

# Writing "reset" to the state file requests a GPU reset; then wait until
# the file reports "clean" again before touching the devices.
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus

# --- Pull test image ---
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
# Random 10-character suffix keeps container names unique per run.
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

#######################################
# EXIT-trap handler: force-remove the test container.
# Globals:   container_name (read), image_name (read)
# Returns:   0 always (best-effort cleanup must not change the exit status)
# NOTE(review): because of the `||` chain the image is removed only when
# removing the container FAILS - confirm this is the intended fallback.
#######################################
remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT

# --- Prepare commands ---
echo "--- Running container"

# Host-side HuggingFace cache (under the invoking user's home directory),
# bind-mounted into the container at HF_MOUNT by the single-node docker run.
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# ---- Command source selection ----
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
# Fall back to $* for backward compatibility, but warn that inner
# double-quotes will have been stripped by the calling shell.
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
  commands="${VLLM_TEST_COMMANDS}"
  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
else
  # "$*" deliberately joins all positional arguments into ONE string.
  commands="$*"
  if [[ -z "$commands" ]]; then
    echo "Error: No test commands provided." >&2
    echo "Usage:" >&2
    echo "  Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
    echo "  Legacy:    bash $0 \"commands here\"" >&2
    exit 1
  fi
  echo "Commands sourced from positional args (legacy mode)"
  echo "WARNING: Inner double-quotes in the command string may have been"
  echo "  stripped by the calling shell. If you see syntax errors, switch to:"
  echo "  export VLLM_TEST_COMMANDS='your commands here'"
  echo "  bash $0"
fi

echo "Raw commands: $commands"

# Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
echo "After re-quoting: $commands"

commands=$(apply_rocm_test_overrides "$commands")
echo "Final commands: $commands"

# PYTHONPATH value handed to the container by the single-node docker run.
MYPYTHONPATH=".."

# Verify GPU access: the numeric gid of the host's "render" group is passed
# to `docker run --group-add` below; abort early if the group is missing.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# --- RDMA device passthrough (conditional) ---
# If the host has RDMA devices, pass them through so tests like
# test_moriio_connector can access ibverbs. On hosts without RDMA
# hardware the tests will gracefully skip via _rdma_available().
#
# RDMA_FLAGS is a plain string holding several docker flags; it is expanded
# UNQUOTED by the docker run invocation so that it splits into words.
RDMA_FLAGS=""
if [[ -d /dev/infiniband ]]; then
  echo "RDMA devices detected on host, enabling passthrough"
  RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
else
  echo "No RDMA devices found on host, RDMA tests will be skipped"
fi

# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
  echo "--- Multi-node job detected"
  # Assign first, export second, so the pipeline's status is not masked.
  DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
  export DCKR_VER

  # Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
  #   BASH_REMATCH[1] = prefix (everything before first bracket)
  #   BASH_REMATCH[2] = comma-separated node0 commands
  #   BASH_REMATCH[3] = comma-separated node1 commands
  multi_node_re='^(.*)\[(.*)\] && \[(.*)\]$'
  if [[ "$commands" =~ $multi_node_re ]]; then
    prefix=${BASH_REMATCH[1]//;/}
    node0_csv=${BASH_REMATCH[2]}
    node1_csv=${BASH_REMATCH[3]}
    # NOTE(review): prefix is only logged; it is not added to the commands
    # that are executed below - confirm that is intended.
    echo "PREFIX: ${prefix}"

    export composite_command="(command rocm-smi || true)"

    # Split each node's list on commas (IFS is scoped to the read).
    IFS=',' read -ra node0 <<< "$node0_csv"
    IFS=',' read -ra node1 <<< "$node1_csv"

    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
    fi

    for i in "${!node0[@]}"; do
      # Drop literal double quotes; each command is re-wrapped in single
      # quotes when the step command line is assembled.
      command_node_0=${node0[i]//\"/}
      command_node_1=${node1[i]//\"/}

      step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
      echo "COMMANDS: ${step_cmd}"
      composite_command="${composite_command} && ${step_cmd}"
    done

    /bin/bash -c "${composite_command}"
    # Capture the test status BEFORE cleanup; otherwise the script would
    # exit with cleanup_network's status (0) and hide test failures.
    exit_code=$?
    cleanup_network
    exit "$exit_code"
  else
    echo "Multi-node job detected but failed to parse bracket command syntax."
    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
    echo "Got: $commands"
    cleanup_network
    exit 111
  fi
else
  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  # BUILDKITE_AGENT_META_DATA_RENDER_DEVICES and RDMA_FLAGS each hold several
  # docker flags in one string and must be word-split, hence unquoted.
  # shellcheck disable=SC2086
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    $RDMA_FLAGS \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi