519 lines
17 KiB
Bash
Executable File
519 lines
17 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
# This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
#
###############################################################################
# QUOTING / COMMAND PASSING
#
# Passing commands as positional arguments ($*) is fragile when the command
# string itself contains double quotes, e.g.:
#
#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
#
# The outer shell resolves the nested quotes *before* this script runs, so
# the script receives mangled input it cannot fully recover.
#
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
#
#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
#   bash run-amd-test.sh
#
# Single-quoted assignment preserves all inner double quotes verbatim.
# The $* path is kept for backward compatibility but callers should migrate.
###############################################################################
|
|
# A pipeline fails when any of its stages fails, not only the last one.
set -o pipefail

# Put ".." (relative to the working directory) on the Python import path and
# export it so child processes inherit the setting.
PYTHONPATH=".."
export PYTHONPATH
|
|
|
|
###############################################################################
|
|
# Helper Functions
|
|
###############################################################################
|
|
|
|
#######################################
# Block until the GPU state file contains the word "clean", or abort when the
# timeout elapses.
# Arguments:
#   $1 - timeout in seconds (default: 300)
#   $2 - path of the GPU state file (default: /opt/amdgpu/etc/gpu_state)
#   $3 - seconds to sleep between polls (default: 3)
# Outputs:
#   Progress messages on stdout; errors on stderr.
# Returns:
#   0 once the state file contains "clean". Exits the whole script with
#   status 1 on timeout or when the timeout is not a non-negative integer.
#######################################
wait_for_clean_gpus() {
  local timeout=${1:-300}
  local state_file=${2:-/opt/amdgpu/etc/gpu_state}
  local poll_interval=${3:-3}
  local start=$SECONDS

  # An unvalidated value would be arithmetic-evaluated below; a stray word
  # silently evaluates to 0 and causes an immediate, confusing timeout.
  if [[ ! "$timeout" =~ ^[0-9]+$ ]]; then
    echo "Error: invalid timeout '${timeout}' (expected a whole number of seconds)" >&2
    exit 1
  fi
  # Force base 10 so values such as "08" are not parsed as invalid octal.
  timeout=$((10#$timeout))

  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
  while true; do
    if grep -q clean "$state_file"; then
      echo "GPUs state is \"clean\""
      return 0
    fi
    if (( SECONDS - start >= timeout )); then
      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
      exit 1
    fi
    sleep "$poll_interval"
  done
}
|
|
|
|
#######################################
# Prune Docker images and volumes when the filesystem holding Docker's root
# directory is fuller than a threshold.
# Arguments:
#   $1 - usage threshold in percent (default: 70)
# Outputs:
#   Status messages on stdout; failures on stderr.
# Returns:
#   0 after pruning or when no pruning is needed (also when the disk usage
#   cannot be determined, in which case cleanup is skipped). Exits the script
#   with status 1 when Docker's root directory cannot be determined.
#######################################
cleanup_docker() {
  local threshold=${1:-70}
  local docker_root disk_usage

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [[ -z "$docker_root" ]]; then
    echo "Failed to determine Docker root directory." >&2
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # -P forces the POSIX one-line-per-filesystem layout; without it a long
  # device name wraps and the "Use%" column is no longer field 5.
  disk_usage=$(df -P "$docker_root" \
    | awk 'NR > 1 { usage = $5 } END { sub(/%/, "", usage); print usage }')
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    # Housekeeping is best-effort: report and carry on rather than abort.
    echo "Warning: could not determine disk usage for $docker_root; skipping cleanup." >&2
    return 0
  fi

  if (( disk_usage > threshold )); then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    docker image prune -f
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
|
|
|
|
#######################################
# Stop the per-node containers (node0 .. node<N-1>) and remove the
# "docker-net" network left behind by a multi-node run. Every step is
# best-effort; failures of `docker stop` / `docker network rm` are ignored.
# Globals:
#   NUM_NODES (read) - number of node containers to look for (default: 2)
# Returns:
#   0
#######################################
cleanup_network() {
  local max_nodes=${NUM_NODES:-2}
  local node networks

  for (( node = 0; node < max_nodes; node++ )); do
    # Capture the output instead of piping into `grep -q`: under
    # `set -o pipefail` the producer can be killed by SIGPIPE once grep exits
    # early, which makes the pipeline "fail" and silently skips the cleanup.
    if [[ -n "$(docker ps -a -q -f "name=node${node}")" ]]; then
      docker stop "node${node}" || true
    fi
  done

  networks=$(docker network ls)
  if [[ "$networks" == *docker-net* ]]; then
    docker network rm docker-net || true
  fi
  return 0
}
|
|
|
|
#######################################
# Decide whether a command string describes a multi-node job.
# Globals:
#   NUM_NODES (read) - primary signal; values greater than 1 mean multi-node
# Arguments:
#   $1 - the command string to inspect
# Returns:
#   0 if NUM_NODES > 1 or the string contains the per-node bracket syntax
#   "[...] && [...]"; 1 otherwise.
#######################################
is_multi_node() {
  local cmds="$1"
  local num_nodes="${NUM_NODES:-1}"
  # Pattern: [...] && [...]  (per-node command arrays)
  local bracket_re='\[.*\].*&&.*\[.*\]'

  # Primary signal: NUM_NODES environment variable set by the pipeline.
  # Validate before comparing: `[[ x -gt 1 ]]` arithmetic-evaluates its
  # operands, so a malformed value would raise an error or be misread.
  if [[ "$num_nodes" =~ ^[0-9]+$ ]]; then
    if (( 10#$num_nodes > 1 )); then
      return 0
    fi
  else
    echo "Warning: ignoring non-numeric NUM_NODES='${num_nodes}'" >&2
  fi

  # Fallback: detect the bracket syntax structurally
  if [[ "$cmds" =~ $bracket_re ]]; then
    return 0
  fi
  return 1
}
|
|
|
|
#######################################
# Terminate the script with the given test exit code, mapping pytest's
# "no tests collected" status (5) to success.
# Arguments:
#   $1 - exit code of the test command
# Outputs:
#   A note on stdout when code 5 is remapped; an error on stderr when the
#   argument is missing or not a non-negative integer.
# Returns:
#   Never returns; always exits (0 for codes 0 and 5, the code itself
#   otherwise, 1 for an invalid argument).
#######################################
handle_pytest_exit() {
  local exit_code=${1:-}

  # Without this check a missing argument falls through to `exit ""`, which
  # reports a confusing "numeric argument required" instead of a failure
  # attributable to the caller.
  if [[ ! "$exit_code" =~ ^[0-9]+$ ]]; then
    echo "Error: handle_pytest_exit called with invalid exit code '${exit_code}'" >&2
    exit 1
  fi

  if (( 10#$exit_code == 5 )); then
    echo "Pytest exit code 5 (no tests collected) - treating as success."
    exit 0
  fi
  exit "$exit_code"
}
|
|
|
|
###############################################################################
# Pytest marker/keyword re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around multi-word pytest -m/-k expressions get stripped:
#   pytest -v -s -m 'not cpu_test' v1/core
# becomes:
#   pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
#
# This function detects unquoted expressions after -m/-k and re-quotes them
# by collecting tokens until a recognizable boundary is reached:
#   - test path (contains '/')
#   - test file (ends with '.py')
#   - another pytest flag (--xxx or -x single-char flags)
#   - command separator (&& || ; |)
#   - environment variable assignment (FOO=bar)
#
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
# unquoted since they have no spaces and work fine.
#
# Already-quoted expressions (containing literal single quotes) are passed
# through untouched to avoid double-quoting values injected by
# apply_rocm_test_overrides.
#
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
# double-quotes stripped by the calling shell (see header comment).
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
###############################################################################
|
|
# Print a collected -m/-k expression in the form it should appear on the
# command line: surrounding double quotes are stripped, and the expression is
# wrapped in single quotes when it contains a space or "(".
_format_marker_expr() {
  local expr="$1"
  # Without stripping, wrapping below would produce '"expr"' with literal
  # double-quote characters.
  if [[ "$expr" == '"'*'"' ]]; then
    expr="${expr#\"}"
    expr="${expr%\"}"
  fi
  if [[ "$expr" == *" "* || "$expr" == *"("* ]]; then
    printf "'%s'" "$expr"
  else
    printf '%s' "$expr"
  fi
}

#######################################
# Re-quote multi-word pytest -m/-k expressions whose quotes were stripped.
#
# Tokens following -m/-k are collected until a boundary token is seen
# (separator, flag, path containing "/", *.py file, VAR=value assignment);
# the collected expression is then emitted, single-quoted when it contains a
# space or "(". Expressions that already carry a single quote are left alone.
#
# A "-m" that belongs to a Python interpreter invocation (`python -m module`,
# optionally with interpreter flags such as -u in between) selects a module,
# not a pytest marker, and is passed through untouched. Interpreter options
# that take a separate argument (e.g. `-W ignore`) are not recognised.
#
# Arguments:
#   $1 - command string (may contain newlines / backslash-newline)
# Outputs:
#   The rewritten, single-line command string on stdout.
#######################################
re_quote_pytest_markers() {
  local input="$1"
  local output=""
  local marker_buf=""
  local collecting=false
  local after_python=false
  local hit_boundary word
  local -a words=()

  # Strip backslash-newline continuations, then flatten remaining newlines
  local flat="${input//$'\\\n'/ }"
  flat="${flat//$'\n'/ }"

  # `read` only performs word splitting, never pathname expansion, so tokens
  # such as *.py survive without toggling noglob. IFS is pinned so the split
  # does not depend on the caller's setting.
  IFS=$' \t\n' read -ra words <<< "$flat"

  for word in "${words[@]}"; do
    if $collecting; then
      # A literal single quote means the expression was already quoted
      # upstream: flush whatever was collected raw and stop collecting.
      if [[ "$word" == *"'"* ]]; then
        if [[ -n "$marker_buf" ]]; then
          output+="${marker_buf} "
          marker_buf=""
        fi
        output+="${word} "
        collecting=false
        after_python=false
        continue
      fi

      hit_boundary=false
      case "$word" in
        # Line-continuation artifact
        "\\")
          hit_boundary=true ;;
        # Command separators
        "&&" | "||" | ";" | "|")
          hit_boundary=true ;;
        # Long flags (--ignore, --shard-id, etc.)
        --*)
          hit_boundary=true ;;
        # Short flags (-v, -s, -x, ...), including -m/-k which start a new
        # expression via the common handling below.
        -[a-zA-Z])
          hit_boundary=true ;;
        # Test path (contains /)
        */*)
          hit_boundary=true ;;
        # Test file (ends with .py, possibly with ::method)
        *.py | *.py::*)
          hit_boundary=true ;;
        # Environment variable assignment (FOO=bar); lower-case names such as
        # num_gpus=2 are treated as part of the filter expression.
        *=*)
          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
            hit_boundary=true
          fi
          ;;
      esac

      if ! $hit_boundary; then
        # Accumulate into the marker buffer
        marker_buf+="${marker_buf:+ }${word}"
        continue
      fi

      # Flush the collected expression (nothing to emit if it is empty).
      if [[ -n "$marker_buf" ]]; then
        output+="$(_format_marker_expr "$marker_buf") "
      fi
      collecting=false
      marker_buf=""

      # Drop stray backslash tokens silently
      if [[ "$word" == "\\" ]]; then
        continue
      fi
      # Otherwise fall through: the boundary token is an ordinary token.
    fi

    if [[ "$word" == "-m" || "$word" == "-k" ]]; then
      output+="${word} "
      if [[ "$word" == "-m" ]] && $after_python; then
        # `python -m <module>`: the following tokens are a module name and
        # its arguments, not a marker expression.
        after_python=false
      else
        collecting=true
        marker_buf=""
      fi
      continue
    fi

    output+="${word} "
    # Track whether a later -m would still belong to a Python interpreter.
    case "$word" in
      -*) ;;  # interpreter/pytest options leave the state unchanged
      python | python[0-9]* | */python | */python[0-9]*)
        after_python=true ;;
      *)
        after_python=false ;;
    esac
  done

  # Flush any trailing marker expression (marker at end of command)
  if $collecting && [[ -n "$marker_buf" ]]; then
    output+="$(_format_marker_expr "$marker_buf")"
  fi

  printf '%s\n' "${output% }"
}
|
|
|
|
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
|
|
|
|
#######################################
# Rewrite a test command string with ROCm-specific filters, environment
# overrides and --ignore flags.
#   - models/test_registry.py gets a -k filter excluding four model classes.
#   - "pytest -v -s lora" is prefixed with VLLM_ROCM_CUSTOM_PAGED_ATTN=0.
#   - kernels/<suite> targets get --ignore flags appended to the END of the
#     whole command string.
#   - entrypoints/openai and entrypoints/llm targets get --ignore flags
#     inserted directly after the target, including when the target is the
#     last token of the command string.
# Arguments:
#   $1 - command string
# Outputs:
#   The rewritten command string on stdout.
#######################################
apply_rocm_test_overrides() {
  local cmds="$1"
  local ignored_test flags padded
  local stray_continuation=' \ '

  # --- Model registry filter ---
  local registry_cmd="pytest -v -s models/test_registry.py"
  local registry_filter="-k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"
  cmds=${cmds//"$registry_cmd"/"$registry_cmd $registry_filter"}

  # --- LoRA: disable custom paged attention ---
  local lora_cmd="pytest -v -s lora"
  cmds=${cmds//"$lora_cmd"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 $lora_cmd"}

  # --- Kernel ignores ---
  # Each entry is applied when its directory appears as " <dir>" in the
  # command string. The appended flags never re-trigger a match because they
  # are preceded by "=" rather than a space.
  local -a kernel_ignores=(
    kernels/core/test_fused_quant_layernorm.py
    kernels/core/test_permute_cols.py
    kernels/attention/test_attention_selector.py
    kernels/attention/test_encoder_decoder_attn.py
    kernels/attention/test_flash_attn.py
    kernels/attention/test_flashinfer.py
    kernels/attention/test_prefix_prefill.py
    kernels/attention/test_cascade_flash_attn.py
    kernels/attention/test_mha_attn.py
    kernels/attention/test_lightning_attn.py
    kernels/attention/test_attention.py
    kernels/quantization/test_int8_quant.py
    kernels/quantization/test_machete_mm.py
    kernels/quantization/test_block_fp8.py
    kernels/quantization/test_block_int8.py
    kernels/quantization/test_marlin_gemm.py
    kernels/quantization/test_cutlass_scaled_mm.py
    kernels/quantization/test_int8_kernel.py
    kernels/mamba/test_mamba_mixer2.py
    kernels/mamba/test_causal_conv1d.py
    kernels/mamba/test_mamba_ssm_ssd.py
    kernels/moe/test_moe.py
    kernels/moe/test_cutlass_moe.py
    kernels/moe/test_triton_moe_ptpc_fp8.py
  )
  for ignored_test in "${kernel_ignores[@]}"; do
    if [[ "$cmds" == *" ${ignored_test%/*}"* ]]; then
      cmds+=" --ignore=${ignored_test}"
    fi
  done

  # --- Entrypoint ignores ---
  local -a openai_ignores=(
    entrypoints/openai/chat_completion/test_audio.py
    entrypoints/openai/completion/test_shutdown.py
    entrypoints/openai/test_completion.py
    entrypoints/openai/test_models.py
    entrypoints/openai/test_lora_adapters.py
    entrypoints/openai/test_return_tokens_as_ids.py
    entrypoints/openai/chat_completion/test_root_path.py
    entrypoints/openai/test_tokenization.py
    entrypoints/openai/completion/test_prompt_validation.py
  )
  local -a llm_ignores=(
    entrypoints/llm/test_chat.py
    entrypoints/llm/test_accuracy.py
    entrypoints/llm/test_init.py
    entrypoints/llm/test_prompt_validation.py
  )

  # The match needs a space on both sides so that sub-paths such as
  # entrypoints/llm/test_x.py are not affected. Pad the string with one
  # trailing space so a target at the very end of the command still matches,
  # then remove the padding again.
  padded="${cmds} "
  flags=$(printf -- '--ignore=%s ' "${openai_ignores[@]}")
  padded=${padded//" entrypoints/openai "/" entrypoints/openai ${flags}"}
  flags=$(printf -- '--ignore=%s ' "${llm_ignores[@]}")
  padded=${padded//" entrypoints/llm "/" entrypoints/llm ${flags}"}
  cmds="${padded% }"

  # Clean up literal " \ " continuation artifacts in the command string
  cmds=${cmds//"$stray_continuation"/" "}

  printf '%s\n' "$cmds"
}
|
|
|
|
###############################################################################
|
|
# Main
|
|
###############################################################################
|
|
|
|
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus

echo "--- ROCm info"
rocminfo

# --- Docker housekeeping ---
cleanup_docker

echo "--- Resetting GPUs"
# Writing "reset" to the state file requests a reset; wait_for_clean_gpus then
# polls the same file until it reports "clean" again.
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus

# --- Pull test image ---
echo "--- Pulling container"
# The image tag and container name are derived from the commit. An empty
# value would yield the invalid reference "rocm/vllm-ci:", so fail early with
# a clear message instead of a cryptic docker error later on.
if [[ -z "${BUILDKITE_COMMIT:-}" ]]; then
  echo "Error: BUILDKITE_COMMIT is not set; cannot determine the test image tag." >&2
  exit 1
fi
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
# Random 10-character suffix keeps container names unique per run.
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
if ! docker pull "${image_name}"; then
  # Not fatal here: `docker run` below can still use a locally cached image
  # and will report its own error otherwise.
  echo "Warning: failed to pull ${image_name}; continuing with any local copy." >&2
fi
|
|
|
|
# Exit-time cleanup. Force-removes the test container; only when that command
# fails is the test image force-removed instead. Failures are swallowed so the
# handler always succeeds.
# NOTE(review): removing the image only as a fallback looks unusual - confirm
# this is the intended policy.
# Globals: container_name (read), image_name (read)
remove_docker_container() {
  if ! docker rm -f "${container_name}"; then
    docker image rm -f "${image_name}" || :
  fi
  return 0
}

# Run the cleanup on every exit path of the script.
trap remove_docker_container EXIT
|
|
|
|
# --- Prepare commands ---
echo "--- Running container"

# Host-side Hugging Face cache directory and its mount point in the container.
HF_MOUNT="/root/.cache/huggingface"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"

# ---- Command source selection ----
# VLLM_TEST_COMMANDS wins when set and non-empty because it keeps all inner
# quoting intact. Otherwise the positional arguments are joined into one
# string (legacy path), where the calling shell has already consumed inner
# double quotes.
commands="${VLLM_TEST_COMMANDS:-}"
if [[ -z "$commands" ]]; then
  commands="$*"
  if [[ -z "$commands" ]]; then
    printf '%s\n' \
      "Error: No test commands provided." \
      "Usage:" \
      " Preferred: VLLM_TEST_COMMANDS='...' bash $0" \
      " Legacy: bash $0 \"commands here\"" >&2
    exit 1
  fi
  printf '%s\n' \
    "Commands sourced from positional args (legacy mode)" \
    "WARNING: Inner double-quotes in the command string may have been" \
    " stripped by the calling shell. If you see syntax errors, switch to:" \
    " export VLLM_TEST_COMMANDS='your commands here'" \
    " bash $0"
else
  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
fi

echo "Raw commands: $commands"

# Marker re-quoting runs first so the ROCm overrides see the repaired
# command structure.
commands=$(re_quote_pytest_markers "$commands")
echo "After re-quoting: $commands"

commands=$(apply_rocm_test_overrides "$commands")
echo "Final commands: $commands"

# PYTHONPATH value handed to the container.
MYPYTHONPATH=".."

# Verify GPU access: the numeric id of the host's "render" group (third
# colon-separated field of the group entry) is required further below.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# --- RDMA device passthrough (conditional) ---
# If the host has RDMA devices, pass them through so tests like
# test_moriio_connector can access ibverbs. On hosts without RDMA
# hardware the tests will gracefully skip via _rdma_available().
# RDMA_FLAGS stays a plain string because it is expanded unquoted (word-split)
# on the docker command line.
RDMA_FLAGS=""
if [[ ! -d /dev/infiniband ]]; then
  echo "No RDMA devices found on host, RDMA tests will be skipped"
else
  echo "RDMA devices detected on host, enabling passthrough"
  RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
fi
|
|
|
|
# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
  echo "--- Multi-node job detected"
  # Assign first, export second, so a failing pipeline is not masked by the
  # always-zero status of `export`.
  DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
  export DCKR_VER

  # Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
  #   group 1 = prefix (everything before the first bracket)
  #   group 2 = comma-separated node0 commands
  #   group 3 = comma-separated node1 commands
  multi_node_re='^(.*)\[(.*)\] && \[(.*)\]$'
  if [[ "$commands" =~ $multi_node_re ]]; then
    # Copy the captures immediately; any later [[ =~ ]] overwrites BASH_REMATCH.
    prefix="${BASH_REMATCH[1]//;/}"
    node0_spec="${BASH_REMATCH[2]}"
    node1_spec="${BASH_REMATCH[3]}"
    # NOTE(review): the prefix is only logged, never executed - confirm that
    # is intended.
    echo "PREFIX: ${prefix}"

    export composite_command="(command rocm-smi || true)"
    # Scope IFS to each read instead of changing it for the whole script.
    IFS=',' read -ra node0 <<< "$node0_spec"
    IFS=',' read -ra node1 <<< "$node1_spec"

    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
    fi

    for i in "${!node0[@]}"; do
      # Drop double quotes from the per-node commands.
      command_node_0="${node0[i]//\"/}"
      command_node_1="${node1[i]//\"/}"

      # %q shell-quotes each argument for the `bash -c` below. Wrapping the
      # commands in literal single quotes breaks as soon as a command itself
      # contains a single quote, which re-quoted -m/-k expressions do.
      printf -v step_cmd \
        './.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 %q %q %q' \
        "${image_name}" "${command_node_0}" "${command_node_1}"
      echo "COMMANDS: ${step_cmd}"
      composite_command="${composite_command} && ${step_cmd}"
    done

    /bin/bash -c "${composite_command}"
    exit_code=$?
    cleanup_network
    handle_pytest_exit "$exit_code"
  else
    echo "Multi-node job detected but failed to parse bracket command syntax."
    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
    echo "Got: $commands"
    cleanup_network
    exit 111
  fi
else
  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  # BUILDKITE_AGENT_META_DATA_RENDER_DEVICES and RDMA_FLAGS each hold several
  # docker options in one string and are intentionally left unquoted so they
  # word-split into separate arguments.
  # shellcheck disable=SC2086
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    $RDMA_FLAGS \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -e BUILDKITE_PARALLEL_JOB \
    -e BUILDKITE_PARALLEL_JOB_COUNT \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"

  exit_code=$?
  handle_pytest_exit "$exit_code"
fi
|