Deadlocked GPU processes ignore SIGHUP from screen -X quit. Now kills the entire process group with SIGKILL, plus a catch-all pkill for any python test_ processes.
52 lines
1.6 KiB
Bash
Executable File
52 lines
1.6 KiB
Bash
Executable File
#!/bin/bash
#
# Test harness: runs a test in a detached screen session and logs its output
# to /tmp/kernel-test.log.
#
# Usage:   ./run_test.sh <test_file>
# Example: ./run_test.sh tests/unit/test_fmha_v3.py

# Abort on the first unhandled command failure.
set -e

# Path of the test to run. A missing argument is treated as empty so the
# check below reports it with the usage text.
TEST_FILE=${1:-}
if [[ -z "$TEST_FILE" ]]; then
  # A usage error is a diagnostic, so it goes to stderr.
  echo "Usage: $0 <test_file>" >&2
  exit 1
fi

# --- CLEANUP ---
# Tear down any previous kernel-test screen session and every process it
# started. Deadlocked GPU processes ignore the SIGHUP that `screen -X quit`
# delivers, so everything here uses SIGKILL. Every kill is best-effort:
# "no such process" is expected and must not abort the script under `set -e`.

# Print the PID of every screen session named exactly "kernel-test".
# `screen -ls` lists sessions as "<pid>.<name>"; matching the whole field
# keeps look-alike sessions (e.g. "kernel-test-2") out of the kill list.
kernel_test_session_pids() {
  screen -ls 2>/dev/null \
    | awk '$1 ~ /^[0-9]+\.kernel-test$/ { sub(/\..*$/, "", $1); print $1 }'
}

# SIGKILL every process whose command line matches the catch-all python test
# pattern, except this script and its parent: their command lines carry the
# test path and can match the pattern too (e.g. ./run_test.sh python/test_x.py).
kill_python_tests() {
  local pid
  # PIDs are plain integers, so word-splitting the pgrep output is intended.
  for pid in $(pgrep -f "python.*test_" 2>/dev/null || true); do
    if [[ "$pid" != "$$" && "$pid" != "$PPID" ]]; then
      kill -9 "$pid" 2>/dev/null || true
    fi
  done
}

stale_sessions=$(kernel_test_session_pids)
if [[ -n "$stale_sessions" ]]; then
  echo "Killing existing kernel-test screen and children..."
  for session_pid in $stale_sessions; do
    # screen is expected to start each window's command as the leader of its
    # own process group, so signalling -<child pid> reaches that command and
    # everything it spawned rather than only screen's direct children.
    for child_pid in $(pgrep -P "$session_pid" 2>/dev/null || true); do
      kill -9 -- "-$child_pid" 2>/dev/null || true
    done
    # Fallback for any direct child that is not a process-group leader.
    pkill -9 -P "$session_pid" 2>/dev/null || true
  done
  # Also nuke any python test process just in case.
  kill_python_tests
  screen -S kernel-test -X quit 2>/dev/null || true
  sleep 2
fi

# Belt and suspenders: kill any leftover python test processes.
kill_python_tests
sleep 1

# The catch-all pattern can also match the screen daemon itself (its command
# line embeds "python -u <test_file>"). A SIGKILLed daemon leaves a dead
# socket that `screen -list` keeps reporting, so remove dead sessions.
screen -wipe >/dev/null 2>&1 || true

rm -f -- /tmp/kernel-test.log

# --- START ---
# Run from the kernel tree inside the project virtualenv; a relative
# <test_file> is therefore resolved against the kernel directory.
cd /root/dsv4-nvfp4-workspace/kernel || exit 1
source /root/dsv4-nvfp4-workspace/venv/bin/activate
export PYTHONPATH=/root/dsv4-nvfp4-workspace/kernel

echo "Running: python -u $TEST_FILE"
echo "Log: /tmp/kernel-test.log"

# The test path is handed to the inner shell as "$1" instead of being pasted
# into the command string, so spaces or shell metacharacters in it can neither
# split the argument nor inject commands.
screen -dmS kernel-test \
  bash -c 'python -u "$1" > /tmp/kernel-test.log 2>&1' run_test "$TEST_FILE"

# Give the session a moment to come up (or to die on an immediate failure).
sleep 2

# Only a live session named exactly "kernel-test" counts as started; dead
# sockets and look-alike names are ignored. awk reads the whole listing, so
# the pipeline result does not depend on screen being cut off mid-output.
if screen -list \
  | awk '$1 ~ /^[0-9]+\.kernel-test$/ && $0 !~ /Dead/ { live = 1 }
         END { exit !live }'; then
  echo "OK: screen kernel-test is running"
else
  echo "FAIL: screen did not start. Log below:"
  # The log may not exist if the session never got as far as the redirect.
  cat /tmp/kernel-test.log 2>/dev/null
  exit 1
fi