add CUDA test runner script (screen-based, follows harness pattern)

This commit is contained in:
2026-05-28 07:31:41 +00:00
parent 2eb44a00bf
commit cec505ce14

75
tests/run_cuda_test.sh Executable file
View File

@@ -0,0 +1,75 @@
#!/bin/bash
# Run the standalone CUDA FMHA test on the B200 using screen sessions.
# Follows the test harness pattern from README.md.
#
# Usage: B200_PASS='<ssh password>' bash tests/run_cuda_test.sh
#
# Environment:
#   B200_PASS  SSH password for the target host (required; SSHPASS is accepted
#              as a fallback).
#   B200       Optional user@host override for the target machine.
set -euo pipefail

# Target host; overridable so the script is not tied to a single machine.
B200="${B200:-root@45.76.247.107}"

# Kept as a plain string on purpose: the ssh invocations below expand it
# unquoted so that it word-splits into individual -o options.
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4"

# The credential must come from the environment; it must never be committed.
# NOTE(review): a password was previously hard-coded on this line and is
# therefore in the repository history -- it should be rotated.
PASS="${B200_PASS:-${SSHPASS:-}}"
if [ -z "$PASS" ]; then
  echo "ERROR: set B200_PASS (or SSHPASS) to the SSH password for $B200" >&2
  exit 2
fi

# Fail early with a clear message instead of a 'command not found' mid-run.
if ! command -v sshpass >/dev/null 2>&1; then
  echo "ERROR: sshpass is required but was not found in PATH" >&2
  exit 2
fi

# REPO_DIR is declared again inside the remote heredoc, because that heredoc
# is quoted and local variables are not expanded into it.
REPO_DIR="/root/dsv4-nvfp4-workspace/kernel"
VENV="/root/dsv4-nvfp4-workspace/venv/bin/activate"

echo "=== Running CUDA test on B200 via screen harness ==="
# Remote phase: clean up any previous run, update the checkout, compile the
# test, and launch it inside a detached screen session named kernel-test.
# The heredoc delimiter is quoted, so nothing below is expanded locally; the
# whole text is interpreted by the bash started on the remote host.
sshpass -p "$PASS" ssh $SSH_OPTS $B200 bash -s <<'REMOTE_SCRIPT'
set -e
REPO_DIR="/root/dsv4-nvfp4-workspace/kernel"

# --- CLEANUP (same as run_test.sh) ---
if screen -list 2>/dev/null | grep -q kernel-test; then
  # The first run of digits on the matching line is used as the session PID.
  session_pid=$(screen -ls | grep kernel-test | grep -o '[0-9]*' | head -1)
  if [ -n "$session_pid" ]; then
    pkill -9 -P "$session_pid" 2>/dev/null || true
  fi
  screen -S kernel-test -X quit 2>/dev/null || true
fi
pkill -9 -f test_fmha 2>/dev/null || true
pkill -9 -f test_tmem 2>/dev/null || true
sleep 2

# Delete the old log AND the old binary, so that a failed compile can never
# be followed by a run of a stale executable from a previous invocation.
rm -f /tmp/kernel-test.log /tmp/test_fmha_sm100

# --- PULL ---
cd "$REPO_DIR"
git checkout -- . 2>/dev/null || true
git clean -fd 2>/dev/null || true
git pull

# --- COMPILE + RUN in screen ---
export PATH=/usr/local/cuda-13.2/bin:$PATH

# Compile. The pipeline's own status is tee's, so nvcc's status is read from
# PIPESTATUS immediately afterwards; otherwise set -e would never see a
# compiler failure.
nvcc -std=c++20 -gencode=arch=compute_100a,code=sm_100a \
  -I"$REPO_DIR" \
  "$REPO_DIR/tests/unit/test_fmha_sm100_standalone.cu" \
  -o /tmp/test_fmha_sm100 -lcudart 2>&1 | tee /tmp/kernel-test.log
compile_rc=${PIPESTATUS[0]}
if [ "$compile_rc" -ne 0 ]; then
  echo "FAIL: nvcc exited with status $compile_rc (output above)"
  exit 1
fi

echo "" >> /tmp/kernel-test.log
echo "=== Running test ===" >> /tmp/kernel-test.log

# Run in screen (survives SSH drops). The command string is single-quoted so
# that $? is evaluated by the inner shell AFTER the test has run; in double
# quotes it would be expanded here, before the test even starts.
screen -dmS kernel-test bash -c 'timeout 60 /tmp/test_fmha_sm100 >> /tmp/kernel-test.log 2>&1; echo "EXIT_CODE=$?" >> /tmp/kernel-test.log'
sleep 3

if screen -list | grep -q kernel-test; then
  echo "OK: screen kernel-test is running"
elif grep -q '^EXIT_CODE=' /tmp/kernel-test.log 2>/dev/null; then
  # The session is already gone but left its exit marker: the test simply
  # finished within the sleep above. That is not a launch failure.
  echo "OK: test already finished inside screen"
else
  echo "FAIL: screen did not start. Log below:"
  cat /tmp/kernel-test.log 2>/dev/null
  exit 1
fi
REMOTE_SCRIPT
echo "=== Test launched. Polling for results... ==="

# Poll the remote host until the kernel-test screen session is gone, then
# print the log and exit with the status recorded on its EXIT_CODE= line.
#
# A failed SSH poll is NOT treated as "test finished": the test runs inside
# screen precisely so that it survives connection drops. Polls are retried,
# and the script gives up only after several consecutive failures.
max_ssh_failures=6
ssh_failures=0
while true; do
  # When the connection works the remote command always exits 0, so a
  # non-zero status here means the SSH/sshpass layer itself failed.
  if RESULT=$(sshpass -p "$PASS" ssh $SSH_OPTS $B200 \
    "screen -list 2>/dev/null | grep -q kernel-test && echo running || echo done" \
    2>/dev/null); then
    ssh_failures=0
  else
    ssh_failures=$((ssh_failures + 1))
    if [ "$ssh_failures" -ge "$max_ssh_failures" ]; then
      echo "FAIL: $ssh_failures consecutive SSH failures while polling $B200" >&2
      exit 1
    fi
    echo "  ...SSH poll failed ($ssh_failures/$max_ssh_failures), retrying..."
    sleep 10
    continue
  fi

  if [ "$RESULT" != "running" ]; then
    echo "=== Screen finished. Results: ==="
    if ! TEST_LOG=$(sshpass -p "$PASS" ssh $SSH_OPTS $B200 "cat /tmp/kernel-test.log"); then
      echo "FAIL: could not fetch /tmp/kernel-test.log from $B200" >&2
      exit 1
    fi
    printf '%s\n' "$TEST_LOG"

    # Propagate the test's own exit status instead of always reporting success.
    test_rc=$(printf '%s\n' "$TEST_LOG" \
      | sed -n 's/^EXIT_CODE=\([0-9][0-9]*\)$/\1/p' \
      | tail -n 1)
    if [ -z "$test_rc" ]; then
      echo "FAIL: no EXIT_CODE marker found in the log" >&2
      exit 1
    fi
    exit "$test_rc"
  fi

  echo "  ...still running..."
  sleep 10
done