add CUDA test runner script (screen-based, follows harness pattern)
This commit is contained in:
75
tests/run_cuda_test.sh
Executable file
75
tests/run_cuda_test.sh
Executable file
@@ -0,0 +1,75 @@
|
||||
#!/bin/bash
# Run the standalone CUDA FMHA test on the B200 using screen sessions.
# Follows the test harness pattern from README.md.
#
# Usage: SSHPASS='<password>' bash tests/run_cuda_test.sh
#
# Environment:
#   SSHPASS            (required) SSH password for the B200 host. It is read
#                      by `sshpass -e`, so it never appears in this file or in
#                      the local process list.
#   POLL_TIMEOUT_SECS  (optional, default 300) how long to wait for the remote
#                      screen session to finish before giving up.
#
# Exit status: the exit code recorded by the remote test run (e.g. 124 when
# `timeout` kills it); 1 on compile/launch/polling failure; 2 when SSHPASS
# is missing.
set -euo pipefail

readonly B200="root@45.76.247.107"
# NOTE(review): StrictHostKeyChecking=no skips host-key verification; kept
# as-is to preserve the existing connection behaviour.
readonly SSH_OPTS=(
  -o StrictHostKeyChecking=no
  -o ConnectTimeout=10
  -o ServerAliveInterval=15
  -o ServerAliveCountMax=4
)
readonly POLL_INTERVAL_SECS=10
readonly POLL_TIMEOUT_SECS="${POLL_TIMEOUT_SECS:-300}"

# The password used to be hard-coded in this script. It must now come from the
# environment; the previously committed value should be considered leaked and
# rotated.
if [[ -z "${SSHPASS:-}" ]]; then
  echo "error: SSHPASS must be set to the SSH password for ${B200}" >&2
  exit 2
fi
export SSHPASS

#######################################
# Run a command on the B200 over password-authenticated SSH.
# Globals:   B200, SSH_OPTS (read); SSHPASS (read by sshpass -e)
# Arguments: the remote command and its arguments; stdin is forwarded
# Returns:   the exit status reported by sshpass/ssh
#######################################
remote() {
  sshpass -e ssh "${SSH_OPTS[@]}" "$B200" "$@"
}

echo "=== Running CUDA test on B200 via screen harness ==="

# The heredoc delimiter is quoted, so nothing below is expanded locally; every
# $variable in the script is evaluated by the remote bash.
remote bash -s <<'REMOTE_SCRIPT'
set -e
REPO_DIR="/root/dsv4-nvfp4-workspace/kernel"
LOG="/tmp/kernel-test.log"
BIN="/tmp/test_fmha_sm100"

# --- CLEANUP (same as run_test.sh) ---
if screen -list 2>/dev/null | grep -q kernel-test; then
  # First number on the matching `screen -ls` line is taken as the session PID.
  session_pid=$(screen -ls | grep kernel-test | grep -oE '[0-9]+' | head -n 1)
  if [ -n "$session_pid" ]; then
    pkill -9 -P "$session_pid" 2>/dev/null || true
  fi
  screen -S kernel-test -X quit 2>/dev/null || true
fi
pkill -9 -f test_fmha 2>/dev/null || true
pkill -9 -f test_tmem 2>/dev/null || true
sleep 2

# Delete the old log, and the old binary so a failed build can never fall
# through to running a stale executable.
rm -f "$LOG" "$BIN"

# --- PULL ---
cd "$REPO_DIR"
git checkout -- . 2>/dev/null || true
git clean -fd 2>/dev/null || true
git pull

# --- COMPILE + RUN in screen ---
export PATH="/usr/local/cuda-13.2/bin:$PATH"

# Compile. The pipeline's own status is tee's, so nvcc's status is read from
# PIPESTATUS to make a compile error abort the run.
nvcc -std=c++20 -gencode=arch=compute_100a,code=sm_100a \
  -I"$REPO_DIR" \
  "$REPO_DIR/tests/unit/test_fmha_sm100_standalone.cu" \
  -o "$BIN" -lcudart 2>&1 | tee "$LOG"
compile_status=${PIPESTATUS[0]}
if [ "$compile_status" -ne 0 ]; then
  echo "FAIL: nvcc exited with status $compile_status"
  exit 1
fi

printf '\n=== Running test ===\n' >> "$LOG"

# Run in screen (survives SSH drops). The command string is single-quoted so
# that $? is expanded inside the screen session, after the test has run,
# rather than by this launching shell.
screen -dmS kernel-test bash -c \
  'timeout 60 "$1" >> "$2" 2>&1; echo "EXIT_CODE=$?" >> "$2"' _ "$BIN" "$LOG"
sleep 3

if screen -list | grep -q kernel-test; then
  echo "OK: screen kernel-test is running"
elif grep -q '^EXIT_CODE=' "$LOG" 2>/dev/null; then
  # The session is gone because the test already completed within the sleep.
  echo "OK: screen kernel-test already finished"
else
  echo "FAIL: screen did not start. Log below:"
  cat "$LOG" 2>/dev/null || true
  exit 1
fi
REMOTE_SCRIPT

echo "=== Test launched. Polling for results... ==="
deadline=$(( SECONDS + POLL_TIMEOUT_SECS ))
while (( SECONDS < deadline )); do
  # The remote command always exits 0 (it ends in an echo), so a non-zero
  # status here means the SSH connection itself failed, not that the test
  # is finished.
  if ! state=$(remote "screen -list 2>/dev/null | grep -q kernel-test && echo running || echo done" 2>/dev/null); then
    state="unreachable"
  fi

  case "$state" in
    done)
      echo "=== Screen finished. Results: ==="
      log_text=$(remote "cat /tmp/kernel-test.log")
      printf '%s\n' "$log_text"
      # Propagate the status the screen session recorded for the test binary.
      exit_line=$(grep -E '^EXIT_CODE=[0-9]+$' <<<"$log_text" | tail -n 1) || true
      if [[ -z "$exit_line" ]]; then
        echo "FAIL: no EXIT_CODE line found in /tmp/kernel-test.log" >&2
        exit 1
      fi
      exit "${exit_line#EXIT_CODE=}"
      ;;
    running)
      echo "  ...still running..."
      ;;
    *)
      echo "  ...could not query ${B200}; retrying..." >&2
      ;;
  esac
  sleep "$POLL_INTERVAL_SECS"
done

echo "FAIL: timed out after ${POLL_TIMEOUT_SECS}s waiting for kernel-test" >&2
exit 1
|
||||
Reference in New Issue
Block a user