#!/bin/bash
# Run the standalone CUDA FMHA test on the B200 using screen sessions.
# Follows the test harness pattern from README.md.
#
# Usage: bash tests/run_cuda_test.sh
#
# Optional environment overrides (the defaults match the original script):
#   B200       ssh target of the test machine
#   B200_PASS  ssh password for that machine
#
# Exit status: the exit status recorded for the test command (whatever
# `timeout 60 <test binary>` returned), or 1 when the test could not be
# launched or no result could be read back.
set -euo pipefail

readonly B200="${B200:-root@45.76.247.107}"
readonly REPO_DIR="/root/dsv4-nvfp4-workspace/kernel"
readonly REMOTE_LOG="/tmp/kernel-test.log"
# Consecutive failed ssh polls tolerated before giving up (10 s apart).
readonly MAX_POLL_FAILURES=6

# NOTE(review): StrictHostKeyChecking=no disables host-key verification.
# Kept because the original harness relies on it; confirm this is acceptable.
SSH_OPTS=(
  -o StrictHostKeyChecking=no
  -o ConnectTimeout=10
  -o ServerAliveInterval=15
  -o ServerAliveCountMax=4
)
readonly SSH_OPTS

# SECURITY: a credential is hard-coded here. It is kept only as a fallback so
# the documented no-argument usage keeps working; it should be rotated and
# supplied through B200_PASS (or replaced by key-based auth) instead.
readonly DEFAULT_B200_PASS='6)Jr)B@dcX[mN?dx'
# sshpass -e reads the password from SSHPASS, which keeps it out of argv
# (sshpass -p would expose it to anyone who can run `ps`).
export SSHPASS="${B200_PASS:-$DEFAULT_B200_PASS}"

# Run a command on the B200. Arguments are passed to ssh as the remote command;
# stdin is forwarded, so a script can be piped in.
remote() {
  sshpass -e ssh "${SSH_OPTS[@]}" "$B200" "$@"
}

# Clean up any previous run, pull, compile, and start the test inside a
# detached screen session on the B200. Fails if the compile fails or the test
# neither is running nor has already finished.
launch_remote_test() {
  local remote_cmd
  remote_cmd="bash -s -- $(printf '%q %q' "$REPO_DIR" "$REMOTE_LOG")"

  # Quoted delimiter: nothing below is expanded locally; it all runs remotely.
  remote "$remote_cmd" <<'REMOTE_SCRIPT'
set -eu
REPO_DIR=$1
LOG=$2
TEST_BIN=/tmp/test_fmha_sm100

# --- CLEANUP (same as run_test.sh) ---
if screen -list 2>/dev/null | grep -q kernel-test; then
  session_pid=$(screen -ls | grep kernel-test | grep -o '[0-9]*' | head -1)
  if [ -n "$session_pid" ]; then
    pkill -9 -P "$session_pid" 2>/dev/null || true
  fi
  screen -S kernel-test -X quit 2>/dev/null || true
fi
pkill -9 -f test_fmha 2>/dev/null || true
pkill -9 -f test_tmem 2>/dev/null || true
sleep 2

# Delete the old log and the old binary, so a failed build can never be
# followed by a run of a stale executable.
rm -f "$LOG" "$TEST_BIN"

# --- PULL ---
cd "$REPO_DIR"
# Best effort: discard local modifications and untracked files before pulling.
git checkout -- . 2>/dev/null || true
git clean -fd 2>/dev/null || true
git pull

# --- COMPILE + RUN in screen ---
export PATH=/usr/local/cuda-13.2/bin:$PATH

# Compile. pipefail is enabled only here so that nvcc's status is not hidden
# by tee; it stays off elsewhere because the `screen -list | grep` checks
# depend on grep's status alone.
set -o pipefail
if ! nvcc -std=c++20 -gencode=arch=compute_100a,code=sm_100a \
    -I"$REPO_DIR" \
    "$REPO_DIR/tests/unit/test_fmha_sm100_standalone.cu" \
    -o "$TEST_BIN" -lcudart 2>&1 | tee "$LOG"; then
  echo "FAIL: compilation failed; test not started" >&2
  exit 1
fi
set +o pipefail

echo "" >> "$LOG"
echo "=== Running test ===" >> "$LOG"

# Run in screen (survives SSH drops). The command string is single-quoted so
# that $? is expanded by the inner bash *after* the test has run, and the
# paths are handed over as positional parameters.
screen -dmS kernel-test bash -c \
  'timeout 60 "$1" >> "$2" 2>&1; echo "EXIT_CODE=$?" >> "$2"' \
  _ "$TEST_BIN" "$LOG"
sleep 3

if screen -list | grep -q kernel-test; then
  echo "OK: screen kernel-test is running"
elif grep -q '^EXIT_CODE=' "$LOG" 2>/dev/null; then
  # The session is gone because the test completed within the sleep above.
  echo "OK: test already finished"
else
  echo "FAIL: screen did not start. Log below:"
  cat "$LOG" 2>/dev/null
  exit 1
fi
REMOTE_SCRIPT
}

# Poll until the screen session has ended, print the remote log, and return
# the exit status recorded in it (1 if none was recorded or the host became
# unreachable).
wait_for_results() {
  local state log exit_line
  local failures=0

  while true; do
    # The remote command always exits 0, so a non-zero status here means ssh
    # itself failed; that must not be mistaken for "test finished".
    if state=$(remote "screen -list 2>/dev/null | grep -q kernel-test && echo running || echo done" 2>/dev/null); then
      failures=0
    else
      failures=$((failures + 1))
      if ((failures >= MAX_POLL_FAILURES)); then
        echo "FAIL: could not reach $B200 for $failures consecutive polls" >&2
        return 1
      fi
      state="running"
    fi

    if [[ "$state" != "running" ]]; then
      echo "=== Screen finished. Results: ==="
      log=$(remote "cat $(printf '%q' "$REMOTE_LOG")")
      printf '%s\n' "$log"

      # grep finding nothing is handled below, not treated as a script error.
      exit_line=$(grep -E '^EXIT_CODE=[0-9]+$' <<<"$log" | tail -n 1) || true
      if [[ -z "$exit_line" ]]; then
        echo "FAIL: no EXIT_CODE line in the log; test did not complete" >&2
        return 1
      fi
      return "${exit_line#EXIT_CODE=}"
    fi

    echo "  ...still running..."
    sleep 10
  done
}

main() {
  echo "=== Running CUDA test on B200 via screen harness ==="
  launch_remote_test
  echo "=== Test launched. Polling for results... ==="
  wait_for_results
}

main "$@"