From 0ad35f8be62d30e999ad8b5bd14bee2c4baa83b5 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 30 May 2026 04:40:06 +0000 Subject: [PATCH] debug: add prints to multirow multitile test --- tests/unit/test_fmha_6warp_tma_multirow_multitile.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/test_fmha_6warp_tma_multirow_multitile.cu b/tests/unit/test_fmha_6warp_tma_multirow_multitile.cu index db4d5bc6..e1341a2d 100644 --- a/tests/unit/test_fmha_6warp_tma_multirow_multitile.cu +++ b/tests/unit/test_fmha_6warp_tma_multirow_multitile.cu @@ -149,11 +149,17 @@ static int test_single(int T, int s_k, int n_h = 1, int batch = 1) { dim3 grid(1, n_h, batch); fmha_6warp_tma_multirow_multitile_kernel<<>>(params); + cudaError_t lerr = cudaGetLastError(); + if (lerr != cudaSuccess) { + printf(" LAUNCH ERROR: %s\n", cudaGetErrorString(lerr)); + return 1; + } cudaError_t err = cudaDeviceSynchronize(); if (err != cudaSuccess) { printf(" CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; } + printf(" Kernel completed OK.\n"); cudaMemcpy(h_o, d_o, total_heads * MAX_T * HD * sizeof(bf16_t), cudaMemcpyDeviceToHost); cudaMemcpy(h_lse, d_lse, total_heads * MAX_T * sizeof(float), cudaMemcpyDeviceToHost); @@ -189,6 +195,8 @@ static int test_single(int T, int s_k, int n_h = 1, int batch = 1) { } int main() { + printf("START: test_fmha_6warp_tma_multirow_multitile HD=%d\n", HD); + fflush(stdout); int total_fail = 0; printf("\n=== 6-warp TMA FMHA multi-row multi-tile HD=%d ===\n", HD);