diff --git a/dsv4/kernels/attention/fmha_tma.cuh b/dsv4/kernels/attention/fmha_tma.cuh
index 72af54ba..fa8589e3 100644
--- a/dsv4/kernels/attention/fmha_tma.cuh
+++ b/dsv4/kernels/attention/fmha_tma.cuh
@@ -145,6 +145,15 @@ inline bool create_tma_desc_2d_bf16(
         CU_TENSOR_MAP_L2_PROMOTION_NONE,
         CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
     );
+    if (res != CUDA_SUCCESS) {
+        // Dump every argument handed to the encoder so a rejected descriptor can be diagnosed.
+        // Values are cast to fixed printf types so the specifiers match on every platform.
+        fprintf(stderr, "cuTensorMapEncodeTiled failed: error=%d, gdim=[%llu,%llu], gstr=[%llu,%llu], tdim=[%u,%u], tstr=[%u,%u]\n",
+                (int)res,
+                (unsigned long long)global_dim[0], (unsigned long long)global_dim[1],
+                (unsigned long long)global_str[0], (unsigned long long)global_str[1],
+                (unsigned)tile_dim[0], (unsigned)tile_dim[1], (unsigned)tile_str[0], (unsigned)tile_str[1]);
+    }
     return res == CUDA_SUCCESS;
 }
 
diff --git a/tests/unit/test_fmha_tma.cu b/tests/unit/test_fmha_tma.cu
index 7eb49fee..43a4b3d8 100644
--- a/tests/unit/test_fmha_tma.cu
+++ b/tests/unit/test_fmha_tma.cu
@@ -159,8 +159,12 @@ struct TmaDescSet {
         // The data in GMEM starts at d_q, shape (T, HD), stride (HD, 1).
         // We treat it as (128, HD) — rows beyond T are garbage, kernel ignores them.
         uint32_t q_tile_rows = 128;
-        if (!create_tma_desc_2d_bf16(&tma_q, d_q, 128, (uint64_t)hd, q_tile_rows, (uint32_t)hd)) {
-            printf(" Failed to create Q TMA desc\n"); return false;
+        // create_tma_desc_2d_bf16 returns bool (true on success), not a CUresult.
+        const bool q_ok = create_tma_desc_2d_bf16(&tma_q, d_q, 128, (uint64_t)hd, q_tile_rows, (uint32_t)hd);
+        if (!q_ok) {
+            // Report the requested shape alongside the helper's own stderr diagnostics.
+            printf(" Failed to create Q TMA desc: rows=128, cols=%d, tile_rows=128, tile_cols=%d\n", hd, hd);
+            return false;
         }
 
         // K: (s_k, HD) — TMA tile = (16, s_k) to load one K sub-tile at a time