diff --git a/dsv4/kernels/attention/fmha_tma.cuh b/dsv4/kernels/attention/fmha_tma.cuh
index 72af54ba..fa8589e3 100644
--- a/dsv4/kernels/attention/fmha_tma.cuh
+++ b/dsv4/kernels/attention/fmha_tma.cuh
@@ -145,6 +145,11 @@ inline bool create_tma_desc_2d_bf16(
         CU_TENSOR_MAP_L2_PROMOTION_NONE,
         CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
     );
+    if (res != CUDA_SUCCESS) {  // on failure, log the raw error code together with the global/tile dims and strides
+        fprintf(stderr, "cuTensorMapEncodeTiled failed: error=%d, gdim=[%lu,%lu], gstr=[%lu,%lu], tdim=[%u,%u], tstr=[%u,%u]\n",
+            (int)res, global_dim[0], global_dim[1], global_str[0], global_str[1],  // NOTE(review): %lu assumes these elements are unsigned long (LP64) — confirm their declared type
+            tile_dim[0], tile_dim[1], tile_str[0], tile_str[1]);  // NOTE(review): %u assumes 32-bit unsigned elements — confirm
+    }  // diagnostics only: the bool result below is still what callers see
     return res == CUDA_SUCCESS;
 }
 
diff --git a/tests/unit/test_fmha_tma.cu b/tests/unit/test_fmha_tma.cu
index 7eb49fee..43a4b3d8 100644
--- a/tests/unit/test_fmha_tma.cu
+++ b/tests/unit/test_fmha_tma.cu
@@ -159,8 +159,10 @@ struct TmaDescSet {
         // The data in GMEM starts at d_q, shape (T, HD), stride (HD, 1).
         // We treat it as (128, HD) — rows beyond T are garbage, kernel ignores them.
         uint32_t q_tile_rows = 128;
-        if (!create_tma_desc_2d_bf16(&tma_q, d_q, 128, (uint64_t)hd, q_tile_rows, (uint32_t)hd)) {
-            printf("  Failed to create Q TMA desc\n"); return false;
+        const bool q_ok = create_tma_desc_2d_bf16(&tma_q, d_q, 128, (uint64_t)hd, q_tile_rows, (uint32_t)hd);  // helper returns bool (true on success), not a CUresult
+        if (!q_ok) {  // report the requested Q shape and tile so a failed descriptor can be diagnosed
+            printf("  Failed to create Q TMA desc: rows=128, cols=%d, tile_rows=128, tile_cols=%d\n", hd, hd);
+            return false;
         }
 
         // K: (s_k, HD) — TMA tile = (16, s_k) to load one K sub-tile at a time