debug: add TMA descriptor error reporting
This commit is contained in:
@@ -145,6 +145,11 @@ inline bool create_tma_desc_2d_bf16(
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
|
||||
);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
fprintf(stderr, "cuTensorMapEncodeTiled failed: error=%d, gdim=[%lu,%lu], gstr=[%lu,%lu], tdim=[%u,%u], tstr=[%u,%u]\n",
|
||||
(int)res, global_dim[0], global_dim[1], global_str[0], global_str[1],
|
||||
tile_dim[0], tile_dim[1], tile_str[0], tile_str[1]);
|
||||
}
|
||||
return res == CUDA_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -159,8 +159,10 @@ struct TmaDescSet {
|
||||
// The data in GMEM starts at d_q, shape (T, HD), stride (HD, 1).
|
||||
// The descriptor declares it as (128, HD) — rows beyond T hold garbage, which the kernel ignores.
|
||||
uint32_t q_tile_rows = 128;
|
||||
if (!create_tma_desc_2d_bf16(&tma_q, d_q, 128, (uint64_t)hd, q_tile_rows, (uint32_t)hd)) {
|
||||
printf(" Failed to create Q TMA desc\n"); return false;
|
||||
CUresult q_res = create_tma_desc_2d_bf16(&tma_q, d_q, 128, (uint64_t)hd, q_tile_rows, (uint32_t)hd);
|
||||
if (q_res != true) {
|
||||
printf(" Failed to create Q TMA desc: rows=128, cols=%d, tile_rows=128, tile_cols=%d\n", hd, hd);
|
||||
return false;
|
||||
}
|
||||
|
||||
// K: (s_k, HD) — TMA tile = (16, s_k) to load one K sub-tile at a time
|
||||
|
||||
Reference in New Issue
Block a user