// NOTE(review): the header names on the original #include lines were lost;
// these three are reconstructed from the names the code uses — confirm.
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>

// Raw 16-bit storage used as a stand-in for a bfloat16 element.
typedef unsigned short bf16_t;

// Encodes one tiled tensor map with the given parameters and prints the
// driver's verdict as "<label>: result=<code> (<name>)".
//
// Interleave, swizzle, L2 promotion and OOB fill are all fixed to their NONE
// settings, so only the type, rank, extents and strides vary between probes.
//
// Returns the CUresult produced by cuTensorMapEncodeTiled.
static CUresult encodeAndReport(const char* label,
                                CUtensorMapDataType dataType,
                                cuuint32_t rank,
                                void* globalAddress,
                                const cuuint64_t* globalDim,
                                const cuuint64_t* globalStrides,
                                const cuuint32_t* boxDim,
                                const cuuint32_t* elementStrides)
{
    CUtensorMap tma;
    const CUresult r = cuTensorMapEncodeTiled(
        &tma, dataType, rank, globalAddress,
        globalDim, globalStrides, boxDim, elementStrides,
        CU_TENSOR_MAP_INTERLEAVE_NONE,
        CU_TENSOR_MAP_SWIZZLE_NONE,
        CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);

    // Print the symbolic name next to the number so the output is readable
    // without looking the code up in cuda.h.
    const char* name = nullptr;
    if (cuGetErrorName(r, &name) != CUDA_SUCCESS || name == nullptr) {
        name = "unknown";
    }
    printf("%s: result=%d (%s)\n", label, (int)r, name);
    return r;
}

// Probes cuTensorMapEncodeTiled with a handful of parameter combinations and
// prints the result of each. Returns 1 if the device buffer cannot be
// allocated, 0 otherwise (encode failures are reported, not fatal).
int main()
{
    printf("=== CUtensorMap parameter debug (fixed strides) ===\n");

    const cuuint64_t kCols = 16;                              // innermost extent
    const cuuint64_t kRows = 128;                             // outermost extent
    const cuuint64_t kRowBytes = kCols * sizeof(bf16_t);      // 32
    const size_t kTotalBytes = (size_t)(kRows * kRowBytes);   // 4096

    // The runtime allocation is performed before any driver-API call, as in
    // the original program.
    bf16_t* d_data = nullptr;
    const cudaError_t allocErr = cudaMalloc(&d_data, kTotalBytes);
    if (allocErr != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(allocErr));
        return 1;
    }

    // globalStrides has tensorRank-1 elements (innermost stride is implicit).
    // elementStrides must be non-zero and <= 8 per dimension; the earlier
    // value of 16 for dimension 1 is outside that range, so a dense
    // traversal ({1, 1}) is used instead.

    // Test 1: BF16 2D, gstr=[row_stride_bytes]
    {
        cuuint64_t gdim[] = {kCols, kRows};
        cuuint64_t gstr[] = {kRowBytes};  // 1 stride: row stride = 16 cols * 2 bytes
        cuuint32_t tdim[] = {16, 128};
        cuuint32_t tstr[] = {1, 1};
        encodeAndReport("BF16 (16,128) gstr=[32]",
                        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, d_data,
                        gdim, gstr, tdim, tstr);
    }

    // Test 2: UINT16 2D
    {
        cuuint64_t gdim[] = {kCols, kRows};
        cuuint64_t gstr[] = {kRowBytes};
        cuuint32_t tdim[] = {16, 128};
        cuuint32_t tstr[] = {1, 1};
        encodeAndReport("UINT16 (16,128) gstr=[32]",
                        CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
                        gdim, gstr, tdim, tstr);
    }

    // Test 3: Smaller tile (16, 8)
    {
        cuuint64_t gdim[] = {kCols, kRows};
        cuuint64_t gstr[] = {kRowBytes};
        cuuint32_t tdim[] = {16, 8};
        cuuint32_t tstr[] = {1, 1};
        encodeAndReport("BF16 tile(16,8)",
                        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, d_data,
                        gdim, gstr, tdim, tstr);
    }

    // Test 4: 1D BF16 (as byte array)
    {
        cuuint64_t gdim[] = {kTotalBytes};  // whole buffer viewed as bytes
        // NOTE(review): a rank-1 tensor has tensorRank-1 = 0 stride entries,
        // so this value is presumably never read; kept as a non-null
        // placeholder with its original value — confirm.
        cuuint64_t gstr[] = {2};
        cuuint32_t tdim[] = {256};
        cuuint32_t tstr[] = {1};
        encodeAndReport("UINT8 1D",
                        CU_TENSOR_MAP_DATA_TYPE_UINT8, 1, d_data,
                        gdim, gstr, tdim, tstr);
    }

    const cudaError_t freeErr = cudaFree(d_data);
    if (freeErr != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(freeErr));
    }
    return 0;
}