/** * Minimal TMA descriptor creation test. * Tests various tile dimensions to find what cuTensorMapEncodeTiled accepts. */ #include #include #include typedef unsigned short bf16_t; int main() { // Initialize CUDA driver API — REQUIRED before any cuTensorMap calls cuInit(0); printf("=== TMA Descriptor Creation Test ===\n"); bf16_t* d_data; cudaMalloc(&d_data, 128 * 64 * sizeof(bf16_t)); // (128, 64) BF16 // Test 1: (128, 16) global, (16, 128) tile — same as working test { uint64_t gdim[] = {16, 128}; uint64_t gstr[] = {1, 16}; uint32_t tdim[] = {16, 128}; uint32_t tstr[] = {1, 16}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 1 (128x16 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } // Test 2: (64, 128) global, (16, 128) tile, tstr=[1,64] { uint64_t gdim[] = {64, 128}; uint64_t gstr[] = {1, 64}; uint32_t tdim[] = {16, 128}; uint32_t tstr[] = {1, 64}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 2 (128x64 global, 16x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } // Test 3: (64, 128) global, (16, 128) tile, tstr=[1,16] (tile-local stride) { uint64_t gdim[] = {64, 128}; uint64_t gstr[] = {1, 64}; uint32_t tdim[] = {16, 128}; uint32_t tstr[] = {1, 16}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 3 (128x64 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } // Test 4: (64, 128) global, (16, 128) tile — try element type UINT8 { uint64_t gdim[] = {64, 128}; uint64_t gstr[] = {1, 64}; uint32_t tdim[] = {16, 128}; uint32_t tstr[] = {1, 64}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT8, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 4 (UINT8, 128x64, 16x128, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } // Test 5: Same as working test but with global (64, 128) { uint64_t gdim[] = {64, 128}; uint64_t gstr[] = {1, 64}; uint32_t tdim[] = {64, 128}; uint32_t tstr[] = {1, 64}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 5 (128x64 global, 64x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } // Test 6: (64, 128) global, (32, 128) tile (tile cols = 32) { uint64_t gdim[] = {64, 128}; uint64_t gstr[] = {1, 64}; uint32_t tdim[] = {32, 128}; uint32_t tstr[] = {1, 64}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 6 (128x64 global, 32x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } // Test 7: (64, 128) global, (16, 128) tile, UINT16, tstr=[1,64], SWIZZLE_128B { uint64_t gdim[] = {64, 128}; uint64_t gstr[] = {1, 64}; uint32_t tdim[] = {16, 128}; uint32_t tstr[] = {1, 64}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 7 (128x64, 16x128, SW128): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } // Test 8: (16, 128) global, (16, 128) tile — EXACT working test dimensions { uint64_t gdim[] = {16, 128}; uint64_t gstr[] = {1, 16}; uint32_t tdim[] = {16, 128}; uint32_t tstr[] = {1, 16}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("Test 8 (128x16 global, 16x128 tile, tstr=[1,16] — exact working): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); } cudaFree(d_data); printf("Done.\n"); return 0; }