diff --git a/tests/unit/test_tma_desc_debug.cu b/tests/unit/test_tma_desc_debug.cu new file mode 100644 index 00000000..db16f449 --- /dev/null +++ b/tests/unit/test_tma_desc_debug.cu @@ -0,0 +1,125 @@ +/** + * Minimal TMA descriptor creation test. + * Tests various tile dimensions to find what cuTensorMapEncodeTiled accepts. + */ + +#include +#include +#include + +typedef unsigned short bf16_t; + +int main() { + printf("=== TMA Descriptor Creation Test ===\n"); + + bf16_t* d_data; + cudaMalloc(&d_data, 128 * 64 * sizeof(bf16_t)); // (128, 64) BF16 + + // Test 1: (128, 16) global, (16, 128) tile — same as working test + { + uint64_t gdim[] = {16, 128}; + uint64_t gstr[] = {1, 16}; + uint32_t tdim[] = {16, 128}; + uint32_t tstr[] = {1, 16}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 1 (128x16 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + // Test 2: (64, 128) global, (16, 128) tile, tstr=[1,64] + { + uint64_t gdim[] = {64, 128}; + uint64_t gstr[] = {1, 64}; + uint32_t tdim[] = {16, 128}; + uint32_t tstr[] = {1, 64}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 2 (128x64 global, 16x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + // Test 3: (64, 128) global, (16, 128) tile, tstr=[1,16] (tile-local stride) + { + uint64_t gdim[] = {64, 128}; + uint64_t gstr[] = {1, 64}; + uint32_t tdim[] = {16, 128}; + uint32_t tstr[] = {1, 16}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 3 (128x64 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + // Test 4: (64, 128) global, (16, 128) tile — try element type UINT8 + { + uint64_t gdim[] = {64, 128}; + uint64_t gstr[] = {1, 64}; + uint32_t tdim[] = {16, 128}; + uint32_t tstr[] = {1, 64}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT8, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 4 (UINT8, 128x64, 16x128, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + // Test 5: Same as working test but with global (64, 128) + { + uint64_t gdim[] = {64, 128}; + uint64_t gstr[] = {1, 64}; + uint32_t tdim[] = {64, 128}; + uint32_t tstr[] = {1, 64}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 5 (128x64 global, 64x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + // Test 6: (64, 128) global, (32, 128) tile (tile cols = 32) + { + uint64_t gdim[] = {64, 128}; + uint64_t gstr[] = {1, 64}; + uint32_t tdim[] = {32, 128}; + uint32_t tstr[] = {1, 64}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 6 (128x64 global, 32x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + // Test 7: (64, 128) global, (16, 128) tile, UINT16, tstr=[1,64], SWIZZLE_128B + { + uint64_t gdim[] = {64, 128}; + uint64_t gstr[] = {1, 64}; + uint32_t tdim[] = {16, 128}; + uint32_t tstr[] = {1, 64}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 7 (128x64, 16x128, SW128): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + // Test 8: (16, 128) global, (16, 128) tile — EXACT working test dimensions + { + uint64_t gdim[] = {16, 128}; + uint64_t gstr[] = {1, 16}; + uint32_t tdim[] = {16, 128}; + uint32_t tstr[] = {1, 16}; + CUtensorMap desc; + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("Test 8 (128x16 global, 16x128 tile, tstr=[1,16] — exact working): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + } + + cudaFree(d_data); + printf("Done.\n"); + return 0; +}