diff --git a/tests/unit/test_p4_tma_descriptor_dump.cu b/tests/unit/test_p4_tma_descriptor_dump.cu index c8ab4c23..f07cdcfe 100644 --- a/tests/unit/test_p4_tma_descriptor_dump.cu +++ b/tests/unit/test_p4_tma_descriptor_dump.cu @@ -2,20 +2,17 @@ * P4: Dump TMA descriptor bytes for comparison. * CUDA 13.2 compatible. * - * API signature on CUDA 13.2: - * cuTensorMapEncodeTiled(CUtensorMap*, cuuint32_t rank, - * CUtensorMapDataType, void*, - * cuuint64_t* tensorDims, cuuint64_t* globalStrides, - * cuuint32_t* boxDims, cuuint32_t* elementStrides, - * CUtensorMapInterleave, CUtensorMapSwizzle, CUtensorMapL2promotion) - * - * Note: NO OOB fill parameter on CUDA 13.2 (10 params, not 11). + * Signature: cuTensorMapEncodeTiled( + * CUtensorMap*, CUtensorMapDataType, cuuint32_t tensorRank, + * void*, cuuint64_t* globalDim, cuuint64_t* globalStrides, + * cuuint32_t* boxDim, cuuint32_t* elementStrides, + * CUtensorMapInterleave, CUtensorMapSwizzle, + * CUtensorMapL2promotion, CUtensorMapFloatOOBfill) */ #include #include #include #include -#include int main() { const int ROWS = 128; @@ -26,15 +23,19 @@ int main() { cudaMalloc(&d_ptr, SIZE); cudaMemset(d_ptr, 0, SIZE); - cuuint64_t tensorDims[] = {(cuuint64_t)ROWS, (cuuint64_t)COLS}; + // globalDim: tensor dimensions (ROWS, COLS) in elements + cuuint64_t globalDim[] = {(cuuint64_t)ROWS, (cuuint64_t)COLS}; + // globalStrides: byte strides between rows and between elements cuuint64_t globalStrides[] = {(cuuint64_t)(COLS * 2), (cuuint64_t)2}; - cuuint32_t boxDims[] = {16, 16}; + // boxDim: TMA tile dimensions (16, 16) + cuuint32_t boxDim[] = {16, 16}; + // elementStrides: (1, 1) = contiguous cuuint32_t elementStrides[] = {1, 1}; CUtensorMap tma_desc; CUresult res; - auto dump_desc = [](const char* label, const CUtensorMap& desc) { + auto dump = [](const char* label, const CUtensorMap& desc) { printf("=== %s ===\n", label); auto* b = reinterpret_cast(&desc); for (int i = 0; i < 128; i += 16) { @@ -44,21 +45,41 @@ int main() { } }; - // 1: NO swizzle - res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, - d_ptr, tensorDims, globalStrides, boxDims, elementStrides, + // 1: NO swizzle, OOB_NONE + res = cuTensorMapEncodeTiled(&tma_desc, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, + d_ptr, globalDim, globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, - CU_TENSOR_MAP_L2_PROMOTION_NONE); - if (res == CUDA_SUCCESS) dump_desc("NO swizzle", tma_desc); - else printf("=== NO swizzle: FAILED (%d) ===\n", res); + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + if (res == CUDA_SUCCESS) dump("NO swizzle, OOB_NONE", tma_desc); + else printf("=== NO swizzle, OOB_NONE: FAILED (%d) ===\n", res); - // 2: SWIZZLE_128B - res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, - d_ptr, tensorDims, globalStrides, boxDims, elementStrides, + // 2: SWIZZLE_128B, OOB_NONE + res = cuTensorMapEncodeTiled(&tma_desc, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, + d_ptr, globalDim, globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B, - CU_TENSOR_MAP_L2_PROMOTION_NONE); - if (res == CUDA_SUCCESS) dump_desc("SWIZZLE_128B", tma_desc); - else printf("=== SWIZZLE_128B: FAILED (%d) ===\n", res); + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + if (res == CUDA_SUCCESS) dump("SWIZZLE_128B, OOB_NONE", tma_desc); + else printf("=== SWIZZLE_128B, OOB_NONE: FAILED (%d) ===\n", res); + + // 3: NO swizzle, OOB_FILL_ZERO + res = cuTensorMapEncodeTiled(&tma_desc, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, + d_ptr, globalDim, globalStrides, boxDim, elementStrides, + CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_ZERO); + if (res == CUDA_SUCCESS) dump("NO swizzle, OOB_FILL_ZERO", tma_desc); + else printf("=== NO swizzle, OOB_FILL_ZERO: FAILED (%d) ===\n", res); + + // 4: SWIZZLE_128B, OOB_FILL_ZERO + res = cuTensorMapEncodeTiled(&tma_desc, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, + d_ptr, globalDim, globalStrides, boxDim, elementStrides, + CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_ZERO); + if (res == CUDA_SUCCESS) dump("SWIZZLE_128B, OOB_FILL_ZERO", tma_desc); + else printf("=== SWIZZLE_128B, OOB_FILL_ZERO: FAILED (%d) ===\n", res); cudaFree(d_ptr); printf("\nPASSED\n");