From d8ffdb66e1e405e1cedcf70a0adf19406cd6ff3c Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 30 May 2026 08:35:04 +0000 Subject: [PATCH] P4: fix API signature rank/dtype order, OOB_FILL defines --- tests/unit/test_p4_tma_descriptor_dump.cu | 118 +++++++++++----------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/tests/unit/test_p4_tma_descriptor_dump.cu b/tests/unit/test_p4_tma_descriptor_dump.cu index ce89ee4b..2a71221e 100644 --- a/tests/unit/test_p4_tma_descriptor_dump.cu +++ b/tests/unit/test_p4_tma_descriptor_dump.cu @@ -1,5 +1,6 @@ /** * P4: Dump TMA descriptor bytes for comparison. + * CUDA 13.2 compatible — uses correct API signature. */ #include #include @@ -7,6 +8,33 @@ #include #include +/* CUDA 13.2 cuTensorMapEncodeTiled signature: + * CUresult cuTensorMapEncodeTiled( + * CUtensorMap *tensorMap, + * cuuint32_t tensorRank, + * CUtensorMapDataType dataType, + * void *globalAddress, + * const cuuint64_t *tensorDims, + * const cuuint64_t *globalStrides, + * const cuuint32_t *boxDims, + * const cuuint32_t *elementStrides, + * CUtensorMapInterleave interleave, + * CUtensorMapSwizzle swizzle, + * CUtensorMapL2promotion l2Promotion, + * CUtensorMapOOBfill oobFill + * ); + * + * Note: OOB fill is CUtensorMapOOBfill (lowercase f) in CUDA 13.2 + */ + +// Define missing enum values if needed +#ifndef CU_TENSOR_MAP_OOB_FILL_NONE +#define CU_TENSOR_MAP_OOB_FILL_NONE ((CUtensorMapOOBfill)0) +#endif +#ifndef CU_TENSOR_MAP_OOB_FILL_ZERO +#define CU_TENSOR_MAP_OOB_FILL_ZERO ((CUtensorMapOOBfill)1) +#endif + int main() { const int ROWS = 128; const int COLS = 16; @@ -24,77 +52,47 @@ int main() { CUtensorMap tma_desc; CUresult res; - // Config 1: NO swizzle - printf("=== Descriptor 1: NO swizzle ===\n"); - res = cuTensorMapEncodeTiled( - &tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, + auto dump_desc = [](const char* label, const CUtensorMap& desc) { + printf("=== %s ===\n", label); + auto* b = reinterpret_cast(&desc); + for (int i = 0; i < 128; i += 16) { + printf("[%3d-%3d]: ", i, i+15); + for (int j = 0; j < 16; j++) printf("%02x ", b[i+j]); + printf("\n"); + } + }; + + // 1: NO swizzle + res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, globalStrides, boxDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, - CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_NONE - ); - if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } - else { - auto* b = reinterpret_cast(&tma_desc); - for (int i = 0; i < 128; i += 16) { - printf("[%3d-%3d]: ", i, i+15); - for (int j = 0; j < 16; j++) printf("%02x ", b[i+j]); - printf("\n"); - } - } + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_NONE); + if (res == CUDA_SUCCESS) dump_desc("NO swizzle", tma_desc); + else printf("=== NO swizzle: FAILED (%d) ===\n", res); - // Config 2: SWIZZLE_128B - printf("\n=== Descriptor 2: SWIZZLE_128B ===\n"); - res = cuTensorMapEncodeTiled( - &tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, + // 2: SWIZZLE_128B + res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, globalStrides, boxDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B, - CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_NONE - ); - if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } - else { - auto* b = reinterpret_cast(&tma_desc); - for (int i = 0; i < 128; i += 16) { - printf("[%3d-%3d]: ", i, i+15); - for (int j = 0; j < 16; j++) printf("%02x ", b[i+j]); - printf("\n"); - } - } + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_NONE); + if (res == CUDA_SUCCESS) dump_desc("SWIZZLE_128B", tma_desc); + else printf("=== SWIZZLE_128B: FAILED (%d) ===\n", res); - // Config 3: NO swizzle, OOB_FILL_ZERO - printf("\n=== Descriptor 3: NO swizzle, OOB_FILL_ZERO ===\n"); - res = cuTensorMapEncodeTiled( - &tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, + // 3: NO swizzle, OOB_FILL_ZERO + res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, globalStrides, boxDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, - CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_ZERO - ); - if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } - else { - auto* b = reinterpret_cast(&tma_desc); - for (int i = 0; i < 128; i += 16) { - printf("[%3d-%3d]: ", i, i+15); - for (int j = 0; j < 16; j++) printf("%02x ", b[i+j]); - printf("\n"); - } - } + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_ZERO); + if (res == CUDA_SUCCESS) dump_desc("NO swizzle + OOB_FILL_ZERO", tma_desc); + else printf("=== NO swizzle + OOB_FILL_ZERO: FAILED (%d) ===\n", res); - // Config 4: SWIZZLE_128B, OOB_FILL_ZERO - printf("\n=== Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO ===\n"); - res = cuTensorMapEncodeTiled( - &tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, + // 4: SWIZZLE_128B, OOB_FILL_ZERO + res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, globalStrides, boxDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B, - CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_ZERO - ); - if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } - else { - auto* b = reinterpret_cast(&tma_desc); - for (int i = 0; i < 128; i += 16) { - printf("[%3d-%3d]: ", i, i+15); - for (int j = 0; j < 16; j++) printf("%02x ", b[i+j]); - printf("\n"); - } - } + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_ZERO); + if (res == CUDA_SUCCESS) dump_desc("SWIZZLE_128B + OOB_FILL_ZERO", tma_desc); + else printf("=== SWIZZLE_128B + OOB_FILL_ZERO: FAILED (%d) ===\n", res); cudaFree(d_ptr); printf("\nPASSED\n");