From 4898a946eb04426b74df29a90610a4e24c257d19 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 30 May 2026 08:33:12 +0000 Subject: [PATCH] P4: fix TMA descriptor dump API order (dtype before rank) --- tests/unit/test_p4_tma_descriptor_dump.cu | 105 +++++++++------------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/tests/unit/test_p4_tma_descriptor_dump.cu b/tests/unit/test_p4_tma_descriptor_dump.cu index c118dccf..fa806e9f 100644 --- a/tests/unit/test_p4_tma_descriptor_dump.cu +++ b/tests/unit/test_p4_tma_descriptor_dump.cu @@ -1,8 +1,6 @@ /** * P4: Dump TMA descriptor bytes for comparison. - * - * Creates TMA descriptors using cuTensorMapEncodeTiled for a (128,16) BF16 - * tensor with various swizzle modes, and dumps the 128-byte descriptor. + * Uses CUDA Driver API directly. */ #include #include @@ -10,6 +8,14 @@ #include #include +/* CUDA 13.2 enum values for cuTensorMapEncodeTiled: + * CUtensorMapDataType: 0=F16, 6=BF16 + * CUtensorMapInterleave: 0=NONE, 1=16B, 2=32B + * CUtensorMapSwizzle: 0=NONE, 1=4B, 2=32B, 3=64B, 4=128B + * CUtensorMapL2Promotion: 0=NONE, 1=64B, 2=128B, 3=256B + * CUtensorMapOOBFill: 0=NONE, 1=ZERO + */ + int main() { const int ROWS = 128; const int COLS = 16; @@ -19,43 +25,34 @@ int main() { cudaMalloc(&d_ptr, SIZE); cudaMemset(d_ptr, 0, SIZE); - // cuTensorMapEncodeTiled signature: - // CUtensorMap*, cuuint32_t rank, CUtensorMapDataType, - // void*, cuuint64_t*, cuuint64_t*, cuuint32_t*, cuuint32_t*, - // CUtensorMapInterleave, CUtensorMapSwizzle, - // CUtensorMapL2Promotion, CUtensorMapOOBFill - - cuuint64_t tensorDims[] = {ROWS, COLS}; - cuuint64_t globalStrides[] = {COLS * 2, 2}; // byte strides + cuuint64_t tensorDims[] = {(cuuint64_t)ROWS, (cuuint64_t)COLS}; + cuuint64_t globalStrides[] = {(cuuint64_t)(COLS * 2), (cuuint64_t)2}; cuuint32_t boxDims[] = {16, 16}; cuuint32_t elementStrides[] = {1, 1}; CUtensorMap tma_desc; CUresult res; - // ================================================================== // Descriptor 1: NO swizzle - // ================================================================== res = cuTensorMapEncodeTiled( &tma_desc, - 2, - CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, + (CUtensorMapDataType)6, // BF16 + 2, // rank d_ptr, tensorDims, globalStrides, boxDims, elementStrides, - CU_TENSOR_MAP_INTERLEAVE_NONE, - CU_TENSOR_MAP_SWIZZLE_NONE, - CU_TENSOR_MAP_L2_PROMOTION_NONE, - CU_TENSOR_MAP_OOB_FILL_NONE + (CUtensorMapInterleave)0, // NONE + (CUtensorMapSwizzle)0, // NONE + (CUtensorMapL2Promotion)0, // NONE + (CUtensorMapOOBFill)0 // NONE ); printf("=== Descriptor 1: NO swizzle ===\n"); - if (res != CUDA_SUCCESS) { - printf("FAILED: %d\n", res); - } else { - const uint8_t* bytes = reinterpret_cast(&tma_desc); + if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } + else { + auto* bytes = reinterpret_cast(&tma_desc); for (int i = 0; i < 128; i += 16) { printf("[%3d-%3d]: ", i, i+15); for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]); @@ -63,29 +60,26 @@ int main() { } } - // ================================================================== // Descriptor 2: SWIZZLE_128B - // ================================================================== res = cuTensorMapEncodeTiled( &tma_desc, + (CUtensorMapDataType)6, 2, - CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, globalStrides, boxDims, elementStrides, - CU_TENSOR_MAP_INTERLEAVE_NONE, - CU_TENSOR_MAP_SWIZZLE_128B, - CU_TENSOR_MAP_L2_PROMOTION_NONE, - CU_TENSOR_MAP_OOB_FILL_NONE + (CUtensorMapInterleave)0, + (CUtensorMapSwizzle)4, // 128B + (CUtensorMapL2Promotion)0, + (CUtensorMapOOBFill)0 ); printf("\n=== Descriptor 2: SWIZZLE_128B ===\n"); - if (res != CUDA_SUCCESS) { - printf("FAILED: %d\n", res); - } else { - const uint8_t* bytes = reinterpret_cast(&tma_desc); + if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } + else { + auto* bytes = reinterpret_cast(&tma_desc); for (int i = 0; i < 128; i += 16) { printf("[%3d-%3d]: ", i, i+15); for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]); @@ -93,29 +87,26 @@ int main() { } } - // ================================================================== // Descriptor 3: NO swizzle, OOB_FILL_ZERO - // ================================================================== res = cuTensorMapEncodeTiled( &tma_desc, + (CUtensorMapDataType)6, 2, - CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, globalStrides, boxDims, elementStrides, - CU_TENSOR_MAP_INTERLEAVE_NONE, - CU_TENSOR_MAP_SWIZZLE_NONE, - CU_TENSOR_MAP_L2_PROMOTION_NONE, - CU_TENSOR_MAP_OOB_FILL_ZERO + (CUtensorMapInterleave)0, + (CUtensorMapSwizzle)0, + (CUtensorMapL2Promotion)0, + (CUtensorMapOOBFill)1 // ZERO ); printf("\n=== Descriptor 3: NO swizzle, OOB_FILL_ZERO ===\n"); - if (res != CUDA_SUCCESS) { - printf("FAILED: %d\n", res); - } else { - const uint8_t* bytes = reinterpret_cast(&tma_desc); + if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } + else { + auto* bytes = reinterpret_cast(&tma_desc); for (int i = 0; i < 128; i += 16) { printf("[%3d-%3d]: ", i, i+15); for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]); @@ -123,29 +114,26 @@ int main() { } } - // ================================================================== // Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO - // ================================================================== res = cuTensorMapEncodeTiled( &tma_desc, + (CUtensorMapDataType)6, 2, - CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, globalStrides, boxDims, elementStrides, - CU_TENSOR_MAP_INTERLEAVE_NONE, - CU_TENSOR_MAP_SWIZZLE_128B, - CU_TENSOR_MAP_L2_PROMOTION_NONE, - CU_TENSOR_MAP_OOB_FILL_ZERO + (CUtensorMapInterleave)0, + (CUtensorMapSwizzle)4, + (CUtensorMapL2Promotion)0, + (CUtensorMapOOBFill)1 ); printf("\n=== Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO ===\n"); - if (res != CUDA_SUCCESS) { - printf("FAILED: %d\n", res); - } else { - const uint8_t* bytes = reinterpret_cast(&tma_desc); + if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); } + else { + auto* bytes = reinterpret_cast(&tma_desc); for (int i = 0; i < 128; i += 16) { printf("[%3d-%3d]: ", i, i+15); for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]); @@ -153,11 +141,6 @@ int main() { } } - // Also: test an actual TMA load to see which descriptors work - printf("\n=== TMA load test (see separate kernel test) ===\n"); - printf("Descriptor creation OK for all variants.\n"); - printf("Actual TMA load behavior requires launching a kernel.\n"); - cudaFree(d_ptr); printf("\nPASSED\n"); return 0;