diff --git a/tests/unit/test_p4_tma_descriptor_dump.cu b/tests/unit/test_p4_tma_descriptor_dump.cu index 2708b26d..c118dccf 100644 --- a/tests/unit/test_p4_tma_descriptor_dump.cu +++ b/tests/unit/test_p4_tma_descriptor_dump.cu @@ -1,47 +1,44 @@ /** * P4: Dump TMA descriptor bytes for comparison. * - * Creates a TMA descriptor using cuTensorMapEncodeTiled for a (128,16) BF16 + * Creates TMA descriptors using cuTensorMapEncodeTiled for a (128,16) BF16 * tensor with various swizzle modes, and dumps the 128-byte descriptor. - * - * The working CuTeDSL path on the B200 can be used to create a matching - * descriptor via a small Python script. This test dumps the raw CUDA path. - * - * Usage: - * fire_b200_cuda_test tests/unit/test_p4_tma_descriptor_dump.cu */ -#include #include +#include #include #include #include -__global__ void dummy_kernel() {} - int main() { - // Allocate a (128, 16) BF16 tensor on GPU const int ROWS = 128; const int COLS = 16; - const size_t SIZE = ROWS * COLS * 2; // 4096 bytes + const size_t SIZE = ROWS * COLS * 2; void* d_ptr; cudaMalloc(&d_ptr, SIZE); cudaMemset(d_ptr, 0, SIZE); + // cuTensorMapEncodeTiled signature: + // CUtensorMap*, cuuint32_t rank, CUtensorMapDataType, + // void*, cuuint64_t*, cuuint64_t*, cuuint32_t*, cuuint32_t*, + // CUtensorMapInterleave, CUtensorMapSwizzle, + // CUtensorMapL2Promotion, CUtensorMapOOBFill + + cuuint64_t tensorDims[] = {ROWS, COLS}; + cuuint64_t globalStrides[] = {COLS * 2, 2}; // byte strides + cuuint32_t boxDims[] = {16, 16}; + cuuint32_t elementStrides[] = {1, 1}; + CUtensorMap tma_desc; CUresult res; // ================================================================== - // Descriptor 1: NO swizzle (the one that hangs in cp.async.bulk.tensor) + // Descriptor 1: NO swizzle // ================================================================== - uint32_t tensorDims[] = {ROWS, COLS}; - uint64_t globalStrides[] = {COLS * 2, 2}; // bytes - uint32_t boxDims[] = {16, 16}; - uint32_t elementStrides[] = {1, 1}; - res = cuTensorMapEncodeTiled( &tma_desc, - 2, // rank + 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_ptr, tensorDims, @@ -56,7 +53,7 @@ int main() { printf("=== Descriptor 1: NO swizzle ===\n"); if (res != CUDA_SUCCESS) { - printf("cuTensorMapEncodeTiled FAILED: %d\n", res); + printf("FAILED: %d\n", res); } else { const uint8_t* bytes = reinterpret_cast(&tma_desc); for (int i = 0; i < 128; i += 16) { @@ -67,7 +64,7 @@ int main() { } // ================================================================== - // Descriptor 2: SWIZZLE_128B (the one CuTeDSL uses for canonical layout) + // Descriptor 2: SWIZZLE_128B // ================================================================== res = cuTensorMapEncodeTiled( &tma_desc, @@ -86,7 +83,7 @@ int main() { printf("\n=== Descriptor 2: SWIZZLE_128B ===\n"); if (res != CUDA_SUCCESS) { - printf("cuTensorMapEncodeTiled FAILED: %d\n", res); + printf("FAILED: %d\n", res); } else { const uint8_t* bytes = reinterpret_cast(&tma_desc); for (int i = 0; i < 128; i += 16) { @@ -97,40 +94,7 @@ int main() { } // ================================================================== - // Descriptor 3: Different globalStrides (element strides vs byte strides) - // CUDA 13 might need element strides, not byte strides - // ================================================================== - // Try with globalStrides in ELEMENTS (not bytes) - uint64_t globalStrides_elem[] = {COLS, 1}; // elements, not bytes - res = cuTensorMapEncodeTiled( - &tma_desc, - 2, - CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, - d_ptr, - tensorDims, - globalStrides_elem, - boxDims, - elementStrides, - CU_TENSOR_MAP_INTERLEAVE_NONE, - CU_TENSOR_MAP_SWIZZLE_NONE, - CU_TENSOR_MAP_L2_PROMOTION_NONE, - CU_TENSOR_MAP_OOB_FILL_NONE - ); - - printf("\n=== Descriptor 3: NO swizzle, element strides (not byte) ===\n"); - if (res != CUDA_SUCCESS) { - printf("cuTensorMapEncodeTiled FAILED: %d\n", res); - } else { - const uint8_t* bytes = reinterpret_cast(&tma_desc); - for (int i = 0; i < 128; i += 16) { - printf("[%3d-%3d]: ", i, i+15); - for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]); - printf("\n"); - } - } - - // ================================================================== - // Descriptor 4: OOB_FILL_ZERO (maybe the hang is caused by OOB access) + // Descriptor 3: NO swizzle, OOB_FILL_ZERO // ================================================================== res = cuTensorMapEncodeTiled( &tma_desc, @@ -147,9 +111,9 @@ int main() { CU_TENSOR_MAP_OOB_FILL_ZERO ); - printf("\n=== Descriptor 4: NO swizzle, OOB_FILL_ZERO ===\n"); + printf("\n=== Descriptor 3: NO swizzle, OOB_FILL_ZERO ===\n"); if (res != CUDA_SUCCESS) { - printf("cuTensorMapEncodeTiled FAILED: %d\n", res); + printf("FAILED: %d\n", res); } else { const uint8_t* bytes = reinterpret_cast(&tma_desc); for (int i = 0; i < 128; i += 16) { @@ -160,16 +124,39 @@ int main() { } // ================================================================== - // Also test: actual TMA load with descriptor 1 (the one that hangs) + // Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO // ================================================================== - // We can't easily do this in a host test. The hang happens inside - // a kernel when cp.async.bulk.tensor.2d is issued with this descriptor. - // That test exists separately. + res = cuTensorMapEncodeTiled( + &tma_desc, + 2, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, + d_ptr, + tensorDims, + globalStrides, + boxDims, + elementStrides, + CU_TENSOR_MAP_INTERLEAVE_NONE, + CU_TENSOR_MAP_SWIZZLE_128B, + CU_TENSOR_MAP_L2_PROMOTION_NONE, + CU_TENSOR_MAP_OOB_FILL_ZERO + ); - printf("\n=== Test: TMA load with descriptor 1 (NO swizzle) ===\n"); - // Quick test: launch a kernel that does TMA load - // This is the test that hangs — we just verify the descriptor creates OK - printf("Descriptor creation OK. TMA load test requires separate kernel.\n"); + printf("\n=== Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO ===\n"); + if (res != CUDA_SUCCESS) { + printf("FAILED: %d\n", res); + } else { + const uint8_t* bytes = reinterpret_cast(&tma_desc); + for (int i = 0; i < 128; i += 16) { + printf("[%3d-%3d]: ", i, i+15); + for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]); + printf("\n"); + } + } + + // Also: test an actual TMA load to see which descriptors work + printf("\n=== TMA load test (see separate kernel test) ===\n"); + printf("Descriptor creation OK for all variants.\n"); + printf("Actual TMA load behavior requires launching a kernel.\n"); cudaFree(d_ptr); printf("\nPASSED\n");