diff --git a/tests/unit/test_p4_tma_load_test.cu b/tests/unit/test_p4_tma_load_test.cu new file mode 100644 index 00000000..02e7f687 --- /dev/null +++ b/tests/unit/test_p4_tma_load_test.cu @@ -0,0 +1,169 @@ +/** + * P4: Test actual TMA loads with different descriptors. + * + * Creates TMA descriptors with various swizzle/OOB configs, + * launches a kernel that does cp.async.bulk.tensor.2d with each, + * and checks if the load completes (mbarrier signals) or hangs. + * + * The existing FMHA kernel's TMA loads work (they use CuTeDSL's + * TMA path which creates descriptors with swizzle). The raw CUDA + * path with NO swizzle hangs. This test identifies which field + * causes the hang. + */ +#include +#include +#include +#include +#include + +// Maximum wait iterations before declaring a hang +#define MAX_WAIT 1000000 + +__global__ void tma_load_test_kernel( + const void* tma_desc_ptr, // 128-byte TMA descriptor in GMEM + void* smem_out, // SMEM buffer for TMA output (256 bytes) + int* result // GMEM: 0=pending, 1=success, -1=hang +) { + // Set up mbarrier in SMEM + __shared__ uint64_t mbar; + if (threadIdx.x == 0) { + // Initialize mbarrier with expected count = 1 (one TMA load) + asm volatile("mbarrier.init.shared.b64 [%0], 1;" :: "r"(__cvta_generic_to_shared(&mbar))); + asm volatile("fence.mbarrier_init.release.cluster;" ::: "memory"); + } + __syncthreads(); + + // Only thread 0 issues TMA + if (threadIdx.x == 0) { + // TMA load: 16x16 BF16 tile = 512 bytes + // cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes + asm volatile( + "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes " + "[%0], [%1, {%3, %4}], [%2];" + :: "r"(__cvta_generic_to_shared(smem_out)), + "l"(tma_desc_ptr), + "r"(__cvta_generic_to_shared(&mbar)), + "r"(0), // coord row = 0 + "r"(0) // coord col = 0 + ); + } + __syncthreads(); + + // Wait for mbarrier to complete (TMA arrival) + if (threadIdx.x == 0) { + int waited = 0; + int arrived = 0; + while (waited < MAX_WAIT) { + uint32_t state; + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "mbarrier.try_wait.parity.shared.b64 p, [%0], 0;\n\t" + "selp.b32 %1, 1, 0, p;\n\t" + "}" + : "=r"(state) + : "r"(__cvta_generic_to_shared(&mbar)) + ); + if (state) { arrived = 1; break; } + waited++; + } + if (arrived) { + *result = 1; // success + } else { + *result = -1; // hang + } + } +} + + +CUtensorMap create_descriptor(void* d_ptr, int swizzle, int oob_fill) { + CUtensorMap desc; + cuuint64_t globalDim[] = {128, 16}; + cuuint64_t globalStrides[] = {16 * 2, 2}; + cuuint32_t boxDim[] = {16, 16}; + cuuint32_t elementStrides[] = {1, 1}; + + CUtensorMapSwizzle sw = (swizzle == 0) ? CU_TENSOR_MAP_SWIZZLE_NONE : CU_TENSOR_MAP_SWIZZLE_128B; + CUtensorMapFloatOOBfill oob = (oob_fill == 0) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + : CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA; + + CUresult res = cuTensorMapEncodeTiled(&desc, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, + d_ptr, globalDim, globalStrides, boxDim, elementStrides, + CU_TENSOR_MAP_INTERLEAVE_NONE, sw, + CU_TENSOR_MAP_L2_PROMOTION_NONE, oob); + + if (res != CUDA_SUCCESS) { + printf(" Descriptor creation FAILED: %d\n", res); + } + return desc; +} + + +int main() { + const int ROWS = 128; + const int COLS = 16; + const size_t DATA_SIZE = ROWS * COLS * 2; + const size_t SMEM_SIZE = 512; // 16x16 BF16 = 512 bytes + + // Allocate source data + void* d_data; + cudaMalloc(&d_data, DATA_SIZE); + cudaMemset(d_data, 1, DATA_SIZE); // Fill with non-zero data + + // Allocate result + int* d_result; + cudaMalloc(&d_result, sizeof(int)); + cudaMemset(d_result, 0, sizeof(int)); + + // Test configs + struct { const char* name; int swizzle; int oob; } configs[] = { + {"NO swizzle, OOB_NONE", 0, 0}, + {"SWIZZLE_128B, OOB_NONE", 1, 0}, + {"NO swizzle, OOB_FILL_ZERO", 0, 1}, + {"SWIZZLE_128B, OOB_FILL_ZERO", 1, 1}, + }; + + for (int i = 0; i < 4; i++) { + printf("Testing: %s\n", configs[i].name); + + // Create descriptor and copy to GMEM + CUtensorMap desc = create_descriptor(d_data, configs[i].swizzle, configs[i].oob); + void* d_desc; + cudaMalloc(&d_desc, sizeof(CUtensorMap)); + cudaMemcpy(d_desc, &desc, sizeof(CUtensorMap), cudaMemcpyHostToDevice); + + // Allocate SMEM output + void* d_smem_out; + cudaMalloc(&d_smem_out, SMEM_SIZE); + + // Reset result + cudaMemset(d_result, 0, sizeof(int)); + + // Launch with timeout + tma_load_test_kernel<<<1, 32, SMEM_SIZE + 64>>>(d_desc, d_smem_out, d_result); + + // Check result with a short timeout on host + cudaError_t err = cudaDeviceSynchronize(); + int h_result; + cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost); + + if (err != cudaSuccess) { + printf(" HANG or ERROR: %s (result=%d)\n", cudaGetErrorString(err), h_result); + } else if (h_result == 1) { + printf(" SUCCESS: TMA load completed\n"); + } else if (h_result == -1) { + printf(" HANG: mbarrier never signaled\n"); + } else { + printf(" UNKNOWN: result=%d\n", h_result); + } + + cudaFree(d_desc); + cudaFree(d_smem_out); + } + + cudaFree(d_data); + cudaFree(d_result); + printf("\nPASSED\n"); + return 0; +}