From 16027018df6b7be459d506b8d700fc0b191ef407 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sat, 30 May 2026 08:38:55 +0000
Subject: [PATCH] P4: fix TMA load test (32-bit SMEM addrs, proper mbarrier)

---
 tests/unit/test_p4_tma_load_test.cu | 59 ++++++++++-------------------
 1 file changed, 19 insertions(+), 40 deletions(-)
diff --git a/tests/unit/test_p4_tma_load_test.cu b/tests/unit/test_p4_tma_load_test.cu
index 02e7f687..935775fd 100644
--- a/tests/unit/test_p4_tma_load_test.cu
+++ b/tests/unit/test_p4_tma_load_test.cu
@@ -4,11 +4,6 @@
  * Creates TMA descriptors with various swizzle/OOB configs,
  * launches a kernel that does cp.async.bulk.tensor.2d with each,
  * and checks if the load completes (mbarrier signals) or hangs.
- *
- * The existing FMHA kernel's TMA loads work (they use CuTeDSL's
- * TMA path which creates descriptors with swizzle). The raw CUDA
- * path with NO swizzle hangs. This test identifies which field
- * causes the hang.
  */
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -16,41 +11,43 @@
 #include <cstdint>
 #include <cstring>
 
-// Maximum wait iterations before declaring a hang
 #define MAX_WAIT 1000000
 
 __global__ void tma_load_test_kernel(
-    const void* tma_desc_ptr,    // 128-byte TMA descriptor in GMEM
-    void* smem_out,              // SMEM buffer for TMA output (256 bytes)
-    int* result                  // GMEM: 0=pending, 1=success, -1=hang
+    const void* tma_desc_ptr,
+    int* result
 ) {
-    // Set up mbarrier in SMEM
+    // SMEM: mbarrier (8 bytes aligned) + output buffer (512 bytes)
     __shared__ uint64_t mbar;
+    __shared__ uint16_t smem_out[16 * 16];  // 256 BF16 values
+
     if (threadIdx.x == 0) {
-        // Initialize mbarrier with expected count = 1 (one TMA load)
-        asm volatile("mbarrier.init.shared.b64 [%0], 1;" :: "r"(__cvta_generic_to_shared(&mbar)));
+        uint32_t mbar_addr = __cvta_generic_to_shared(&mbar);
+        asm volatile("mbarrier.init.shared.b64 [%0], 1;" :: "r"(mbar_addr));
         asm volatile("fence.mbarrier_init.release.cluster;" ::: "memory");
     }
     __syncthreads();
 
-    // Only thread 0 issues TMA
     if (threadIdx.x == 0) {
-        // TMA load: 16x16 BF16 tile = 512 bytes
-        // cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
+        uint32_t smem_addr = __cvta_generic_to_shared(smem_out);
+        uint32_t mbar_addr = __cvta_generic_to_shared(&mbar);
+
+        // Issue TMA load: 16x16 BF16 = 512 bytes
         asm volatile(
             "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes "
             "[%0], [%1, {%3, %4}], [%2];"
-            :: "r"(__cvta_generic_to_shared(smem_out)),
+            :: "r"(smem_addr),
                "l"(tma_desc_ptr),
-               "r"(__cvta_generic_to_shared(&mbar)),
+               "r"(mbar_addr),
                "r"(0),   // coord row = 0
                "r"(0)    // coord col = 0
         );
     }
     __syncthreads();
 
-    // Wait for mbarrier to complete (TMA arrival)
+    // Wait for mbarrier
     if (threadIdx.x == 0) {
+        uint32_t mbar_addr = __cvta_generic_to_shared(&mbar);
         int waited = 0;
         int arrived = 0;
         while (waited < MAX_WAIT) {
@@ -62,16 +59,12 @@ __global__ void tma_load_test_kernel(
                 "selp.b32 %1, 1, 0, p;\n\t"
                 "}"
                 : "=r"(state)
-                : "r"(__cvta_generic_to_shared(&mbar))
+                : "r"(mbar_addr)
             );
             if (state) { arrived = 1; break; }
             waited++;
         }
-        if (arrived) {
-            *result = 1;  // success
-        } else {
-            *result = -1; // hang
-        }
+        *result = arrived ? 1 : -1;
     }
 }
 
@@ -104,19 +97,14 @@ int main() {
     const int ROWS = 128;
     const int COLS = 16;
     const size_t DATA_SIZE = ROWS * COLS * 2;
-    const size_t SMEM_SIZE = 512;  // 16x16 BF16 = 512 bytes
 
-    // Allocate source data
     void* d_data;
     cudaMalloc(&d_data, DATA_SIZE);
-    cudaMemset(d_data, 1, DATA_SIZE);  // Fill with non-zero data
+    cudaMemset(d_data, 1, DATA_SIZE);
 
-    // Allocate result
     int* d_result;
     cudaMalloc(&d_result, sizeof(int));
-    cudaMemset(d_result, 0, sizeof(int));
 
-    // Test configs
     struct { const char* name; int swizzle; int oob; } configs[] = {
         {"NO swizzle, OOB_NONE", 0, 0},
         {"SWIZZLE_128B, OOB_NONE", 1, 0},
@@ -127,23 +115,15 @@ int main() {
     for (int i = 0; i < 4; i++) {
         printf("Testing: %s\n", configs[i].name);
 
-        // Create descriptor and copy to GMEM
         CUtensorMap desc = create_descriptor(d_data, configs[i].swizzle, configs[i].oob);
         void* d_desc;
         cudaMalloc(&d_desc, sizeof(CUtensorMap));
         cudaMemcpy(d_desc, &desc, sizeof(CUtensorMap), cudaMemcpyHostToDevice);
 
-        // Allocate SMEM output
-        void* d_smem_out;
-        cudaMalloc(&d_smem_out, SMEM_SIZE);
-
-        // Reset result
         cudaMemset(d_result, 0, sizeof(int));
 
-        // Launch with timeout
-        tma_load_test_kernel<<<1, 32, SMEM_SIZE + 64>>>(d_desc, d_smem_out, d_result);
+        tma_load_test_kernel<<<1, 32>>>(d_desc, d_result);
 
-        // Check result with a short timeout on host
         cudaError_t err = cudaDeviceSynchronize();
         int h_result;
         cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
@@ -159,7 +139,6 @@ int main() {
         }
 
         cudaFree(d_desc);
-        cudaFree(d_smem_out);
     }
 
     cudaFree(d_data);