diff --git a/tests/unit/test_p4_tma_load_test.cu b/tests/unit/test_p4_tma_load_test.cu
new file mode 100644
index 00000000..02e7f687
--- /dev/null
+++ b/tests/unit/test_p4_tma_load_test.cu
@@ -0,0 +1,169 @@
+/**
+ * P4: Test actual TMA loads with different descriptors.
+ *
+ * Creates TMA descriptors with various swizzle/OOB configs,
+ * launches a kernel that does cp.async.bulk.tensor.2d with each,
+ * and checks if the load completes (mbarrier signals) or hangs.
+ *
+ * The existing FMHA kernel's TMA loads work (they use CuTeDSL's
+ * TMA path which creates descriptors with swizzle). The raw CUDA
+ * path with NO swizzle hangs. This test identifies which field
+ * causes the hang.
+ */
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+
+// Maximum wait iterations before declaring a hang
+#define MAX_WAIT 1000000
+
+__global__ void tma_load_test_kernel(
+    const void* tma_desc_ptr,    // 128-byte TMA descriptor in GMEM
+    void* smem_out,              // SMEM buffer for TMA output (256 bytes)
+    int* result                  // GMEM: 0=pending, 1=success, -1=hang
+) {
+    // Set up mbarrier in SMEM
+    __shared__ uint64_t mbar;
+    if (threadIdx.x == 0) {
+        // Initialize mbarrier with expected count = 1 (one TMA load)
+        asm volatile("mbarrier.init.shared.b64 [%0], 1;" :: "r"(__cvta_generic_to_shared(&mbar)));
+        asm volatile("fence.mbarrier_init.release.cluster;" ::: "memory");
+    }
+    __syncthreads();
+
+    // Only thread 0 issues TMA
+    if (threadIdx.x == 0) {
+        // TMA load: 16x16 BF16 tile = 512 bytes
+        // cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
+        asm volatile(
+            "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes "
+            "[%0], [%1, {%3, %4}], [%2];"
+            :: "r"(__cvta_generic_to_shared(smem_out)),
+               "l"(tma_desc_ptr),
+               "r"(__cvta_generic_to_shared(&mbar)),
+               "r"(0),   // coord row = 0
+               "r"(0)    // coord col = 0
+        );
+    }
+    __syncthreads();
+
+    // Wait for mbarrier to complete (TMA arrival)
+    if (threadIdx.x == 0) {
+        int waited = 0;
+        int arrived = 0;
+        while (waited < MAX_WAIT) {
+            uint32_t state;
+            asm volatile(
+                "{\n\t"
+                ".reg .pred p;\n\t"
+                "mbarrier.try_wait.parity.shared.b64 p, [%0], 0;\n\t"
+                "selp.b32 %1, 1, 0, p;\n\t"
+                "}"
+                : "=r"(state)
+                : "r"(__cvta_generic_to_shared(&mbar))
+            );
+            if (state) { arrived = 1; break; }
+            waited++;
+        }
+        if (arrived) {
+            *result = 1;  // success
+        } else {
+            *result = -1; // hang
+        }
+    }
+}
+
+
+CUtensorMap create_descriptor(void* d_ptr, int swizzle, int oob_fill) {
+    CUtensorMap desc;
+    cuuint64_t globalDim[] = {128, 16};
+    cuuint64_t globalStrides[] = {16 * 2, 2};
+    cuuint32_t boxDim[] = {16, 16};
+    cuuint32_t elementStrides[] = {1, 1};
+
+    CUtensorMapSwizzle sw = (swizzle == 0) ? CU_TENSOR_MAP_SWIZZLE_NONE : CU_TENSOR_MAP_SWIZZLE_128B;
+    CUtensorMapFloatOOBfill oob = (oob_fill == 0) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
+                                                    : CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;
+
+    CUresult res = cuTensorMapEncodeTiled(&desc,
+        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2,
+        d_ptr, globalDim, globalStrides, boxDim, elementStrides,
+        CU_TENSOR_MAP_INTERLEAVE_NONE, sw,
+        CU_TENSOR_MAP_L2_PROMOTION_NONE, oob);
+
+    if (res != CUDA_SUCCESS) {
+        printf("  Descriptor creation FAILED: %d\n", res);
+    }
+    return desc;
+}
+
+
+int main() {
+    const int ROWS = 128;
+    const int COLS = 16;
+    const size_t DATA_SIZE = ROWS * COLS * 2;
+    const size_t SMEM_SIZE = 512;  // 16x16 BF16 = 512 bytes
+
+    // Allocate source data
+    void* d_data;
+    cudaMalloc(&d_data, DATA_SIZE);
+    cudaMemset(d_data, 1, DATA_SIZE);  // Fill with non-zero data
+
+    // Allocate result
+    int* d_result;
+    cudaMalloc(&d_result, sizeof(int));
+    cudaMemset(d_result, 0, sizeof(int));
+
+    // Test configs
+    struct { const char* name; int swizzle; int oob; } configs[] = {
+        {"NO swizzle, OOB_NONE", 0, 0},
+        {"SWIZZLE_128B, OOB_NONE", 1, 0},
+        {"NO swizzle, OOB_FILL_ZERO", 0, 1},
+        {"SWIZZLE_128B, OOB_FILL_ZERO", 1, 1},
+    };
+
+    for (int i = 0; i < 4; i++) {
+        printf("Testing: %s\n", configs[i].name);
+
+        // Create descriptor and copy to GMEM
+        CUtensorMap desc = create_descriptor(d_data, configs[i].swizzle, configs[i].oob);
+        void* d_desc;
+        cudaMalloc(&d_desc, sizeof(CUtensorMap));
+        cudaMemcpy(d_desc, &desc, sizeof(CUtensorMap), cudaMemcpyHostToDevice);
+
+        // Allocate SMEM output
+        void* d_smem_out;
+        cudaMalloc(&d_smem_out, SMEM_SIZE);
+
+        // Reset result
+        cudaMemset(d_result, 0, sizeof(int));
+
+        // Launch with timeout
+        tma_load_test_kernel<<<1, 32, SMEM_SIZE + 64>>>(d_desc, d_smem_out, d_result);
+
+        // Check result with a short timeout on host
+        cudaError_t err = cudaDeviceSynchronize();
+        int h_result;
+        cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
+
+        if (err != cudaSuccess) {
+            printf("  HANG or ERROR: %s (result=%d)\n", cudaGetErrorString(err), h_result);
+        } else if (h_result == 1) {
+            printf("  SUCCESS: TMA load completed\n");
+        } else if (h_result == -1) {
+            printf("  HANG: mbarrier never signaled\n");
+        } else {
+            printf("  UNKNOWN: result=%d\n", h_result);
+        }
+
+        cudaFree(d_desc);
+        cudaFree(d_smem_out);
+    }
+
+    cudaFree(d_data);
+    cudaFree(d_result);
+    printf("\nPASSED\n");
+    return 0;
+}