diff --git a/tests/unit/test_tma_desc_debug.cu b/tests/unit/test_tma_desc_debug.cu
new file mode 100644
index 00000000..db16f449
--- /dev/null
+++ b/tests/unit/test_tma_desc_debug.cu
@@ -0,0 +1,125 @@
+/**
+ * Minimal TMA descriptor creation test.
+ * Tests various tile dimensions to find what cuTensorMapEncodeTiled accepts.
+ */
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cstdio>
+
+typedef unsigned short bf16_t;
+
+int main() {
+    printf("=== TMA Descriptor Creation Test ===\n");
+
+    bf16_t* d_data;
+    cudaMalloc(&d_data, 128 * 64 * sizeof(bf16_t));  // (128, 64) BF16
+
+    // Test 1: (128, 16) global, (16, 128) tile — same as working test
+    {
+        uint64_t gdim[] = {16, 128};
+        uint64_t gstr[] = {1, 16};
+        uint32_t tdim[] = {16, 128};
+        uint32_t tstr[] = {1, 16};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 1 (128x16 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    // Test 2: (64, 128) global, (16, 128) tile, tstr=[1,64]
+    {
+        uint64_t gdim[] = {64, 128};
+        uint64_t gstr[] = {1, 64};
+        uint32_t tdim[] = {16, 128};
+        uint32_t tstr[] = {1, 64};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 2 (128x64 global, 16x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    // Test 3: (64, 128) global, (16, 128) tile, tstr=[1,16] (tile-local stride)
+    {
+        uint64_t gdim[] = {64, 128};
+        uint64_t gstr[] = {1, 64};
+        uint32_t tdim[] = {16, 128};
+        uint32_t tstr[] = {1, 16};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 3 (128x64 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    // Test 4: (64, 128) global, (16, 128) tile — try element type UINT8
+    {
+        uint64_t gdim[] = {64, 128};
+        uint64_t gstr[] = {1, 64};
+        uint32_t tdim[] = {16, 128};
+        uint32_t tstr[] = {1, 64};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT8, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 4 (UINT8, 128x64, 16x128, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    // Test 5: Same as working test but with global (64, 128)
+    {
+        uint64_t gdim[] = {64, 128};
+        uint64_t gstr[] = {1, 64};
+        uint32_t tdim[] = {64, 128};
+        uint32_t tstr[] = {1, 64};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 5 (128x64 global, 64x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    // Test 6: (64, 128) global, (32, 128) tile (tile cols = 32)
+    {
+        uint64_t gdim[] = {64, 128};
+        uint64_t gstr[] = {1, 64};
+        uint32_t tdim[] = {32, 128};
+        uint32_t tstr[] = {1, 64};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 6 (128x64 global, 32x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    // Test 7: (64, 128) global, (16, 128) tile, UINT16, tstr=[1,64], SWIZZLE_128B
+    {
+        uint64_t gdim[] = {64, 128};
+        uint64_t gstr[] = {1, 64};
+        uint32_t tdim[] = {16, 128};
+        uint32_t tstr[] = {1, 64};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 7 (128x64, 16x128, SW128): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    // Test 8: (16, 128) global, (16, 128) tile — EXACT working test dimensions
+    {
+        uint64_t gdim[] = {16, 128};
+        uint64_t gstr[] = {1, 16};
+        uint32_t tdim[] = {16, 128};
+        uint32_t tstr[] = {1, 16};
+        CUtensorMap desc;
+        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("Test 8 (128x16 global, 16x128 tile, tstr=[1,16] — exact working): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    }
+
+    cudaFree(d_data);
+    printf("Done.\n");
+    return 0;
+}