From 85cd95e609aaa64091b6dab6d0b2c38fb8ef7065 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Fri, 29 May 2026 04:45:54 +0000
Subject: [PATCH] debug: TMA context fix test

---
 tests/unit/test_tma_desc_debug3.cu | 89 ++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 tests/unit/test_tma_desc_debug3.cu
diff --git a/tests/unit/test_tma_desc_debug3.cu b/tests/unit/test_tma_desc_debug3.cu
new file mode 100644
index 00000000..f2e52689
--- /dev/null
+++ b/tests/unit/test_tma_desc_debug3.cu
@@ -0,0 +1,89 @@
+/**
+ * Debug TMA descriptor creation — fix context issues.
+ */
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cstdio>
+#include <cstring>
+
+typedef unsigned short bf16_t;
+
+int main() {
+    printf("=== TMA Descriptor Debug (context fix) ===\n");
+
+    // Method 1: Use runtime API to create context, then get it for driver API
+    cudaFree(0);
+
+    // Check if driver API can see the runtime context
+    CUcontext rt_ctx = nullptr;
+    CUresult r1 = cuCtxGetCurrent(&rt_ctx);
+    printf("cuCtxGetCurrent after cudaFree(0): err=%d, ctx=%p\n", (int)r1, (void*)rt_ctx);
+
+    // Method 2: Create a primary context via driver API
+    CUdevice device;
+    cuDeviceGet(&device, 0);
+    CUcontext primary_ctx;
+    CUresult r2 = cuDevicePrimaryCtxRetain(&primary_ctx, device);
+    printf("cuDevicePrimaryCtxRetain: err=%d, ctx=%p\n", (int)r2, (void*)primary_ctx);
+
+    // Set the primary context current
+    CUresult r3 = cuCtxSetCurrent(primary_ctx);
+    printf("cuCtxSetCurrent: err=%d\n", (int)r3);
+
+    // Now try cuTensorMapEncodeTiled
+    bf16_t* d_data;
+    cudaMalloc(&d_data, 128 * 16 * sizeof(bf16_t));
+    printf("cudaMalloc: %s\n", cudaGetErrorString(cudaGetLastError()));
+
+    uint64_t gdim[] = {16, 128};
+    uint64_t gstr[] = {1, 16};
+    uint32_t tdim[] = {16, 128};
+    uint32_t tstr[] = {1, 16};
+    CUtensorMap desc;
+
+    printf("\n--- Test with UINT16 ---\n");
+    CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
+        gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+    printf("cuTensorMapEncodeTiled UINT16: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    if (res != CUDA_SUCCESS) {
+        const char* err_str = nullptr;
+        cuGetErrorString(res, &err_str);
+        printf("Error: %s\n", err_str ? err_str : "unknown");
+    }
+
+    printf("\n--- Test with BFLOAT16 ---\n");
+    res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, d_data,
+        gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+    printf("cuTensorMapEncodeTiled BFLOAT16: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+    if (res != CUDA_SUCCESS) {
+        const char* err_str = nullptr;
+        cuGetErrorString(res, &err_str);
+        printf("Error: %s\n", err_str ? err_str : "unknown");
+    }
+
+    // Try with cuMemAlloc instead of cudaMalloc
+    printf("\n--- Test with cuMemAlloc ---\n");
+    CUdeviceptr cu_data;
+    CUresult cu_alloc = cuMemAlloc(&cu_data, 128 * 16 * sizeof(bf16_t));
+    printf("cuMemAlloc: err=%d\n", (int)cu_alloc);
+    if (cu_alloc == CUDA_SUCCESS) {
+        res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, (void*)cu_data,
+            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+        printf("cuTensorMapEncodeTiled with cuMemAlloc: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
+        if (res != CUDA_SUCCESS) {
+            const char* err_str = nullptr;
+            cuGetErrorString(res, &err_str);
+            printf("Error: %s\n", err_str ? err_str : "unknown");
+        }
+        cuMemFree(cu_data);
+    }
+
+    cudaFree(d_data);
+    cuDevicePrimaryCtxRelease(device);
+    printf("Done.\n");
+    return 0;
+}