From 85cd95e609aaa64091b6dab6d0b2c38fb8ef7065 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 29 May 2026 04:45:54 +0000 Subject: [PATCH] debug: TMA context fix test --- tests/unit/test_tma_desc_debug3.cu | 89 ++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 tests/unit/test_tma_desc_debug3.cu diff --git a/tests/unit/test_tma_desc_debug3.cu b/tests/unit/test_tma_desc_debug3.cu new file mode 100644 index 00000000..f2e52689 --- /dev/null +++ b/tests/unit/test_tma_desc_debug3.cu @@ -0,0 +1,89 @@ +/** + * Debug TMA descriptor creation — fix context issues. + */ + +#include +#include +#include +#include + +typedef unsigned short bf16_t; + +int main() { + printf("=== TMA Descriptor Debug (context fix) ===\n"); + + // Method 1: Use runtime API to create context, then get it for driver API + cudaFree(0); + + // Check if driver API can see the runtime context + CUcontext rt_ctx = nullptr; + CUresult r1 = cuCtxGetCurrent(&rt_ctx); + printf("cuCtxGetCurrent after cudaFree(0): err=%d, ctx=%p\n", (int)r1, (void*)rt_ctx); + + // Method 2: Create a primary context via driver API + CUdevice device; + cuDeviceGet(&device, 0); + CUcontext primary_ctx; + CUresult r2 = cuDevicePrimaryCtxRetain(&primary_ctx, device); + printf("cuDevicePrimaryCtxRetain: err=%d, ctx=%p\n", (int)r2, (void*)primary_ctx); + + // Set the primary context current + CUresult r3 = cuCtxSetCurrent(primary_ctx); + printf("cuCtxSetCurrent: err=%d\n", (int)r3); + + // Now try cuTensorMapEncodeTiled + bf16_t* d_data; + cudaMalloc(&d_data, 128 * 16 * sizeof(bf16_t)); + printf("cudaMalloc: %s\n", cudaGetErrorString(cudaGetLastError())); + + uint64_t gdim[] = {16, 128}; + uint64_t gstr[] = {1, 16}; + uint32_t tdim[] = {16, 128}; + uint32_t tstr[] = {1, 16}; + CUtensorMap desc; + + printf("\n--- Test with UINT16 ---\n"); + CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("cuTensorMapEncodeTiled UINT16: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + if (res != CUDA_SUCCESS) { + const char* err_str = nullptr; + cuGetErrorString(res, &err_str); + printf("Error: %s\n", err_str ? err_str : "unknown"); + } + + printf("\n--- Test with BFLOAT16 ---\n"); + res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, d_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("cuTensorMapEncodeTiled BFLOAT16: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + if (res != CUDA_SUCCESS) { + const char* err_str = nullptr; + cuGetErrorString(res, &err_str); + printf("Error: %s\n", err_str ? err_str : "unknown"); + } + + // Try with cuMemAlloc instead of cudaMalloc + printf("\n--- Test with cuMemAlloc ---\n"); + CUdeviceptr cu_data; + CUresult cu_alloc = cuMemAlloc(&cu_data, 128 * 16 * sizeof(bf16_t)); + printf("cuMemAlloc: err=%d\n", (int)cu_alloc); + if (cu_alloc == CUDA_SUCCESS) { + res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, (void*)cu_data, + gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + printf("cuTensorMapEncodeTiled with cuMemAlloc: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); + if (res != CUDA_SUCCESS) { + const char* err_str = nullptr; + cuGetErrorString(res, &err_str); + printf("Error: %s\n", err_str ? err_str : "unknown"); + } + cuMemFree(cu_data); + } + + cudaFree(d_data); + cuDevicePrimaryCtxRelease(device); + printf("Done.\n"); + return 0; +}