From 76c82ebdcd7a39ca3f61b9077d7cbf6b479705d9 Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Fri, 29 May 2026 04:45:06 +0000
Subject: [PATCH] debug: detailed TMA descriptor debug test

---
 tests/unit/test_tma_desc_debug2.cu | 123 ++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 tests/unit/test_tma_desc_debug2.cu

diff --git a/tests/unit/test_tma_desc_debug2.cu b/tests/unit/test_tma_desc_debug2.cu
new file mode 100644
index 00000000..3e9b95c6
--- /dev/null
+++ b/tests/unit/test_tma_desc_debug2.cu
@@ -0,0 +1,123 @@
+/**
+ * Debug TMA descriptor creation — try both driver and runtime API approaches.
+ *
+ * Encodes a 2D tiled tensor map over a 128-row x 16-column buffer of 16-bit
+ * elements, once for a cudaMalloc pointer and once for a cuMemAlloc pointer,
+ * and prints the driver status of every step. Exits non-zero if any step
+ * fails so a test runner can see the result.
+ *
+ * Requires a CUDA toolkit/driver that provides cuTensorMapEncodeTiled.
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdint>
+#include <cstdio>
+
+typedef unsigned short bf16_t;
+
+// Buffer shape: kRows rows of kCols contiguous 16-bit elements.
+static const cuuint64_t kCols = 16;
+static const cuuint64_t kRows = 128;
+
+// Returns the driver's description of `code`, or "unknown" if it has none.
+static const char* driver_error_string(CUresult code) {
+    const char* text = nullptr;
+    if (cuGetErrorString(code, &text) != CUDA_SUCCESS || text == nullptr) {
+        return "unknown";
+    }
+    return text;
+}
+
+// Encodes a descriptor for the kRows x kCols buffer at `base`, prints the
+// outcome under `label`, and returns the driver status.
+static CUresult encode_and_report(const char* label, void* base) {
+    // Dimensions are listed innermost (fastest-varying) first.
+    const cuuint64_t global_dim[2] = {kCols, kRows};
+    // One entry per dimension except the innermost, in BYTES, and each a
+    // multiple of 16: here, the pitch of one row.
+    const cuuint64_t global_strides[1] = {kCols * sizeof(bf16_t)};
+    // Box (tile) extent in elements; each entry must be <= 256 and the
+    // innermost extent must cover a multiple of 16 bytes.
+    const cuuint32_t box_dim[2] = {(cuuint32_t)kCols, (cuuint32_t)kRows};
+    // Traversal stride in elements; each entry must be in [1, 8].
+    const cuuint32_t element_strides[2] = {1, 1};
+
+    CUtensorMap desc;
+    const CUresult status = cuTensorMapEncodeTiled(
+        &desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, base,
+        global_dim, global_strides, box_dim, element_strides,
+        CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+    printf("%s: %s (err=%d, %s)\n", label,
+           status == CUDA_SUCCESS ? "OK" : "FAIL", (int)status,
+           driver_error_string(status));
+    return status;
+}
+
+int main() {
+    printf("=== TMA Descriptor Debug ===\n");
+    int failures = 0;
+    const size_t bytes = (size_t)(kRows * kCols) * sizeof(bf16_t);
+
+    // Force CUDA context creation via the runtime API; the driver API calls
+    // below need a current context.
+    const cudaError_t init_res = cudaFree(0);
+    if (init_res != cudaSuccess) {
+        printf("cudaFree(0): %s (err=%d)\n", cudaGetErrorString(init_res),
+               (int)init_res);
+        return 1;
+    }
+
+    // Check driver API version
+    int driver_version = 0;
+    if (cuDriverGetVersion(&driver_version) != CUDA_SUCCESS) {
+        ++failures;
+    }
+    printf("CUDA driver version: %d\n", driver_version);
+
+    // Get current context
+    CUcontext ctx = nullptr;
+    const CUresult ctx_res = cuCtxGetCurrent(&ctx);
+    printf("cuCtxGetCurrent: %d, ctx=%p\n", (int)ctx_res, (void*)ctx);
+    if (ctx_res != CUDA_SUCCESS || ctx == nullptr) {
+        ++failures;
+    }
+
+    // Allocate with cudaMalloc and encode a descriptor over that pointer
+    bf16_t* d_data = nullptr;
+    const cudaError_t alloc_res = cudaMalloc(&d_data, bytes);
+    printf("cudaMalloc: %s (err=%d)\n", cudaGetErrorString(alloc_res),
+           (int)alloc_res);
+    if (alloc_res == cudaSuccess) {
+        // Check if the pointer is 128B aligned
+        printf("d_data ptr: %p, 128B aligned: %s\n", (void*)d_data,
+               ((uintptr_t)d_data % 128 == 0) ? "yes" : "no");
+        if (encode_and_report("cuTensorMapEncodeTiled (128x16)", d_data) !=
+            CUDA_SUCCESS) {
+            ++failures;
+        }
+        cudaFree(d_data);
+    } else {
+        ++failures;
+    }
+
+    // Try with cuMemAlloc instead
+    CUdeviceptr cu_data = 0;
+    const CUresult cu_alloc = cuMemAlloc(&cu_data, bytes);
+    printf("cuMemAlloc: %s (err=%d)\n",
+           cu_alloc == CUDA_SUCCESS ? "OK" : "FAIL", (int)cu_alloc);
+    if (cu_alloc == CUDA_SUCCESS) {
+        if (encode_and_report("cuTensorMapEncodeTiled with cuMemAlloc ptr",
+                              (void*)cu_data) != CUDA_SUCCESS) {
+            ++failures;
+        }
+        cuMemFree(cu_data);
+    } else {
+        ++failures;
+    }
+
+    printf("Done.\n");
+    return failures == 0 ? 0 : 1;
+}