/** * Debug TMA descriptor creation — try both driver and runtime API approaches. */ #include #include #include #include typedef unsigned short bf16_t; int main() { printf("=== TMA Descriptor Debug ===\n"); // Force CUDA context creation via runtime API cudaFree(0); // This ensures a context is active // Check driver API version int driver_version = 0; cuDriverGetVersion(&driver_version); printf("CUDA driver version: %d\n", driver_version); // Get current context CUcontext ctx; CUresult ctx_res = cuCtxGetCurrent(&ctx); printf("cuCtxGetCurrent: %d, ctx=%p\n", (int)ctx_res, (void*)ctx); // Allocate with cudaMalloc bf16_t* d_data; cudaError_t alloc_res = cudaMalloc(&d_data, 128 * 16 * sizeof(bf16_t)); printf("cudaMalloc: %s (err=%d)\n", cudaGetErrorString(alloc_res), (int)alloc_res); // Try cuTensorMapEncodeTiled uint64_t gdim[] = {16, 128}; uint64_t gstr[] = {1, 16}; uint32_t tdim[] = {16, 128}; uint32_t tstr[] = {1, 16}; CUtensorMap desc; CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("cuTensorMapEncodeTiled (128x16): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); // Try with cuMemAlloc instead CUdeviceptr cu_data; CUresult cu_alloc = cuMemAlloc(&cu_data, 128 * 16 * sizeof(bf16_t)); printf("cuMemAlloc: %s (err=%d)\n", cu_alloc==CUDA_SUCCESS?"OK":"FAIL", (int)cu_alloc); if (cu_alloc == CUDA_SUCCESS) { res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, (void*)cu_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); printf("cuTensorMapEncodeTiled with cuMemAlloc ptr: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res); cuMemFree(cu_data); } // Check if the pointer is 128B aligned printf("d_data ptr: %p, 128B aligned: %s\n", (void*)d_data, ((uintptr_t)d_data % 128 == 0) ? "yes" : "no"); // Check the actual error description const char* err_str = nullptr; cuGetErrorString(res, &err_str); printf("Error string: %s\n", err_str ? err_str : "unknown"); cudaFree(d_data); printf("Done.\n"); return 0; }