Files
nvfp4-megamoe-kernel/tests/unit/test_tma_desc_debug3.cu

90 lines
3.4 KiB
Plaintext

/**
* Debug TMA descriptor creation — fix context issues.
*/
#include <cuda_runtime.h>
#include <cuda.h>
#include <cstdio>
#include <cstring>
typedef unsigned short bf16_t;
int main() {
printf("=== TMA Descriptor Debug (context fix) ===\n");
// Method 1: Use runtime API to create context, then get it for driver API
cudaFree(0);
// Check if driver API can see the runtime context
CUcontext rt_ctx = nullptr;
CUresult r1 = cuCtxGetCurrent(&rt_ctx);
printf("cuCtxGetCurrent after cudaFree(0): err=%d, ctx=%p\n", (int)r1, (void*)rt_ctx);
// Method 2: Create a primary context via driver API
CUdevice device;
cuDeviceGet(&device, 0);
CUcontext primary_ctx;
CUresult r2 = cuDevicePrimaryCtxRetain(&primary_ctx, device);
printf("cuDevicePrimaryCtxRetain: err=%d, ctx=%p\n", (int)r2, (void*)primary_ctx);
// Set the primary context current
CUresult r3 = cuCtxSetCurrent(primary_ctx);
printf("cuCtxSetCurrent: err=%d\n", (int)r3);
// Now try cuTensorMapEncodeTiled
bf16_t* d_data;
cudaMalloc(&d_data, 128 * 16 * sizeof(bf16_t));
printf("cudaMalloc: %s\n", cudaGetErrorString(cudaGetLastError()));
uint64_t gdim[] = {16, 128};
uint64_t gstr[] = {1, 16};
uint32_t tdim[] = {16, 128};
uint32_t tstr[] = {1, 16};
CUtensorMap desc;
printf("\n--- Test with UINT16 ---\n");
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
printf("cuTensorMapEncodeTiled UINT16: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
if (res != CUDA_SUCCESS) {
const char* err_str = nullptr;
cuGetErrorString(res, &err_str);
printf("Error: %s\n", err_str ? err_str : "unknown");
}
printf("\n--- Test with BFLOAT16 ---\n");
res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, d_data,
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
printf("cuTensorMapEncodeTiled BFLOAT16: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
if (res != CUDA_SUCCESS) {
const char* err_str = nullptr;
cuGetErrorString(res, &err_str);
printf("Error: %s\n", err_str ? err_str : "unknown");
}
// Try with cuMemAlloc instead of cudaMalloc
printf("\n--- Test with cuMemAlloc ---\n");
CUdeviceptr cu_data;
CUresult cu_alloc = cuMemAlloc(&cu_data, 128 * 16 * sizeof(bf16_t));
printf("cuMemAlloc: err=%d\n", (int)cu_alloc);
if (cu_alloc == CUDA_SUCCESS) {
res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, (void*)cu_data,
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
printf("cuTensorMapEncodeTiled with cuMemAlloc: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
if (res != CUDA_SUCCESS) {
const char* err_str = nullptr;
cuGetErrorString(res, &err_str);
printf("Error: %s\n", err_str ? err_str : "unknown");
}
cuMemFree(cu_data);
}
cudaFree(d_data);
cuDevicePrimaryCtxRelease(device);
printf("Done.\n");
return 0;
}