Files
nvfp4-megamoe-kernel/tests/unit/test_tma_desc_debug2.cu

68 lines
2.5 KiB
Plaintext

/**
* Debug TMA descriptor creation — try both driver and runtime API approaches.
*/
#include <cuda_runtime.h>
#include <cuda.h>
#include <cstdio>
#include <cstring>
typedef unsigned short bf16_t;
int main() {
printf("=== TMA Descriptor Debug ===\n");
// Force CUDA context creation via runtime API
cudaFree(0); // This ensures a context is active
// Check driver API version
int driver_version = 0;
cuDriverGetVersion(&driver_version);
printf("CUDA driver version: %d\n", driver_version);
// Get current context
CUcontext ctx;
CUresult ctx_res = cuCtxGetCurrent(&ctx);
printf("cuCtxGetCurrent: %d, ctx=%p\n", (int)ctx_res, (void*)ctx);
// Allocate with cudaMalloc
bf16_t* d_data;
cudaError_t alloc_res = cudaMalloc(&d_data, 128 * 16 * sizeof(bf16_t));
printf("cudaMalloc: %s (err=%d)\n", cudaGetErrorString(alloc_res), (int)alloc_res);
// Try cuTensorMapEncodeTiled
uint64_t gdim[] = {16, 128};
uint64_t gstr[] = {1, 16};
uint32_t tdim[] = {16, 128};
uint32_t tstr[] = {1, 16};
CUtensorMap desc;
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
printf("cuTensorMapEncodeTiled (128x16): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
// Try with cuMemAlloc instead
CUdeviceptr cu_data;
CUresult cu_alloc = cuMemAlloc(&cu_data, 128 * 16 * sizeof(bf16_t));
printf("cuMemAlloc: %s (err=%d)\n", cu_alloc==CUDA_SUCCESS?"OK":"FAIL", (int)cu_alloc);
if (cu_alloc == CUDA_SUCCESS) {
res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, (void*)cu_data,
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
printf("cuTensorMapEncodeTiled with cuMemAlloc ptr: %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
cuMemFree(cu_data);
}
// Check if the pointer is 128B aligned
printf("d_data ptr: %p, 128B aligned: %s\n", (void*)d_data, ((uintptr_t)d_data % 128 == 0) ? "yes" : "no");
// Check the actual error description
const char* err_str = nullptr;
cuGetErrorString(res, &err_str);
printf("Error string: %s\n", err_str ? err_str : "unknown");
cudaFree(d_data);
printf("Done.\n");
return 0;
}