// nvfp4-megamoe-kernel/tests/unit/test_tma_desc_debug3.cu

/**
 * Debug TMA descriptor creation — fix context issues.
 */

#include <cuda_runtime.h>
#include <cuda.h>
#include <cstdio>
#include <cstring>

// Host-side element type for the test buffers: a plain unsigned short.
// In this file it is only used for sizeof() and as a pointer type.
using bf16_t = unsigned short;

// Shape of the dense row-major test tensor: kRows rows of kCols elements.
static constexpr unsigned kCols = 16;
static constexpr unsigned kRows = 128;
static constexpr size_t kTensorBytes =
    static_cast<size_t>(kRows) * kCols * sizeof(bf16_t);

/**
 * Encode a rank-2 tiled tensor map covering the whole kRows x kCols tensor
 * that starts at `base`.
 *
 * Argument layout required by cuTensorMapEncodeTiled (see the Driver API
 * reference):
 *   - globalDim / boxDim / elementStrides have `rank` entries, innermost first.
 *   - globalStrides has `rank - 1` entries, expressed in BYTES, each a
 *     multiple of 16. The innermost stride is implicit, so the single entry
 *     here is the row pitch.
 *   - every elementStrides entry must be in [1, 8]; 1 means "dense".
 *
 * @param desc   Output descriptor.
 * @param dtype  Element data type to encode (must be a 2-byte type to match
 *               the byte stride computed from sizeof(bf16_t)).
 * @param base   Device address of element (0, 0).
 * @return       The CUresult reported by cuTensorMapEncodeTiled.
 */
static CUresult encode_full_tile(CUtensorMap* desc, CUtensorMapDataType dtype, void* base) {
    const cuuint64_t global_dim[2]    = {kCols, kRows};
    const cuuint64_t global_stride[1] = {kCols * sizeof(bf16_t)};  // row pitch in bytes (32)
    const cuuint32_t box_dim[2]       = {kCols, kRows};
    const cuuint32_t elem_stride[2]   = {1, 1};

    return cuTensorMapEncodeTiled(desc, dtype, 2, base,
        global_dim, global_stride, box_dim, elem_stride,
        CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
}

/**
 * Print the outcome of one encode attempt, plus the driver's error string
 * when it failed.
 *
 * @param label  Text inserted after "cuTensorMapEncodeTiled " in the output.
 * @param res    Result code of the encode call.
 * @return       true when `res` is CUDA_SUCCESS.
 */
static bool report_encode(const char* label, CUresult res) {
    const bool ok = (res == CUDA_SUCCESS);
    printf("cuTensorMapEncodeTiled %s: %s (err=%d)\n", label, ok ? "OK" : "FAIL", (int)res);
    if (!ok) {
        const char* err_str = nullptr;
        cuGetErrorString(res, &err_str);
        printf("Error: %s\n", err_str ? err_str : "unknown");
    }
    return ok;
}

/**
 * Debug driver: sets up a context through the runtime API, makes the primary
 * context current through the driver API, then tries cuTensorMapEncodeTiled
 * on memory from cudaMalloc (as UINT16 and BFLOAT16) and from cuMemAlloc.
 *
 * @return 0 when every checked step succeeded, 1 otherwise.
 */
int main() {
    printf("=== TMA Descriptor Debug (context fix) ===\n");
    int failures = 0;

    // Method 1: Use runtime API to create context, then get it for driver API
    cudaError_t init_err = cudaFree(0);
    if (init_err != cudaSuccess) {
        printf("cudaFree(0) failed: %s\n", cudaGetErrorString(init_err));
        return 1;
    }

    // Check if driver API can see the runtime context (informational only)
    CUcontext rt_ctx = nullptr;
    CUresult r1 = cuCtxGetCurrent(&rt_ctx);
    printf("cuCtxGetCurrent after cudaFree(0): err=%d, ctx=%p\n", (int)r1, (void*)rt_ctx);

    // Method 2: Retain the primary context via driver API
    CUdevice device;
    CUresult dev_res = cuDeviceGet(&device, 0);
    if (dev_res != CUDA_SUCCESS) {
        printf("cuDeviceGet: err=%d\n", (int)dev_res);
        return 1;
    }

    CUcontext primary_ctx = nullptr;
    CUresult r2 = cuDevicePrimaryCtxRetain(&primary_ctx, device);
    printf("cuDevicePrimaryCtxRetain: err=%d, ctx=%p\n", (int)r2, (void*)primary_ctx);
    if (r2 != CUDA_SUCCESS) {
        // Nothing was retained, so there is nothing to release or make current.
        return 1;
    }

    // Set the primary context current
    CUresult r3 = cuCtxSetCurrent(primary_ctx);
    printf("cuCtxSetCurrent: err=%d\n", (int)r3);
    if (r3 != CUDA_SUCCESS) {
        ++failures;
    }

    CUtensorMap desc;

    // Encode against a runtime-API allocation.
    bf16_t* d_data = nullptr;
    cudaError_t malloc_err = cudaMalloc(&d_data, kTensorBytes);
    printf("cudaMalloc: %s\n", cudaGetErrorString(malloc_err));
    if (malloc_err == cudaSuccess) {
        printf("\n--- Test with UINT16 ---\n");
        if (!report_encode("UINT16",
                encode_full_tile(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, d_data))) {
            ++failures;
        }

        printf("\n--- Test with BFLOAT16 ---\n");
        if (!report_encode("BFLOAT16",
                encode_full_tile(&desc, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_data))) {
            ++failures;
        }
    } else {
        // Without a valid buffer the encode calls would only test a bad pointer.
        d_data = nullptr;
        ++failures;
    }

    // Try with cuMemAlloc instead of cudaMalloc
    printf("\n--- Test with cuMemAlloc ---\n");
    CUdeviceptr cu_data = 0;
    CUresult cu_alloc = cuMemAlloc(&cu_data, kTensorBytes);
    printf("cuMemAlloc: err=%d\n", (int)cu_alloc);
    if (cu_alloc == CUDA_SUCCESS) {
        if (!report_encode("with cuMemAlloc",
                encode_full_tile(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, (void*)cu_data))) {
            ++failures;
        }
        cuMemFree(cu_data);
    } else {
        ++failures;
    }

    cudaFree(d_data);  // no-op when d_data is nullptr
    cuDevicePrimaryCtxRelease(device);
    printf("Done.\n");
    return failures == 0 ? 0 : 1;
}