/**
 * Minimal TMA descriptor creation test.
 * Tests various tile dimensions to find what cuTensorMapEncodeTiled accepts.
 */

#include <cuda_runtime.h>
#include <cuda.h>
#include <cstdio>

typedef unsigned short bf16_t;

int main() {
    // Initialize CUDA driver API — REQUIRED before any cuTensorMap calls
    cuInit(0);

    printf("=== TMA Descriptor Creation Test ===\n");

    bf16_t* d_data;
    cudaMalloc(&d_data, 128 * 64 * sizeof(bf16_t));  // (128, 64) BF16

    // Test 1: (128, 16) global, (16, 128) tile — same as working test
    {
        uint64_t gdim[] = {16, 128};
        uint64_t gstr[] = {1, 16};
        uint32_t tdim[] = {16, 128};
        uint32_t tstr[] = {1, 16};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 1 (128x16 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    // Test 2: (64, 128) global, (16, 128) tile, tstr=[1,64]
    {
        uint64_t gdim[] = {64, 128};
        uint64_t gstr[] = {1, 64};
        uint32_t tdim[] = {16, 128};
        uint32_t tstr[] = {1, 64};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 2 (128x64 global, 16x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    // Test 3: (64, 128) global, (16, 128) tile, tstr=[1,16] (tile-local stride)
    {
        uint64_t gdim[] = {64, 128};
        uint64_t gstr[] = {1, 64};
        uint32_t tdim[] = {16, 128};
        uint32_t tstr[] = {1, 16};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 3 (128x64 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    // Test 4: (64, 128) global, (16, 128) tile — try element type UINT8
    {
        uint64_t gdim[] = {64, 128};
        uint64_t gstr[] = {1, 64};
        uint32_t tdim[] = {16, 128};
        uint32_t tstr[] = {1, 64};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT8, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 4 (UINT8, 128x64, 16x128, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    // Test 5: Same as working test but with global (64, 128)
    {
        uint64_t gdim[] = {64, 128};
        uint64_t gstr[] = {1, 64};
        uint32_t tdim[] = {64, 128};
        uint32_t tstr[] = {1, 64};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 5 (128x64 global, 64x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    // Test 6: (64, 128) global, (32, 128) tile (tile cols = 32)
    {
        uint64_t gdim[] = {64, 128};
        uint64_t gstr[] = {1, 64};
        uint32_t tdim[] = {32, 128};
        uint32_t tstr[] = {1, 64};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 6 (128x64 global, 32x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    // Test 7: (64, 128) global, (16, 128) tile, UINT16, tstr=[1,64], SWIZZLE_128B
    {
        uint64_t gdim[] = {64, 128};
        uint64_t gstr[] = {1, 64};
        uint32_t tdim[] = {16, 128};
        uint32_t tstr[] = {1, 64};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 7 (128x64, 16x128, SW128): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    // Test 8: (16, 128) global, (16, 128) tile — EXACT working test dimensions
    {
        uint64_t gdim[] = {16, 128};
        uint64_t gstr[] = {1, 16};
        uint32_t tdim[] = {16, 128};
        uint32_t tstr[] = {1, 16};
        CUtensorMap desc;
        CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
            gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("Test 8 (128x16 global, 16x128 tile, tstr=[1,16] — exact working): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
    }

    cudaFree(d_data);
    printf("Done.\n");
    return 0;
}