129 lines
5.6 KiB
Plaintext
129 lines
5.6 KiB
Plaintext
/**
|
|
* Minimal TMA descriptor creation test.
|
|
* Tests various tile dimensions to find what cuTensorMapEncodeTiled accepts.
|
|
*/
|
|
|
|
#include <cuda_runtime.h>
|
|
#include <cuda.h>
|
|
#include <cstdio>
|
|
|
|
typedef unsigned short bf16_t;
|
|
|
|
int main() {
|
|
// Initialize CUDA driver API — REQUIRED before any cuTensorMap calls
|
|
cuInit(0);
|
|
|
|
printf("=== TMA Descriptor Creation Test ===\n");
|
|
|
|
bf16_t* d_data;
|
|
cudaMalloc(&d_data, 128 * 64 * sizeof(bf16_t)); // (128, 64) BF16
|
|
|
|
// Test 1: (128, 16) global, (16, 128) tile — same as working test
|
|
{
|
|
uint64_t gdim[] = {16, 128};
|
|
uint64_t gstr[] = {1, 16};
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 16};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 1 (128x16 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
// Test 2: (64, 128) global, (16, 128) tile, tstr=[1,64]
|
|
{
|
|
uint64_t gdim[] = {64, 128};
|
|
uint64_t gstr[] = {1, 64};
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 64};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 2 (128x64 global, 16x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
// Test 3: (64, 128) global, (16, 128) tile, tstr=[1,16] (tile-local stride)
|
|
{
|
|
uint64_t gdim[] = {64, 128};
|
|
uint64_t gstr[] = {1, 64};
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 16};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 3 (128x64 global, 16x128 tile, tstr=[1,16]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
// Test 4: (64, 128) global, (16, 128) tile — try element type UINT8
|
|
{
|
|
uint64_t gdim[] = {64, 128};
|
|
uint64_t gstr[] = {1, 64};
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 64};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT8, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 4 (UINT8, 128x64, 16x128, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
// Test 5: Same as working test but with global (64, 128)
|
|
{
|
|
uint64_t gdim[] = {64, 128};
|
|
uint64_t gstr[] = {1, 64};
|
|
uint32_t tdim[] = {64, 128};
|
|
uint32_t tstr[] = {1, 64};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 5 (128x64 global, 64x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
// Test 6: (64, 128) global, (32, 128) tile (tile cols = 32)
|
|
{
|
|
uint64_t gdim[] = {64, 128};
|
|
uint64_t gstr[] = {1, 64};
|
|
uint32_t tdim[] = {32, 128};
|
|
uint32_t tstr[] = {1, 64};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 6 (128x64 global, 32x128 tile, tstr=[1,64]): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
// Test 7: (64, 128) global, (16, 128) tile, UINT16, tstr=[1,64], SWIZZLE_128B
|
|
{
|
|
uint64_t gdim[] = {64, 128};
|
|
uint64_t gstr[] = {1, 64};
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 64};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 7 (128x64, 16x128, SW128): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
// Test 8: (16, 128) global, (16, 128) tile — EXACT working test dimensions
|
|
{
|
|
uint64_t gdim[] = {16, 128};
|
|
uint64_t gstr[] = {1, 16};
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 16};
|
|
CUtensorMap desc;
|
|
CUresult res = cuTensorMapEncodeTiled(&desc, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data,
|
|
gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("Test 8 (128x16 global, 16x128 tile, tstr=[1,16] — exact working): %s (err=%d)\n", res==CUDA_SUCCESS?"OK":"FAIL", (int)res);
|
|
}
|
|
|
|
cudaFree(d_data);
|
|
printf("Done.\n");
|
|
return 0;
|
|
}
|