61 lines
2.4 KiB
Plaintext
61 lines
2.4 KiB
Plaintext
#include <cuda.h>
|
|
#include <cuda_runtime.h>
|
|
#include <cstdio>
|
|
|
|
// Raw storage type for the elements of the device test buffer. Only its size
// is used by this program (presumably standing in for a 16-bit bfloat16
// value -- no arithmetic is performed on it here).
using bf16_t = unsigned short;
|
|
|
|
int main() {
|
|
printf("=== CUtensorMap parameter debug (fixed strides) ===\n");
|
|
bf16_t* d_data;
|
|
cudaMalloc(&d_data, 128*16*2);
|
|
|
|
// globalStrides has tensorRank-1 elements (innermost stride is implicit)
|
|
|
|
// Test 1: BF16 2D, gstr=[row_stride_bytes]
|
|
{
|
|
uint64_t gdim[] = {16, 128};
|
|
uint64_t gstr[] = {32}; // 1 stride: row stride = 16 cols * 2 bytes
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 16};
|
|
CUtensorMap tma;
|
|
CUresult r = cuTensorMapEncodeTiled(&tma, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("BF16 (16,128) gstr=[32]: result=%d\n", r);
|
|
}
|
|
|
|
// Test 2: UINT16 2D
|
|
{
|
|
uint64_t gdim[] = {16, 128};
|
|
uint64_t gstr[] = {32};
|
|
uint32_t tdim[] = {16, 128};
|
|
uint32_t tstr[] = {1, 16};
|
|
CUtensorMap tma;
|
|
CUresult r = cuTensorMapEncodeTiled(&tma, CU_TENSOR_MAP_DATA_TYPE_UINT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("UINT16 (16,128) gstr=[32]: result=%d\n", r);
|
|
}
|
|
|
|
// Test 3: Smaller tile (16, 8)
|
|
{
|
|
uint64_t gdim[] = {16, 128};
|
|
uint64_t gstr[] = {32};
|
|
uint32_t tdim[] = {16, 8};
|
|
uint32_t tstr[] = {1, 16};
|
|
CUtensorMap tma;
|
|
CUresult r = cuTensorMapEncodeTiled(&tma, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("BF16 tile(16,8): result=%d\n", r);
|
|
}
|
|
|
|
// Test 4: 1D BF16 (as byte array)
|
|
{
|
|
uint64_t gdim[] = {4096};
|
|
uint64_t gstr[] = {2};
|
|
uint32_t tdim[] = {256};
|
|
uint32_t tstr[] = {1};
|
|
CUtensorMap tma;
|
|
CUresult r = cuTensorMapEncodeTiled(&tma, CU_TENSOR_MAP_DATA_TYPE_UINT8, 1, d_data, gdim, gstr, tdim, tstr, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
printf("UINT8 1D: result=%d\n", r);
|
|
}
|
|
|
|
cudaFree(d_data);
|
|
return 0;
|
|
}
|