Files
nvfp4-megamoe-kernel/tests/unit/test_tma_debug.cu

61 lines
2.4 KiB
Plaintext

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdio>
// Stand-in for a 16-bit bfloat16 element; only used to type/size the device buffer.
typedef unsigned short bf16_t;

// Encodes one tiled tensor map (no interleave, no swizzle, no L2 promotion,
// no OOB fill) and prints "<label>: result=<code>" to stdout.
//
//   label  text printed in front of the result code
//   dtype  element type handed to the encoder
//   rank   number of tensor dimensions
//   base   device pointer to the tensor data
//   gdim   global extents, `rank` entries, innermost dimension first
//   gstr   global strides in bytes, `rank - 1` entries (innermost is implicit)
//   box    tile (box) extents, `rank` entries
//   estr   element strides (iteration step per dimension), `rank` entries
//
// Returns true when the driver reports CUDA_SUCCESS; otherwise also writes
// the symbolic error name to stderr and returns false.
static bool encode_and_report(const char* label, CUtensorMapDataType dtype,
                              cuuint32_t rank, void* base,
                              const cuuint64_t* gdim, const cuuint64_t* gstr,
                              const cuuint32_t* box, const cuuint32_t* estr) {
    CUtensorMap tma;
    const CUresult r = cuTensorMapEncodeTiled(
        &tma, dtype, rank, base, gdim, gstr, box, estr,
        CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    printf("%s: result=%d\n", label, (int)r);
    if (r == CUDA_SUCCESS) {
        return true;
    }
    const char* name = nullptr;
    if (cuGetErrorName(r, &name) != CUDA_SUCCESS || name == nullptr) {
        name = "unknown";
    }
    fprintf(stderr, "  -> %s\n", name);
    return false;
}

// Probes cuTensorMapEncodeTiled with a handful of parameter combinations over
// one 128x16 16-bit buffer and prints each result code.
// Exit status: 0 when every encode succeeds, 1 otherwise.
int main() {
    printf("=== CUtensorMap parameter debug (fixed strides) ===\n");

    bf16_t* d_data = nullptr;
    const size_t bytes = 128 * 16 * sizeof(bf16_t);
    const cudaError_t alloc_err = cudaMalloc(&d_data, bytes);
    if (alloc_err != cudaSuccess) {
        // Without a valid device pointer none of the encodes below are meaningful.
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(alloc_err));
        return 1;
    }

    int failures = 0;

    // globalStrides has tensorRank-1 elements (innermost stride is implicit).
    // elementStrides is the per-dimension iteration step, NOT the row pitch in
    // elements: the driver requires each entry to be non-zero and <= 8, so the
    // dense case uses 1 in every dimension.

    // Test 1: BF16 2D, gstr=[row_stride_bytes]
    {
        const cuuint64_t gdim[] = {16, 128};
        const cuuint64_t gstr[] = {32};  // 1 stride: row stride = 16 cols * 2 bytes
        const cuuint32_t tdim[] = {16, 128};
        const cuuint32_t tstr[] = {1, 1};
        if (!encode_and_report("BF16 (16,128) gstr=[32]", CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
                               2, d_data, gdim, gstr, tdim, tstr)) {
            ++failures;
        }
    }

    // Test 2: UINT16 2D
    {
        const cuuint64_t gdim[] = {16, 128};
        const cuuint64_t gstr[] = {32};
        const cuuint32_t tdim[] = {16, 128};
        const cuuint32_t tstr[] = {1, 1};
        if (!encode_and_report("UINT16 (16,128) gstr=[32]", CU_TENSOR_MAP_DATA_TYPE_UINT16,
                               2, d_data, gdim, gstr, tdim, tstr)) {
            ++failures;
        }
    }

    // Test 3: Smaller tile (16, 8)
    {
        const cuuint64_t gdim[] = {16, 128};
        const cuuint64_t gstr[] = {32};
        const cuuint32_t tdim[] = {16, 8};
        const cuuint32_t tstr[] = {1, 1};
        if (!encode_and_report("BF16 tile(16,8)", CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
                               2, d_data, gdim, gstr, tdim, tstr)) {
            ++failures;
        }
    }

    // Test 4: the same buffer viewed as a 1D byte array (4096 bytes).
    {
        const cuuint64_t gdim[] = {4096};
        // NOTE(review): with rank 1 there are rank-1 = 0 stride entries, so this
        // value is presumably never read by the encoder - confirm against the docs.
        const cuuint64_t gstr[] = {2};
        const cuuint32_t tdim[] = {256};
        const cuuint32_t tstr[] = {1};
        if (!encode_and_report("UINT8 1D", CU_TENSOR_MAP_DATA_TYPE_UINT8,
                               1, d_data, gdim, gstr, tdim, tstr)) {
            ++failures;
        }
    }

    const cudaError_t free_err = cudaFree(d_data);
    if (free_err != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(free_err));
        ++failures;
    }

    return failures == 0 ? 0 : 1;
}