// Listing metadata carried over from the file viewer: 35 lines, 1.4 KiB, displayed as "Plaintext".
#include <cuda.h>
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>

// Raw 16-bit storage used on the host as a stand-in for one bfloat16 element.
// Only its size matters in this file: it is never interpreted as a number here.
typedef unsigned short bf16_t;

// The byte-size and stride arithmetic in this file assumes 2-byte elements.
static_assert(sizeof(bf16_t) == 2, "bf16_t must be exactly 2 bytes wide");

// Host-side probe for cuTensorMapEncodeTiled.
//
// Allocates a small device buffer, rounds its address up to a 256-byte
// boundary, and encodes a 2D tiled tensor map over a 128-row x 16-column
// tensor of 2-byte elements, once as BFLOAT16 and once as UINT16. The box
// (tile) covers the whole tensor. Each CUresult is printed to stdout; on
// failure the driver's symbolic error name is also printed to stderr.
//
// Returns 0 when the allocation and both encodes succeed, 1 otherwise.
//
// NOTE(review): cuTensorMapEncodeTiled is a driver-API entry point, so the
// build presumably needs to link against the CUDA driver library (-lcuda).
int main() {
    printf("=== TMA alignment + BF16 test ===\n");

    // Tensor geometry: dimension 0 is the innermost (contiguous) one.
    const cuuint64_t kCols = 16;
    const cuuint64_t kRows = 128;
    const size_t kAlignment = 256;  // must stay a power of two for the mask below
    const size_t rowBytes = static_cast<size_t>(kCols) * sizeof(bf16_t);
    const size_t tensorBytes = static_cast<size_t>(kRows) * rowBytes;

    // Over-allocate by kAlignment bytes so the base can be rounded up safely.
    bf16_t* d_data = nullptr;
    const cudaError_t allocErr = cudaMalloc(&d_data, tensorBytes + kAlignment);
    if (allocErr != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(allocErr));
        return 1;
    }

    // Round the device address up to the next kAlignment boundary.
    const uintptr_t rawAddr = reinterpret_cast<uintptr_t>(d_data);
    const uintptr_t alignMask = static_cast<uintptr_t>(kAlignment - 1);
    const uintptr_t alignedAddr = (rawAddr + alignMask) & ~alignMask;
    bf16_t* d_aligned = reinterpret_cast<bf16_t*>(alignedAddr);
    printf("ptr: %p (aligned: %s)\n", static_cast<void*>(d_aligned),
           (alignedAddr % kAlignment == 0) ? "YES" : "NO");

    // Test with 256B-aligned pointer and various configs
    const cuuint64_t gdim[] = {kCols, kRows};
    // Byte stride between consecutive rows; rank-1 entries for a rank-2 tensor.
    const cuuint64_t gstr[] = {static_cast<cuuint64_t>(rowBytes)};
    const cuuint32_t tdim[] = {static_cast<cuuint32_t>(kCols),
                               static_cast<cuuint32_t>(kRows)};
    // Element strides are traversal steps, not row pitches. The driver API
    // requires each entry to be non-zero and <= 8, so the previous {1, 16}
    // was rejected with CUDA_ERROR_INVALID_VALUE; {1, 1} means dense traversal.
    const cuuint32_t tstr[] = {1, 1};

    struct EncodeCase {
        const char* label;
        CUtensorMapDataType dtype;
    };
    const EncodeCase cases[] = {
        {"BF16", CU_TENSOR_MAP_DATA_TYPE_BFLOAT16},
        {"UINT16", CU_TENSOR_MAP_DATA_TYPE_UINT16},
    };

    int failures = 0;
    for (const EncodeCase& c : cases) {
        CUtensorMap tma;
        const CUresult r = cuTensorMapEncodeTiled(
            &tma, c.dtype, 2, d_aligned, gdim, gstr, tdim, tstr,
            CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
        printf("%s 2D aligned: result=%d\n", c.label, static_cast<int>(r));

        if (r != CUDA_SUCCESS) {
            // Driver-API failures are not visible through cudaGetLastError(),
            // so translate the CUresult explicitly.
            const char* errName = nullptr;
            if (cuGetErrorName(r, &errName) != CUDA_SUCCESS || errName == nullptr) {
                errName = "unrecognized CUresult";
            }
            fprintf(stderr, "%s encode failed: %s\n", c.label, errName);
            ++failures;
        }
    }

    // Check CUDA error (runtime-API state only; encode results are reported above)
    const cudaError_t err = cudaGetLastError();
    printf("CUDA last error: %s\n", cudaGetErrorString(err));

    // Free the original allocation, not the rounded-up pointer.
    const cudaError_t freeErr = cudaFree(d_data);
    if (freeErr != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(freeErr));
    }

    return failures == 0 ? 0 : 1;
}