#include <stdint.h>
#include <stdio.h>

#include <cuda.h>
#include <cuda_runtime.h>

// Raw 16-bit storage type used for one BF16 element on the host side.
// Only its size and pointer type matter in this program.
typedef unsigned short bf16_t;

// Shape of the 2D tensor (and of the tile) described to the driver.
static const unsigned kCols = 16;    // innermost (contiguous) dimension, in elements
static const unsigned kRows = 128;   // outer dimension, in elements

// Encodes a kCols x kRows tiled tensor map over `base` with the given element
// type, prints "<label>: result=<code>", and returns the driver result.
// On failure the symbolic error name is additionally written to stderr.
static CUresult encode_and_report(const char* label,
                                  CUtensorMapDataType dtype,
                                  void* base) {
    // Extents per dimension, innermost first.
    cuuint64_t global_dim[2] = {kCols, kRows};
    // Byte stride between consecutive rows (rank-1 entries; innermost is implicit).
    cuuint64_t global_stride[1] = {kCols * sizeof(bf16_t)};
    // The tile (box) covers the whole tensor.
    cuuint32_t box_dim[2] = {kCols, kRows};
    // Per-dimension traversal step in elements. The driver requires each entry
    // to be non-zero and <= 8; a step of 1 visits every element. (The previous
    // value of 16 for the outer dimension made every encode call fail.)
    cuuint32_t elem_stride[2] = {1, 1};

    CUtensorMap map;
    CUresult r = cuTensorMapEncodeTiled(&map, dtype, 2, base,
                                        global_dim, global_stride,
                                        box_dim, elem_stride,
                                        CU_TENSOR_MAP_INTERLEAVE_NONE,
                                        CU_TENSOR_MAP_SWIZZLE_NONE,
                                        CU_TENSOR_MAP_L2_PROMOTION_NONE,
                                        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    printf("%s: result=%d\n", label, (int)r);

    if (r != CUDA_SUCCESS) {
        const char* name = NULL;
        if (cuGetErrorName(r, &name) != CUDA_SUCCESS || name == NULL) {
            name = "unknown CUresult";
        }
        fprintf(stderr, "  %s failed: %s\n", label, name);
    }
    return r;
}

// Allocates a device buffer, rounds its address up to a 256-byte boundary and
// tries to encode a 2D tiled tensor map over it, once as BFLOAT16 and once as
// UINT16. Returns 0 when the allocation and both encodes succeed, 1 otherwise.
int main() {
    printf("=== TMA alignment + BF16 test ===\n");

    // Over-allocate by 256 bytes so the rounded-up pointer plus the full
    // payload still lies inside the allocation.
    const size_t payload_bytes = (size_t)kRows * kCols * sizeof(bf16_t);
    bf16_t* d_data = NULL;
    cudaError_t err = cudaMalloc(&d_data, payload_bytes + 256);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    uint64_t aligned_addr = ((uint64_t)d_data + 255) & ~255ULL;
    bf16_t* d_aligned = (bf16_t*)aligned_addr;
    printf("ptr: %p (aligned: %s)\n", (void*)d_aligned,
           (aligned_addr % 256 == 0) ? "YES" : "NO");

    // Test with 256B-aligned pointer and both 16-bit element types.
    int failures = 0;
    if (encode_and_report("BF16 2D aligned",
                          CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
                          d_aligned) != CUDA_SUCCESS) {
        ++failures;
    }
    if (encode_and_report("UINT16 2D aligned",
                          CU_TENSOR_MAP_DATA_TYPE_UINT16,
                          d_aligned) != CUDA_SUCCESS) {
        ++failures;
    }

    // Runtime-API error state only: driver-API (CUresult) failures from the
    // encode calls above are not recorded here and are reported separately.
    err = cudaGetLastError();
    printf("CUDA last error: %s\n", cudaGetErrorString(err));

    // Free the original pointer, not the aligned one.
    err = cudaFree(d_data);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
        ++failures;
    }

    return failures == 0 ? 0 : 1;
}