Files
nvfp4-megamoe-kernel/tests/unit/test_tma_align.cu
2026-05-28 17:00:20 +00:00

35 lines
1.4 KiB
Plaintext

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
// Host-side stand-in for a bfloat16 element: an unsigned short used only to
// type the device pointer in this test; no arithmetic is performed on it.
typedef unsigned short bf16_t;
// Encodes one 2D tiled tensor map over `base` using element type `dtype`,
// prints the driver result in the form "<label> 2D aligned: result=<code>",
// and additionally reports the symbolic error name on stderr on failure.
// Returns true when the driver reports CUDA_SUCCESS.
static bool encode_and_report(const char* label, CUtensorMapDataType dtype, void* base) {
    // Dimension 0 is the innermost one: 16 elements per row, 128 rows.
    cuuint64_t gdim[] = {16, 128};
    // Byte strides for dimensions 1..rank-1 only: one row is 16 two-byte elements.
    cuuint64_t gstr[] = {16 * sizeof(bf16_t)};
    // The box (tile) covers the whole tensor.
    cuuint32_t tdim[] = {16, 128};
    // Element strides are per-dimension traversal steps, not memory strides.
    // The driver requires every entry to be in [1, 8]; the previous value of 16
    // for dimension 1 made the encode fail independently of pointer alignment.
    cuuint32_t tstr[] = {1, 1};

    CUtensorMap tma;
    const CUresult r = cuTensorMapEncodeTiled(
        &tma, dtype, 2, base, gdim, gstr, tdim, tstr,
        CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    printf("%s 2D aligned: result=%d\n", label, (int)r);

    if (r != CUDA_SUCCESS) {
        const char* name = nullptr;
        if (cuGetErrorName(r, &name) != CUDA_SUCCESS || name == nullptr) {
            name = "unknown";
        }
        fprintf(stderr, "%s encode failed: %s\n", label, name);
        return false;
    }
    return true;
}

// Probe: allocates a device buffer, rounds its address up to a 256-byte
// boundary, and checks that cuTensorMapEncodeTiled accepts a 2D 16x128
// descriptor on that pointer for both the BFLOAT16 and UINT16 element types.
// Returns 0 when every CUDA call succeeds and 1 otherwise.
int main() {
    printf("=== TMA alignment + BF16 test ===\n");

    const size_t kAlign = 256;
    const size_t kBytes = 128 * 16 * sizeof(bf16_t);

    // Over-allocate by kAlign bytes so the rounded-up pointer still has
    // kBytes of valid storage behind it.
    bf16_t* d_data = nullptr;
    cudaError_t err = cudaMalloc(&d_data, kBytes + kAlign);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    const uintptr_t aligned_addr =
        ((uintptr_t)d_data + (kAlign - 1)) & ~(uintptr_t)(kAlign - 1);
    bf16_t* d_aligned = (bf16_t*)aligned_addr;
    printf("ptr: %p (aligned: %s)\n", (void*)d_aligned,
           (aligned_addr % kAlign == 0) ? "YES" : "NO");

    // Test with 256B-aligned pointer and both 16-bit element types.
    bool ok = true;
    ok = encode_and_report("BF16", CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, d_aligned) && ok;
    ok = encode_and_report("UINT16", CU_TENSOR_MAP_DATA_TYPE_UINT16, d_aligned) && ok;

    // Runtime-API error state; driver-API results were already reported above.
    err = cudaGetLastError();
    printf("CUDA last error: %s\n", cudaGetErrorString(err));
    if (err != cudaSuccess) {
        ok = false;
    }

    err = cudaFree(d_data);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
        ok = false;
    }

    return ok ? 0 : 1;
}