fix: align TMA SMEM to 128 bytes in verification test

This commit is contained in:
2026-05-29 18:27:07 +00:00
parent befc2c647b
commit bf7cf54a51

View File

@@ -30,9 +30,11 @@ __global__ void test_tma_load_kernel(
bf16_t* __restrict__ result_direct, // output: canonical layout from direct path
CUtensorMap* __restrict__ tma_desc
) {
extern __shared__ char sbuf[];
bf16_t* sData_tma = (bf16_t*)(sbuf + 0); // TMA destination (row-major)
bf16_t* sData_canonical = (bf16_t*)(sbuf + ROWS * COLS * 2); // canonical output
extern __shared__ __align__(128) char sbuf[];
size_t off = 0;
bf16_t* sData_tma = (bf16_t*)(sbuf + off); off += ROWS * COLS * sizeof(bf16_t);
off = (off + 127) & ~(size_t)127; // 128-byte align for TMA
bf16_t* sData_canonical = (bf16_t*)(sbuf + off);
const int tid = threadIdx.x;
const int lane = tid % 32;