fix: align TMA shared-memory (SMEM) buffers to 128 bytes in the verification test
This commit is contained in:
@@ -30,9 +30,11 @@ __global__ void test_tma_load_kernel(
|
||||
bf16_t* __restrict__ result_direct, // output: canonical layout from direct path
|
||||
CUtensorMap* __restrict__ tma_desc
|
||||
) {
|
||||
extern __shared__ char sbuf[];
|
||||
bf16_t* sData_tma = (bf16_t*)(sbuf + 0); // TMA destination (row-major)
|
||||
bf16_t* sData_canonical = (bf16_t*)(sbuf + ROWS * COLS * 2); // canonical output
|
||||
extern __shared__ __align__(128) char sbuf[];
|
||||
size_t off = 0;
|
||||
bf16_t* sData_tma = (bf16_t*)(sbuf + off); off += ROWS * COLS * sizeof(bf16_t);
|
||||
off = (off + 127) & ~(size_t)127; // 128-byte align for TMA
|
||||
bf16_t* sData_canonical = (bf16_t*)(sbuf + off);
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int lane = tid % 32;
|
||||
|
||||
Reference in New Issue
Block a user