diff --git a/tests/unit/test_tma_verify.cu b/tests/unit/test_tma_verify.cu index 38dc2855..1ac23bc9 100644 --- a/tests/unit/test_tma_verify.cu +++ b/tests/unit/test_tma_verify.cu @@ -30,9 +30,11 @@ __global__ void test_tma_load_kernel( bf16_t* __restrict__ result_direct, // output: canonical layout from direct path CUtensorMap* __restrict__ tma_desc ) { - extern __shared__ char sbuf[]; - bf16_t* sData_tma = (bf16_t*)(sbuf + 0); // TMA destination (row-major) - bf16_t* sData_canonical = (bf16_t*)(sbuf + ROWS * COLS * 2); // canonical output + extern __shared__ __align__(128) char sbuf[]; + size_t off = 0; + bf16_t* sData_tma = (bf16_t*)(sbuf + off); off += ROWS * COLS * sizeof(bf16_t); + off = (off + 127) & ~(size_t)127; // 128-byte align for TMA + bf16_t* sData_canonical = (bf16_t*)(sbuf + off); const int tid = threadIdx.x; const int lane = tid % 32;