fix: align TMA SMEM to 128 bytes in verification test

2026-05-29 18:27:07 +00:00
parent befc2c647b
commit bf7cf54a51
1 changed file with 5 additions and 3 deletions
--- a/tests/unit/test_tma_verify.cu
+++ b/tests/unit/test_tma_verify.cu
@@ -30,9 +30,11 @@ __global__ void test_tma_load_kernel(
    bf16_t* __restrict__ result_direct,     // output: canonical layout from direct path
    CUtensorMap* __restrict__ tma_desc
 ) {
-    extern __shared__ char sbuf[];
-    bf16_t* sData_tma = (bf16_t*)(sbuf + 0);        // TMA destination (row-major)
-    bf16_t* sData_canonical = (bf16_t*)(sbuf + ROWS * COLS * 2);  // canonical output
+    extern __shared__ __align__(128) char sbuf[];
+    size_t off = 0;
+    bf16_t* sData_tma = (bf16_t*)(sbuf + off); off += ROWS * COLS * sizeof(bf16_t);
+    off = (off + 127) & ~(size_t)127;  // 128-byte align for TMA
+    bf16_t* sData_canonical = (bf16_t*)(sbuf + off);

    const int tid = threadIdx.x;
    const int lane = tid % 32;