From 1bf76388c8fe2f2bae7a12fe88ae4353ba95fc0c Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 28 May 2026 12:23:47 +0000 Subject: [PATCH] test: always accumulate, separate SMEM per K-tile, TMEM starts at 0 --- tests/unit/test_umma_qk_hd64.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_umma_qk_hd64.cu b/tests/unit/test_umma_qk_hd64.cu index 580a9da7..c4ba6ed1 100644 --- a/tests/unit/test_umma_qk_hd64.cu +++ b/tests/unit/test_umma_qk_hd64.cu @@ -67,14 +67,10 @@ test_umma_hd64(const bf16_t* q, const bf16_t* k, uint64_t dk = make_umma_desc_kmajor_none(__cvta_generic_to_shared(sK), 128); uint32_t idesc = make_idesc(128, 128); - // MMA + // MMA — always accumulate (TMEM starts at 0 after alloc) if (lane == 0) { - umma_ss_f16(tb, dq, dk, idesc, kt > 0); + umma_ss_f16(tb, dq, dk, idesc, true); // Always accumulate } - __syncwarp(); // Ensure MMA is issued - asm volatile("tcgen05.fence::after_thread_sync;" ::: "memory"); - __syncthreads(); // Wait for all warps - __syncthreads(); // Extra barrier for safety } // Read TMEM