test: always accumulate, separate SMEM per K-tile, TMEM starts at 0

This commit is contained in:
2026-05-28 12:23:47 +00:00
parent 8707f555c2
commit 1bf76388c8

View File

@@ -67,14 +67,10 @@ test_umma_hd64(const bf16_t* q, const bf16_t* k,
uint64_t dk = make_umma_desc_kmajor_none(__cvta_generic_to_shared(sK), 128);
uint32_t idesc = make_idesc(128, 128);
-// MMA
+// MMA — always accumulate (TMEM starts at 0 after alloc)
if (lane == 0) {
-umma_ss_f16(tb, dq, dk, idesc, kt > 0);
+umma_ss_f16(tb, dq, dk, idesc, true); // Always accumulate
}
__syncwarp(); // Ensure MMA is issued
asm volatile("tcgen05.fence::after_thread_sync;" ::: "memory");
__syncthreads(); // Wait for all warps
__syncthreads(); // Extra barrier for safety
}
// Read TMEM