diff --git a/tests/unit/test_umma_qk.cu b/tests/unit/test_umma_qk.cu index e014e53c..f8337ec8 100644 --- a/tests/unit/test_umma_qk.cu +++ b/tests/unit/test_umma_qk.cu @@ -71,14 +71,25 @@ test_umma_qk_hd16( __syncthreads(); uint32_t tmem_base = *sTmemBase; - // Zero TMEM — skip for now - // if (wid == 0) { - // for (int col = 0; col < 32; col++) { - // tmem_store(tmem_base + col, 0, 0, 0, 0); - // } - // tmem_fence_store(); - // } - // __syncthreads(); + // Debug: write tmem_base to output + if (tid == 0) { + s_out[138] = 999.0f; // sentinel before tmem ops + } + __syncthreads(); + + // Zero TMEM — re-enable with debug + if (wid == 0) { + for (int col = 0; col < 128; col++) { + tmem_store(tmem_base + col, 0, 0, 0, 0); + } + tmem_fence_store(); + } + __syncthreads(); + + if (tid == 0) { + s_out[138] = (float)tmem_base; // write tmem_base after tmem ops + } + __syncthreads(); // ================================================================ // Load Q and K into SMEM in canonical layout