diff --git a/tests/unit/test_umma_qk.cu b/tests/unit/test_umma_qk.cu index 556beb0e..903243d7 100644 --- a/tests/unit/test_umma_qk.cu +++ b/tests/unit/test_umma_qk.cu @@ -71,9 +71,10 @@ test_umma_qk_hd16( __syncthreads(); uint32_t tmem_base = *sTmemBase; - // Zero TMEM + // Zero TMEM — test: only zero first 32 columns (min power of 2) + // Note: 128 columns might be too many for tmem_store in a loop if (wid == 0) { - for (int col = 0; col < 128; col++) { + for (int col = 0; col < 32; col++) { tmem_store(tmem_base + col, 0, 0, 0, 0); } tmem_fence_store();