diff --git a/tests/unit/test_tmem_minimal.cu b/tests/unit/test_tmem_minimal.cu index ec7a9efe..f16616aa 100644 --- a/tests/unit/test_tmem_minimal.cu +++ b/tests/unit/test_tmem_minimal.cu @@ -21,8 +21,12 @@ __device__ void tmem_dealloc(uint32_t tmem_ptr, int num_cols) { :: "r"(tmem_ptr), "r"(num_cols)); } -__device__ void tmem_fence() { - asm volatile("tcgen05.fence.cta_group::1.sync.aligned;" ::: "memory"); +__device__ void tmem_fence_store() { + asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory"); +} + +__device__ void tmem_fence_load() { + asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory"); } __device__ void tmem_store(uint32_t col, uint32_t r0, uint32_t r1, uint32_t r2, uint32_t r3) { @@ -82,7 +86,7 @@ __global__ void test_tmem_store_load() { uint32_t col_addr = tmem_base + 0; tmem_store(col_addr, u0, u1, u2, u3); } - tmem_fence(); + tmem_fence_store(); __syncthreads(); // Read back