From bd16e8fa85549831b8c2fbaf0a4958909bd7f8bf Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 28 May 2026 07:12:26 +0000 Subject: [PATCH] fix: use tcgen05.wait::st/ld instead of nonexistent tcgen05.fence ROOT CAUSE of TMET hang: tcgen05.fence.cta_group::1.sync.aligned is NOT a valid PTX instruction. The correct TMEM ordering primitives are: - tcgen05.wait::st.sync.aligned (wait for TMEM stores to complete) - tcgen05.wait::ld.sync.aligned (wait for TMEM loads to complete) Found in cutlass/arch/barrier.h fence_view_async_tmem_store/load. --- tests/unit/test_tmem_minimal.cu | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_tmem_minimal.cu b/tests/unit/test_tmem_minimal.cu index ec7a9efe..f16616aa 100644 --- a/tests/unit/test_tmem_minimal.cu +++ b/tests/unit/test_tmem_minimal.cu @@ -21,8 +21,12 @@ __device__ void tmem_dealloc(uint32_t tmem_ptr, int num_cols) { :: "r"(tmem_ptr), "r"(num_cols)); } -__device__ void tmem_fence() { - asm volatile("tcgen05.fence.cta_group::1.sync.aligned;" ::: "memory"); +__device__ void tmem_fence_store() { + asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory"); +} + +__device__ void tmem_fence_load() { + asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory"); } __device__ void tmem_store(uint32_t col, uint32_t r0, uint32_t r1, uint32_t r2, uint32_t r3) { @@ -82,7 +86,7 @@ __global__ void test_tmem_store_load() { uint32_t col_addr = tmem_base + 0; tmem_store(col_addr, u0, u1, u2, u3); } - tmem_fence(); + tmem_fence_store(); __syncthreads(); // Read back