fix: use tcgen05.wait::st/ld instead of nonexistent tcgen05.fence

ROOT CAUSE of TMET hang: tcgen05.fence.cta_group::1.sync.aligned is
NOT a valid PTX instruction. The correct TMEM ordering primitives are:
- tcgen05.wait::st.sync.aligned (wait for TMEM stores to complete)
- tcgen05.wait::ld.sync.aligned (wait for TMEM loads to complete)

Found in cutlass/arch/barrier.h fence_view_async_tmem_store/load.
This commit is contained in:
2026-05-28 07:12:26 +00:00
parent ba1e81f2dc
commit bd16e8fa85

View File

@@ -21,8 +21,12 @@ __device__ void tmem_dealloc(uint32_t tmem_ptr, int num_cols) {
:: "r"(tmem_ptr), "r"(num_cols));
}
__device__ void tmem_fence() {
asm volatile("tcgen05.fence.cta_group::1.sync.aligned;" ::: "memory");
__device__ void tmem_fence_store() {
asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory");
}
__device__ void tmem_fence_load() {
asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory");
}
__device__ void tmem_store(uint32_t col, uint32_t r0, uint32_t r1, uint32_t r2, uint32_t r3) {
@@ -82,7 +86,7 @@ __global__ void test_tmem_store_load() {
uint32_t col_addr = tmem_base + 0;
tmem_store(col_addr, u0, u1, u2, u3);
}
tmem_fence();
tmem_fence_store();
__syncthreads();
// Read back