fix: use tcgen05.wait::st/ld instead of nonexistent tcgen05.fence
ROOT CAUSE of TMET hang: tcgen05.fence.cta_group::1.sync.aligned is NOT a valid PTX instruction. The correct TMEM ordering primitives are: - tcgen05.wait::st.sync.aligned (wait for TMEM stores to complete) - tcgen05.wait::ld.sync.aligned (wait for TMEM loads to complete) Found in cutlass/arch/barrier.h fence_view_async_tmem_store/load.
This commit is contained in:
@@ -21,8 +21,12 @@ __device__ void tmem_dealloc(uint32_t tmem_ptr, int num_cols) {
|
||||
:: "r"(tmem_ptr), "r"(num_cols));
|
||||
}
|
||||
|
||||
__device__ void tmem_fence() {
|
||||
asm volatile("tcgen05.fence.cta_group::1.sync.aligned;" ::: "memory");
|
||||
__device__ void tmem_fence_store() {
|
||||
asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory");
|
||||
}
|
||||
|
||||
__device__ void tmem_fence_load() {
|
||||
asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory");
|
||||
}
|
||||
|
||||
__device__ void tmem_store(uint32_t col, uint32_t r0, uint32_t r1, uint32_t r2, uint32_t r3) {
|
||||
@@ -82,7 +86,7 @@ __global__ void test_tmem_store_load() {
|
||||
uint32_t col_addr = tmem_base + 0;
|
||||
tmem_store(col_addr, u0, u1, u2, u3);
|
||||
}
|
||||
tmem_fence();
|
||||
tmem_fence_store();
|
||||
__syncthreads();
|
||||
|
||||
// Read back
|
||||
|
||||
Reference in New Issue
Block a user