add fence.sc.gpu between PV and epilogue for TMEM visibility

This commit is contained in:
2026-05-28 23:21:53 +00:00
parent 52809b0ec6
commit 55c0604a71

View File

@@ -259,6 +259,10 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) {
}
}
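// Ensure PV output is visible to all warps before epilogue.
// NOTE(review): fence.sc.gpu orders ordinary memory accesses; confirm it also
// covers the TMEM writes of the PV step. If those are issued by async tcgen05
// ops, completion (tcgen05.commit + mbarrier wait) and
// tcgen05.fence::before_thread_sync / ::after_thread_sync around the barrier
// may be what is actually required.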
asm volatile("fence.sc.gpu;" ::: "memory");
__syncthreads();
// ================================================================
// EPILOGUE: TMEM → regs → normalize → BF16 → GMEM
// ================================================================