add fence.sc.gpu between PV and epilogue for TMEM visibility
This commit is contained in:
@@ -259,6 +259,10 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) {
|
||||
}
|
||||
}
|
||||
|
||||
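// Ensure the PV output held in TMEM is visible to all warps before the epilogue reads it.
// NOTE(review): confirm fence.sc.gpu actually orders TMEM (tcgen05) writes; a tcgen05 wait/fence may be required instead.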
|
||||
asm volatile("fence.sc.gpu;" ::: "memory");
|
||||
__syncthreads();
|
||||
|
||||
// ================================================================
|
||||
// EPILOGUE: TMEM → regs → normalize → BF16 → GMEM
|
||||
// ================================================================
|
||||
|
||||
Reference in New Issue
Block a user