diff --git a/dsv4/kernels/attention/fmha_6warp_multirow.cuh b/dsv4/kernels/attention/fmha_6warp_multirow.cuh index 388bff9d..cfa7891a 100644 --- a/dsv4/kernels/attention/fmha_6warp_multirow.cuh +++ b/dsv4/kernels/attention/fmha_6warp_multirow.cuh @@ -259,6 +259,10 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) { } } + // Ensure PV output is visible to all warps before epilogue + asm volatile("fence.sc.gpu;" ::: "memory"); + __syncthreads(); + // ================================================================ // EPILOGUE: TMEM → regs → normalize → BF16 → GMEM // ================================================================