From 55c0604a71ebe0f2c867701ae6780e163c8d923d Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Thu, 28 May 2026 23:21:53 +0000
Subject: [PATCH] add fence.sc.gpu between PV and epilogue for TMEM visibility

---
 dsv4/kernels/attention/fmha_6warp_multirow.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dsv4/kernels/attention/fmha_6warp_multirow.cuh b/dsv4/kernels/attention/fmha_6warp_multirow.cuh
index 388bff9d..cfa7891a 100644
--- a/dsv4/kernels/attention/fmha_6warp_multirow.cuh
+++ b/dsv4/kernels/attention/fmha_6warp_multirow.cuh
@@ -259,6 +259,10 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) {
         }
     }
 
+    // Ensure PV output is visible to all warps before epilogue
+    asm volatile("fence.sc.gpu;" ::: "memory");
+    __syncthreads();
+
     // ================================================================
     // EPILOGUE: TMEM → regs → normalize → BF16 → GMEM
     // ================================================================