diff --git a/tests/fmha_v3_stage_c_example7.py b/tests/fmha_v3_stage_c_example7.py index d8b98568..b72e8318 100644 --- a/tests/fmha_v3_stage_c_example7.py +++ b/tests/fmha_v3_stage_c_example7.py @@ -435,14 +435,12 @@ class FmhaV3StageCMulti: cute.arch.fence_view_async_tmem_load() # Async-proxy fence so the TMA store sees the SMEM writes. cute.arch.fence_proxy("async.shared", space="cta") - cute.arch.mbarrier_arrive( + # Use NamedBarrier to sync softmax warps with TMA store warp + epi_sync_bar = pipeline.NamedBarrier( barrier_id=self.epilog_sync_bar_id, - number_of_threads=32 * len(self.epilogue_warp_id), - ) - cute.arch.mbarrier_wait( - barrier_id=self.epilog_sync_bar_id, - number_of_threads=32 * len(self.epilogue_warp_id), + num_threads=32 * len(self.epilogue_warp_id), ) + epi_sync_bar.arrive_and_wait() # TMA SMEM -> GMEM. One warp issues the copy; the rest waited at # the named barrier above. (Match epilogue_tma_store's behavior