From f821dd00fedb68b7116cc0f0899673ab3bf412a8 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 22 May 2026 19:49:07 +0000 Subject: [PATCH] Fix: use NamedBarrier instead of mbarrier_arrive/wait --- tests/fmha_v3_stage_c_example7.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/fmha_v3_stage_c_example7.py b/tests/fmha_v3_stage_c_example7.py index d8b98568..b72e8318 100644 --- a/tests/fmha_v3_stage_c_example7.py +++ b/tests/fmha_v3_stage_c_example7.py @@ -435,14 +435,12 @@ class FmhaV3StageCMulti: cute.arch.fence_view_async_tmem_load() # Async-proxy fence so the TMA store sees the SMEM writes. cute.arch.fence_proxy("async.shared", space="cta") - cute.arch.mbarrier_arrive( + # Use NamedBarrier to sync softmax warps with TMA store warp + epi_sync_bar = pipeline.NamedBarrier( barrier_id=self.epilog_sync_bar_id, - number_of_threads=32 * len(self.epilogue_warp_id), - ) - cute.arch.mbarrier_wait( - barrier_id=self.epilog_sync_bar_id, - number_of_threads=32 * len(self.epilogue_warp_id), + num_threads=32 * len(self.epilogue_warp_id), ) + epi_sync_bar.arrive_and_wait() # TMA SMEM -> GMEM. One warp issues the copy; the rest waited at # the named barrier above. (Match epilogue_tma_store's behavior