Fix: use NamedBarrier instead of mbarrier_arrive/wait

This commit is contained in:
2026-05-22 19:49:07 +00:00
parent 3bd406e925
commit f821dd00fe

View File

@@ -435,14 +435,12 @@ class FmhaV3StageCMulti:
cute.arch.fence_view_async_tmem_load()
# Async-proxy fence so the TMA store sees the SMEM writes.
cute.arch.fence_proxy("async.shared", space="cta")
cute.arch.mbarrier_arrive(
# Use NamedBarrier to sync softmax warps with TMA store warp
epi_sync_bar = pipeline.NamedBarrier(
barrier_id=self.epilog_sync_bar_id,
number_of_threads=32 * len(self.epilogue_warp_id),
)
cute.arch.mbarrier_wait(
barrier_id=self.epilog_sync_bar_id,
number_of_threads=32 * len(self.epilogue_warp_id),
num_threads=32 * len(self.epilogue_warp_id),
)
epi_sync_bar.arrive_and_wait()
# TMA SMEM -> GMEM. One warp issues the copy; the rest waited at
# the named barrier above. (Match epilogue_tma_store's behavior