Fix: use NamedBarrier instead of mbarrier_arrive/mbarrier_wait
This commit is contained in:
@@ -435,14 +435,12 @@ class FmhaV3StageCMulti:
|
||||
cute.arch.fence_view_async_tmem_load()
|
||||
# Async-proxy fence so the TMA store sees the SMEM writes.
|
||||
cute.arch.fence_proxy("async.shared", space="cta")
|
||||
cute.arch.mbarrier_arrive(
|
||||
# Use NamedBarrier to sync softmax warps with TMA store warp
|
||||
epi_sync_bar = pipeline.NamedBarrier(
|
||||
barrier_id=self.epilog_sync_bar_id,
|
||||
number_of_threads=32 * len(self.epilogue_warp_id),
|
||||
)
|
||||
cute.arch.mbarrier_wait(
|
||||
barrier_id=self.epilog_sync_bar_id,
|
||||
number_of_threads=32 * len(self.epilogue_warp_id),
|
||||
num_threads=32 * len(self.epilogue_warp_id),
|
||||
)
|
||||
epi_sync_bar.arrive_and_wait()
|
||||
|
||||
# TMA SMEM -> GMEM. One warp issues the copy; the rest waited at
|
||||
# the named barrier above. (Match epilogue_tma_store's behavior
|
||||
|
||||
Reference in New Issue
Block a user