fix: use cute.copy instead of cpasync.copy for TMA store

This commit is contained in:
2026-05-23 02:23:16 +00:00
parent 702bf8aa29
commit 0ecde542f1

View File

@@ -397,7 +397,7 @@ class FmhaV3StageCMulti:
num_threads=32 * len(self.epilogue_warp_id),
)
epi_bar.arrive_and_wait()
-cpasync.copy(tma_c, cute.select(sC, mode=[0, 1]), gC)
+cute.copy(tma_c, cute.select(sC, mode=[0, 1]), gC)
cute.arch.cp_async_bulk_commit_group()
cute.arch.cp_async_bulk_wait_group(0, read=True)