Try full tensor TMA copy without slicing

This commit is contained in:
2026-05-27 05:28:43 +00:00
parent 7d14a2f764
commit 2af767a90c

View File

@@ -634,7 +634,7 @@ class FmhaKernel:
cute.group_modes(gO, 0, 2),
)
if warp_idx == self.epilogue_warp_id[0]:
-cute.copy(tma_c, tOsC[None, Int32(0)], tOgO[None, Int32(0)])
+cute.copy(tma_c, tOsC, tOgO)
cute.arch.cp_async_bulk_commit_group()
cute.arch.cp_async_bulk_wait_group(0, read=True)