Try full tensor TMA copy without slicing
This commit is contained in:
@@ -634,7 +634,7 @@ class FmhaKernel:
|
||||
cute.group_modes(gO, 0, 2),
|
||||
)
|
||||
if warp_idx == self.epilogue_warp_id[0]:
|
||||
- cute.copy(tma_c, tOsC[None, Int32(0)], tOgO[None, Int32(0)])
|
||||
+ cute.copy(tma_c, tOsC, tOgO)
|
||||
cute.arch.cp_async_bulk_commit_group()
|
||||
cute.arch.cp_async_bulk_wait_group(0, read=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user