Try full tensor TMA copy without slicing

2026-05-27 05:28:43 +00:00
parent 7d14a2f764
commit 2af767a90c
1 changed files with 1 additions and 1 deletions
--- a/dsv4/kernels/attention/fmha_smem_acc.py
+++ b/dsv4/kernels/attention/fmha_smem_acc.py
@@ -634,7 +634,7 @@ class FmhaKernel:
                    cute.group_modes(gO, 0, 2),
                )
                if warp_idx == self.epilogue_warp_id[0]:
-                    cute.copy(tma_c, tOsC[None, Int32(0)], tOgO[None, Int32(0)])
+                    cute.copy(tma_c, tOsC, tOgO)
                    cute.arch.cp_async_bulk_commit_group()
                    cute.arch.cp_async_bulk_wait_group(0, read=True)