diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py index 4ce2b669..a2188329 100644 --- a/dsv4/kernels/attention/fmha.py +++ b/dsv4/kernels/attention/fmha.py @@ -500,9 +500,11 @@ class FmhaKernel: cute.group_modes(sC, 0, 2), cute.group_modes(tCgC_epi, 0, 2), ) + # Group all modes >= 1 into one (CUTLASS pattern) + bSG_gC_flat = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) # One TMA store for the full output tile if warp_idx == self.epilogue_warp_id[0]: - cute.copy(tma_c, bSG_sC[(None, 0)], bSG_gC[(None, None, None, Int32(0), Int32(0), Int32(0))]) + cute.copy(tma_c, bSG_sC[(None, 0)], bSG_gC_flat[(None, Int32(0))]) cute.arch.cp_async_bulk_commit_group() cute.arch.cp_async_bulk_wait_group(0, read=True)