D1.5: Fix TMA store - group_modes on bSG_gC, use flat indexing

This commit is contained in:
2026-05-24 01:36:01 +00:00
parent 61c4e107da
commit ec250eccd6

View File

@@ -500,9 +500,11 @@ class FmhaKernel:
cute.group_modes(sC, 0, 2),
cute.group_modes(tCgC_epi, 0, 2),
)
# Collapse modes 1..rank-1 of bSG_gC into a single mode (CUTLASS pattern) so the store destination can be selected with one flat index
bSG_gC_flat = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC))
# One TMA store for the full output tile
if warp_idx == self.epilogue_warp_id[0]:
cute.copy(tma_c, bSG_sC[(None, 0)], bSG_gC[(None, None, None, Int32(0), Int32(0), Int32(0))])
cute.copy(tma_c, bSG_sC[(None, 0)], bSG_gC_flat[(None, Int32(0))])
cute.arch.cp_async_bulk_commit_group()
cute.arch.cp_async_bulk_wait_group(0, read=True)