D1.5: Use tCtO_fake layout for epilogue_tma_store (needs STAGE dim)

This commit is contained in:
2026-05-24 02:41:32 +00:00
parent dd7eae0c1c
commit 4e593f1cad

View File

@@ -438,7 +438,7 @@ class FmhaKernel:
cute.arch.fence_view_async_tmem_store()
# TMA store via CUTLASS epilogue_tma_store
- tCtO_base = cute.make_tensor(tmem_ptr + self.tmem_o0_offset, tOtO.layout)
+ tCtO_base = cute.make_tensor(tmem_ptr + self.tmem_o0_offset, tCtO_fake.layout)
c_grp = pipeline.CooperativeGroup(pipeline.Agent.Thread, 32 * len(self.epilogue_warp_id))
c_pipe = pipeline.PipelineTmaStore.create(num_stages=self.num_c_stage, producer_group=c_grp)
acc_cons_st = pipeline.make_pipeline_state(