fix: epilogue warp self-signals acc_pipe producer before consuming

This commit is contained in:
2026-05-22 10:11:55 +00:00
parent 19b742f365
commit ea687980af

View File

@@ -410,6 +410,11 @@ class FmhaV3StageC2:
tmem.wait_for_alloc()
tmem_ptr = tmem.retrieve_ptr(self.qk_acc_dtype)
epi_handle = corr_epi_cons.wait_and_advance()
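# Self-signal acc_pipe from the producer side: O is ready (correction already normalized it in TMEM).
# NOTE(review): producer_tail below runs before this warp's consumer state (acc_cons_st) is created or released — confirm it cannot block waiting on that release.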
acc_prod_st = pipeline.make_pipeline_state(pipeline.PipelineUserType.Producer, 1)
acc_pipe.producer_acquire(acc_prod_st)
acc_pipe.producer_commit(acc_prod_st); acc_prod_st.advance()
acc_pipe.producer_tail(acc_prod_st)
# Write O from TMEM to GMEM via epilogue_tma_store
tCtO_base = cute.make_tensor(tmem_ptr + self.tmem_o0_offset, tCtO_fake.layout)
acc_cons_st = pipeline.make_pipeline_state(pipeline.PipelineUserType.Consumer, 1)