10-warp debug: MMA=warp4 TMA=warp5 idle=6-9 still gives cosine 0.29

Pipeline init uses __syncthreads (all 320 threads participate). The pipeline groups match the 6-warp version exactly; the only difference is threads_per_cta=320 vs 192. Direct comparison for row 0: 6-warp output [15,-129,-77.5,65,59] vs 10-warp output [-7.5,2.2,-22.7,7.3,12.0] -- completely different values. Something in the CuTe DSL runtime appears to use blockDim.x or the total CTA size in a way that breaks the computation when the CTA size changes from 192 to 320. pipeline_init_wait calls agent_sync(ThreadBlock), i.e. __syncthreads, which all 320 threads reach. The NamedBarriers use explicit thread counts. TMA atoms are created from the MMA thread layout, not the CTA size. Hypothesis: PipelineTmaUmma or PipelineUmmaAsync internally uses blockDim.x for barrier arithmetic, making the barriers expect more participants than there are actual working threads.
2026-05-21 23:24:44 +00:00
parent 5a63604d6a
commit e58517b221
1 changed files with 5 additions and 5 deletions
--- a/tests/unit/test_fmha_v3_tenwarp.py
+++ b/tests/unit/test_fmha_v3_tenwarp.py
@@ -23,7 +23,7 @@ class FmhaV3TenWarp:
        self.use_2cta_instrs = False; self.epilog_sync_bar_id = 1
        self.cluster_shape_mn = (1, 1); self.cta_group = tcgen05.CtaGroup.ONE
        self.epilogue_warp_id = (0,1,2,3)
-        self.mma_warp_id = 8; self.tma_warp_id = 9
+        self.mma_warp_id = 4; self.tma_warp_id =5  # Same as 6-warp! Idle warps are 6-9
        self.threads_per_cta = 320; self.num_c_stage = 2
        self.kv_stage = 2; self.q_stage = 1; self.num_c_stage = 2

@@ -98,11 +98,11 @@ class FmhaV3TenWarp:
            tmem_dealloc: cutlass.Int64; holding: cutlass.Int32
        smem = utils.SmemAllocator(); st = smem.allocate(SS)

-        qp,qc = pipeline.PipelineTmaUmma.create(barrier_storage=st.q_bar.data_ptr(),num_stages=self.q_stage,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),tx_count=self.q_tx_bytes,cta_layout_vmnk=cl_vmnk,defer_sync=True).make_participants()
-        kvp,kvc = pipeline.PipelineTmaUmma.create(barrier_storage=st.kv_bar.data_ptr(),num_stages=self.kv_stage,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),tx_count=self.kv_tx_bytes,cta_layout_vmnk=cl_vmnk,defer_sync=True).make_participants()
-        s_prod,s_cons = pipeline.PipelineUmmaAsync.create(barrier_storage=st.s_bar.data_ptr(),num_stages=1,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,32*len(self.epilogue_warp_id))).make_participants()
+        qp,qc = pipeline.PipelineTmaUmma.create(barrier_storage=st.q_bar.data_ptr(),num_stages=self.q_stage,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),tx_count=self.q_tx_bytes,cta_layout_vmnk=cl_vmnk,defer_sync=True).make_participants()
+        kvp,kvc = pipeline.PipelineTmaUmma.create(barrier_storage=st.kv_bar.data_ptr(),num_stages=self.kv_stage,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),tx_count=self.kv_tx_bytes,cta_layout_vmnk=cl_vmnk,defer_sync=True).make_participants()
+        s_prod,s_cons = pipeline.PipelineUmmaAsync.create(barrier_storage=st.s_bar.data_ptr(),num_stages=1,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,32*len(self.epilogue_warp_id))).make_participants()
        softmax_done_bar = pipeline.NamedBarrier(barrier_id=3, num_threads=32 + 32*len(self.epilogue_warp_id))
-        acc_pipe = pipeline.PipelineUmmaAsync.create(barrier_storage=st.acc_bar.data_ptr(),num_stages=self.num_acc_stage,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,32*len(self.epilogue_warp_id)),cta_layout_vmnk=cl_vmnk,defer_sync=True)
+        acc_pipe = pipeline.PipelineUmmaAsync.create(barrier_storage=st.acc_bar.data_ptr(),num_stages=self.num_acc_stage,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,len(self.epilogue_warp_id)),cta_layout_vmnk=cl_vmnk,defer_sync=True)
        tmem_bar = pipeline.NamedBarrier(barrier_id=2,num_threads=32*len((self.mma_warp_id,*self.epilogue_warp_id)))
        tmem = utils.TmemAllocator(st.holding.ptr,barrier_for_retrieve=tmem_bar,allocator_warp_id=self.epilogue_warp_id[0],is_two_cta=cute.size(qk_mma.thr_id.shape)==2,two_cta_tmem_dealloc_mbar_ptr=st.tmem_dealloc.ptr)
        pipeline.pipeline_init_arrive(cluster_shape_mn=cl_vmnk,is_relaxed=True)