diff --git a/tests/unit/test_fmha_v3_stage_c.py b/tests/unit/test_fmha_v3_stage_c.py index 4c79cab9..02fbacb3 100644 --- a/tests/unit/test_fmha_v3_stage_c.py +++ b/tests/unit/test_fmha_v3_stage_c.py @@ -148,6 +148,7 @@ class FmhaV3StageCMulti: s_bar: cute.struct.MemRange[cutlass.Int64, 2] acc_bar: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage*2] tmem_dealloc: cutlass.Int64; holding: cutlass.Int32 + kv_coord_smem: cute.struct.MemRange[Int32, 1] smem = utils.SmemAllocator(); st = smem.allocate(SS) qp,qc = pipeline.PipelineTmaUmma.create(barrier_storage=st.q_bar.data_ptr(),num_stages=self.q_stage,producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread),consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread,1),tx_count=self.q_tx_bytes,cta_layout_vmnk=cl_vmnk,defer_sync=True).make_participants() @@ -224,11 +225,15 @@ class FmhaV3StageCMulti: cute.copy(tma_q, tAgQ[(None, Int32(0))], tAsQ[(None, qh.index)], tma_bar_ptr=qh.barrier) qp.tail() kvp.reset() - # DEBUG: Use constant Int32(1) to test if TMA can read from tile 1 at all + # Use SMEM-backed counter — the JIT can't constant-fold SMEM reads. + # Initialize to 0, read before each copy, increment after. + st.kv_coord_smem[0] = Int32(0) for kt in range(n_kv_tiles): + kv_coord = st.kv_coord_smem[0] # Dynamic read from SMEM kvh = kvp.acquire_and_advance() - cute.copy(tma_k, tBgK[(None, Int32(1))], tBsK[(None, kvh.index)], tma_bar_ptr=kvh.barrier) - cute.copy(tma_v, tVgV[(None, Int32(1))], tVsV[(None, kvh.index)], tma_bar_ptr=kvh.barrier) + cute.copy(tma_k, tBgK[(None, kv_coord)], tBsK[(None, kvh.index)], tma_bar_ptr=kvh.barrier) + cute.copy(tma_v, tVgV[(None, kv_coord)], tVsV[(None, kvh.index)], tma_bar_ptr=kvh.barrier) + st.kv_coord_smem[0] = kv_coord + 1 # Write back to SMEM kvp.tail() # ===== MMA warp =====