diff --git a/tests/unit/test_fmha_v3_stage_c_full.py b/tests/unit/test_fmha_v3_stage_c_full.py index b0c3a3f3..235b44a9 100644 --- a/tests/unit/test_fmha_v3_stage_c_full.py +++ b/tests/unit/test_fmha_v3_stage_c_full.py @@ -38,6 +38,7 @@ class FmhaV3StageC: def __init__(self, s_k=128, scale_softmax=None): # s_k MUST equal actual sequence length n (compile-time constant for V layout). self.s_k = s_k + self.n_kv_tiles = s_k // 128 # Python int — needed for range() unrolling self.acc_dtype = Float32; self.qk_acc_dtype = Float32 self.q_dtype = BFloat16; self.o_dtype = BFloat16; self.c_dtype = BFloat16 self.use_2cta_instrs = False; self.epilog_sync_bar_id = 1 @@ -207,7 +208,7 @@ class FmhaV3StageC: # Python range() unrolls at trace time. Each iteration emits a # separate cute.copy with a distinct compile-time Int32 constant. # We proved Int32(1) hardcoded works — by induction Int32(k) works. - for kt in range(n_kv_tiles): + for kt in range(self.n_kv_tiles): coord = Int32(kt) kvh = kvp.acquire_and_advance(pk) cute.copy(tma_k, tBgK[(None, coord)], tBsK[(None, kvh.index)], tma_bar_ptr=kvh.barrier) @@ -224,7 +225,7 @@ class FmhaV3StageC: kvc.reset(); pk = kvc.try_wait() acc_st = pipeline.make_pipeline_state(pipeline.PipelineUserType.Producer, self.num_acc_stage) acc_pipe.producer_acquire(acc_st) - for kt in range(n_kv_tiles): + for kt in range(self.n_kv_tiles): kvh = kvc.wait_and_advance(pk); pk = cutlass.Boolean(1) sh = s_prod.acquire_and_advance() qk_mma.set(tcgen05.Field.ACCUMULATE, False) @@ -302,7 +303,7 @@ class FmhaV3StageC: row_sum = Float32(0.0) scale_log2 = Float32(self.scale_softmax_log2) - for kt in range(n_kv_tiles): + for kt in range(self.n_kv_tiles): si_handle = s_cons.wait_and_advance() # Load S[kt]