diff --git a/tests/fmha_v3_stage_c_example9.py b/tests/fmha_v3_stage_c_example9.py index d2f8a54b..ba1e6fa2 100644 --- a/tests/fmha_v3_stage_c_example9.py +++ b/tests/fmha_v3_stage_c_example9.py @@ -323,6 +323,7 @@ class FmhaV3StageCMulti: # Per-tile softmax loop with online rescale. for kt in range(n_kv_tiles): si_handle = s_cons.wait_and_advance() + cute.printf("SOFTMAX kt=%d row_max_before=%f row_sum_before=%f\n", kt, row_max, row_sum) # Load S[kt] tTMEM_LOADrS = cute.make_rmem_tensor(tTMEM_LOADcS.shape, self.qk_acc_dtype) @@ -397,6 +398,7 @@ class FmhaV3StageCMulti: cute.arch.fence_view_async_tmem_store() si_handle.release() + cute.printf("SOFTMAX kt=%d row_max_after=%f row_sum_after=%f\n", kt, row_max, row_sum) softmax_done_bar.arrive() # Wait for MMA's PV[N-1] to commit before reading O for normalize.