diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py index 5cf57188..a0e81c94 100644 --- a/dsv4/kernels/attention/fmha.py +++ b/dsv4/kernels/attention/fmha.py @@ -558,7 +558,7 @@ class FmhaKernel: pv_done_bar.arrive_and_wait() # Wait for PV[kt-1] # Rescale O: load, multiply by acc_scale, store back to TMEM. # CUTLASS pattern: both copies use same tOtO_i (composition-tiled). - rescale_factor = acc_scale + rescale_factor = Float32(0.5) # DEBUG: force known value if const_expr(self.debug_noop_rescale): rescale_factor = Float32(1.0) n_slices = self.head_dim // corr_tile_size