D1.5 debug: force rescale_factor=0.5 to test if round-trip code executes

This commit is contained in:
2026-05-26 20:29:34 +00:00
parent 3be708d923
commit 34d64137ec

View File

@@ -558,7 +558,7 @@ class FmhaKernel:
pv_done_bar.arrive_and_wait() # Wait for PV[kt-1]
# Rescale O: load, multiply by acc_scale, store back to TMEM.
# CUTLASS pattern: both copies use same tOtO_i (composition-tiled).
rescale_factor = acc_scale
rescale_factor = Float32(0.5) # DEBUG: force known value
if const_expr(self.debug_noop_rescale):
rescale_factor = Float32(1.0)
n_slices = self.head_dim // corr_tile_size