D1.5 debug: force rescale_factor=0.5 to test whether the O-rescale round-trip code (load from TMEM, multiply, store back) executes
This commit is contained in:
@@ -558,7 +558,7 @@ class FmhaKernel:
|
||||
pv_done_bar.arrive_and_wait() # Wait for PV[kt-1]
|
||||
# Rescale O: load, multiply by acc_scale, store back to TMEM.
|
||||
# CUTLASS pattern: both copies use same tOtO_i (composition-tiled).
|
||||
- rescale_factor = acc_scale
|
||||
+ rescale_factor = Float32(0.5) # DEBUG: force known value
|
||||
if const_expr(self.debug_noop_rescale):
|
||||
rescale_factor = Float32(1.0)
|
||||
n_slices = self.head_dim // corr_tile_size
|
||||
|
||||
Reference in New Issue
Block a user