D1: DEBUG - NO-OP O rescale (multiply by 1.0) to test TMEM round-trip
This commit is contained in:
@@ -465,7 +465,7 @@ class FmhaKernel:
|
||||
)
|
||||
cute.copy(tiled_tmem_load_o, tTMEM_LOADtO_i, tTMrO_i)
|
||||
for k in cutlass.range(cute.size(tTMrO_i), vectorize=True):
|
||||
- tTMrO_i[k] = tTMrO_i[k] * acc_scale
|
||||
+ tTMrO_i[k] = tTMrO_i[k] * Float32(1.0) # DEBUG: NO-OP round-trip test
|
||||
cute.copy(tiled_tmem_store_o, tTMrO_i, tTMEM_STOREtO_i)
|
||||
cute.arch.fence_view_async_tmem_store()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user