D1: DEBUG - NO-OP O rescale (multiply by 1.0) to test TMEM round-trip

This commit is contained in:
2026-05-24 22:19:16 +00:00
parent c33185ca0a
commit 18f3274c0b

View File

@@ -465,7 +465,7 @@ class FmhaKernel:
)
cute.copy(tiled_tmem_load_o, tTMEM_LOADtO_i, tTMrO_i)
for k in cutlass.range(cute.size(tTMrO_i), vectorize=True):
-tTMrO_i[k] = tTMrO_i[k] * acc_scale
+tTMrO_i[k] = tTMrO_i[k] * Float32(1.0) # DEBUG: NO-OP round-trip test
cute.copy(tiled_tmem_store_o, tTMrO_i, tTMEM_STOREtO_i)
cute.arch.fence_view_async_tmem_store()