# Patch summary: one hunk in dsv4/kernels/attention/fmha.py (hunk context: class FmhaKernel).
# Inside the vectorized loop over cute.size(tTMrO_i), the per-element update changes from
#   tTMrO_i[k] * acc_scale   to   tTMrO_i[k] * Float32(1.0)
# The loop sits between a cute.copy using tiled_tmem_load_o and a cute.copy using
# tiled_tmem_store_o, so with this patch the values are stored back multiplied by 1.0
# instead of by acc_scale.
# NOTE(review): the patch's own comment labels this "DEBUG: NO-OP round-trip test";
# presumably a temporary diagnostic. It removes the acc_scale factor from the stored
# values, so confirm it is not meant to be merged as-is.
# NOTE(review): the line breaks of this patch appear to have been collapsed onto a single
# line in this copy; the original line structure is needed before it can be applied.
diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py index f42693d4..1580525f 100644 --- a/dsv4/kernels/attention/fmha.py +++ b/dsv4/kernels/attention/fmha.py @@ -465,7 +465,7 @@ class FmhaKernel: ) cute.copy(tiled_tmem_load_o, tTMEM_LOADtO_i, tTMrO_i) for k in cutlass.range(cute.size(tTMrO_i), vectorize=True): - tTMrO_i[k] = tTMrO_i[k] * acc_scale + tTMrO_i[k] = tTMrO_i[k] * Float32(1.0) # DEBUG: NO-OP round-trip test cute.copy(tiled_tmem_store_o, tTMrO_i, tTMEM_STOREtO_i) cute.arch.fence_view_async_tmem_store()