diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index f42693d4..1580525f 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -465,7 +465,7 @@ class FmhaKernel:
                             )
                             cute.copy(tiled_tmem_load_o, tTMEM_LOADtO_i, tTMrO_i)
                             for k in cutlass.range(cute.size(tTMrO_i), vectorize=True):
-                                tTMrO_i[k] = tTMrO_i[k] * acc_scale
+                                tTMrO_i[k] = tTMrO_i[k] * Float32(1.0)  # DEBUG: NO-OP round-trip test
                             cute.copy(tiled_tmem_store_o, tTMrO_i, tTMEM_STOREtO_i)
                         cute.arch.fence_view_async_tmem_store()