diff --git a/tests/unit/test_fmha_v3_softmax.py b/tests/unit/test_fmha_v3_softmax.py
index 69d00f2f..a63df1ad 100644
--- a/tests/unit/test_fmha_v3_softmax.py
+++ b/tests/unit/test_fmha_v3_softmax.py
@@ -71,7 +71,7 @@ class FmhaV3Softmax:
         self.q_dtype = q.element_type; self.o_dtype = c.element_type; self.c_dtype = self.o_dtype
         self.a_major = LayoutEnum.from_tensor(q).mma_major_mode()
         self.b_major = LayoutEnum.from_tensor(k).mma_major_mode()
-        # s_k = cute.size(v, mode=[0])  # BROKEN in @cute.jit
+        # # s_k hardcoded  # BROKEN in @cute.jit
         # FMHA-style V: reconstruct as (HEAD_DIM, s_k, 1) MN-major
         v_fmha = cute.make_tensor(
             v.iterator,
@@ -373,9 +373,37 @@ class FmhaV3Softmax:
 
                 row_sum = row_sum + tile_sum
 
-            # --- C9: SKIPPED for debug (no normalization) ---
+            # --- C9: Final normalization via O TMEM rescale ---
             pv_done_bar.arrive_and_wait()
-            # O is unnormalized in TMEM. Use standard epilogue_tma_store with identity.
+            # Store final row_sum to TMEM vector (per-row, using QK partition)
+            tTMEM_STORE_VECrS_final = cute.make_rmem_tensor(tTMEM_STORE_VECcS.shape, self.qk_acc_dtype)
+            tTMEM_STORE_VECrS_final[0] = row_sum
+            cute.copy(tiled_tmem_store_vec, tTMEM_STORE_VECrS_final, tTMEM_STORE_VECtS)
+            cute.arch.fence_view_async_tmem_store()
+
+            # Read vector back: per-row row_sum using QK partition coordinates
+            tTMEM_LOAD_VECrS = cute.make_rmem_tensor(tTMEM_LOAD_VECcS.shape, self.qk_acc_dtype)
+            cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS, tTMEM_LOAD_VECrS)
+            cute.arch.fence_view_async_tmem_load()
+            inv_row_sum = cutlass.Float32(1.0) / tTMEM_LOAD_VECrS[0]
+
+            # Normalize O in TMEM
+            tTMrO_final = cute.make_rmem_tensor((tTMEM_LOADcO.shape, o_col_tiles), self.qk_acc_dtype)
+            for i in range(o_col_tiles):
+                tTMrO_i_ = tTMrO_final[None, i]
+                tTMrO_i_layout = cute.composition(tTMrO_i_.layout, cute.make_layout(tTMrO_final.shape[0]))
+                tTMrO_i = cute.make_tensor(tTMrO_i_.iterator, tTMrO_i_layout)
+                tTMEM_LOADtO_i = cute.make_tensor(
+                    tTMEM_LOADtO.iterator + i * corr_tile_size, tTMEM_LOADtO.layout)
+                tTMEM_STOREtO_i = cute.make_tensor(
+                    tTMEM_STOREtO.iterator + i * corr_tile_size, tTMEM_STOREtO.layout)
+                cute.copy(o_tiled_tmem_load, tTMEM_LOADtO_i, tTMrO_i)
+                for j in cutlass.range(cute.size(tTMrO_i), vectorize=True):
+                    tTMrO_i[j] = tTMrO_i[j] * inv_row_sum
+                cute.copy(o_tiled_tmem_store, tTMrO_i, tTMEM_STOREtO_i)
+            cute.arch.fence_view_async_tmem_store()
+
+            # Now O in TMEM is normalized. Use standard epilogue_tma_store with identity.
             tCtO_base = cute.make_tensor(tmem_ptr + self.tmem_o0_offset, tCtO_fake.layout)
             acc_cons_st = pipeline.make_pipeline_state(pipeline.PipelineUserType.Consumer, self.num_acc_stage)
             c_grp = pipeline.CooperativeGroup(pipeline.Agent.Thread, 32 * len(self.epilogue_warp_id))
@@ -401,8 +429,7 @@ def test():
         c = torch.zeros(m, hd, 1, dtype=torch.bfloat16, device="cuda")
         qf = q[:,:,0].float(); kf = k[:,:,0].float()
         attn = qf @ kf.T / math.sqrt(hd)
-        P = torch.exp(attn - attn.max(dim=-1, keepdim=True)[0])
-        ref = P @ v.float()  # unnormalized P@V
+        ref = torch.softmax(attn, dim=-1) @ v.float()
         mQ = ct.from_dlpack(q).mark_layout_dynamic(leading_dim=ct.get_leading_dim(q))
         mK = ct.from_dlpack(k).mark_layout_dynamic(leading_dim=ct.get_leading_dim(k))
         mV = ct.from_dlpack(v_kernel).mark_layout_dynamic(leading_dim=ct.get_leading_dim(v_kernel))
@@ -418,7 +445,40 @@ def test():
         out = c[:,:,0].float()
         cos = torch.nn.functional.cosine_similarity(out.flatten().unsqueeze(0), ref.flatten().unsqueeze(0)).item()
         max_err = (out - ref).abs().max().item()
-        print(f"FMHA softmax (no C9 norm) n={n}: cosine {cos:.6f} max_err {max_err:.6f} {'PASS' if cos >= 0.999 else 'FAIL'}", flush=True)
+        print(f"FMHA softmax n={n}: cosine {cos:.6f} max_err {max_err:.6f} {'PASS' if cos >= 0.999 else 'FAIL'}", flush=True)
+
+if __name__ == "__main__":
+    test()
+
+
+def test():
+    import math
+    torch.manual_seed(42)
+    for n in [128, 256, 384]:
+        m, hd = 128, HEAD_DIM
+        q = torch.randn(m, hd, 1, dtype=torch.bfloat16, device="cuda")
+        k = torch.randn(n, hd, 1, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(n, hd, dtype=torch.bfloat16, device="cuda")
+        v_kernel = v.unsqueeze(-1)
+        c = torch.zeros(m, hd, 1, dtype=torch.bfloat16, device="cuda")
+        qf = q[:,:,0].float(); kf = k[:,:,0].float()
+        attn = qf @ kf.T / math.sqrt(hd)
+        ref = torch.softmax(attn, dim=-1) @ v.float()
+        mQ = ct.from_dlpack(q).mark_layout_dynamic(leading_dim=ct.get_leading_dim(q))
+        mK = ct.from_dlpack(k).mark_layout_dynamic(leading_dim=ct.get_leading_dim(k))
+        mV = ct.from_dlpack(v_kernel).mark_layout_dynamic(leading_dim=ct.get_leading_dim(v_kernel))
+        mC = ct.from_dlpack(c).mark_layout_dynamic(leading_dim=ct.get_leading_dim(c))
+        stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+        kernel = FmhaV3Softmax()
+        print(f"n={n}: Compiling...", flush=True)
+        compiled = cute.compile(kernel, mQ, mK, mV, mC, stream)
+        print(f"n={n}: Running...", flush=True)
+        compiled(mQ, mK, mV, mC, stream)
+        torch.cuda.synchronize()
+        out = c[:,:,0].float()
+        cos = torch.nn.functional.cosine_similarity(out.flatten().unsqueeze(0), ref.flatten().unsqueeze(0)).item()
+        max_err = (out - ref).abs().max().item()
+        print(f"FMHA softmax n={n}: cosine {cos:.6f} max_err {max_err:.6f} {'PASS' if cos >= 0.999 else 'FAIL'}", flush=True)
 
 if __name__ == "__main__":
     test()