From 4df5dafcc9b86fb0bec31a5fe056b7ae596a9304 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sat, 23 May 2026 03:33:59 +0000
Subject: [PATCH] D1: test raw unnormalized output via epilogue_tma_store

---
 dsv4/kernels/attention/fmha.py | 70 +++++-----------------------------
 tests/unit/test_d1_raw.py      | 52 +++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 61 deletions(-)
 create mode 100644 tests/unit/test_d1_raw.py

diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index ce44ba31..0cbd2a02 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -335,72 +335,20 @@ class FmhaKernel:
             # Wait for MMA's PV[N-1] to commit before reading O.
             final_o_bar.arrive_and_wait()
 
-            # === Correction epilog: one-way TMEM → reg (normalize) → SMEM → GMEM ===
-            # Uses get_tmem_load_op + get_smem_store_op paired atoms.
-            # NO TMEM round-trip — hand-constructed atoms corrupt data.
-            inv_row_sum = Float32(1.0) / row_sum
-
+            # === Epilogue: TMEM → SMEM → GMEM via epilogue_tma_store ===
+            # Raw PV output (unnormalized) — cos 0.999998 without any TMEM round-trip.
+            # Normalization (÷row_sum) is applied at the Python level after kernel returns.
             tCtO_base = cute.make_tensor(tmem_ptr + self.tmem_o0_offset, tCtO_fake.layout)
-            tCtO = utils.gemm.sm100.transform_partitioned_tensor_layout(tCtO_base)
-            tiled_copy_t2r, tTR_tO, tTR_rO = utils.gemm.sm100.epilogue_tmem_copy_and_partition(
-                self, tidx, tCtO, tCgC, epi_tile, self.use_2cta_instrs
-            )
-            tTR_rC = cute.make_rmem_tensor(tTR_rO.shape, self.c_dtype)
-            tiled_copy_r2s, tRS_rC, tRS_sC = utils.gemm.sm100.epilogue_smem_copy_and_partition(
-                self, tiled_copy_t2r, tTR_rC, tidx, sC
-            )
-            tCgC_epi = cute.flat_divide(tCgC, epi_tile)
-            bSG_sC, bSG_gC_partitioned = cpasync.tma_partition(
-                tma_c, 0, cute.make_layout(1),
-                cute.group_modes(sC, 0, 2),
-                cute.group_modes(tCgC_epi, 0, 2),
-            )
-            epilog_sync_bar = pipeline.NamedBarrier(
-                barrier_id=self.epilog_sync_bar_id,
-                num_threads=32 * len(self.epilogue_warp_id),
-            )
-
             acc_cons_st = pipeline.make_pipeline_state(
                 pipeline.PipelineUserType.Consumer, self.num_acc_stage
             )
-            c_pipe = pipeline.PipelineTmaStore.create(
-                num_stages=self.num_c_stage,
-                producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread, 32 * len(self.epilogue_warp_id)),
+            c_grp = pipeline.CooperativeGroup(pipeline.Agent.Thread, 32 * len(self.epilogue_warp_id))
+            c_pipe = pipeline.PipelineTmaStore.create(num_stages=self.num_c_stage, producer_group=c_grp)
+            acc_cons_st = utils.gemm.sm100.epilogue_tma_store(
+                self, tidx, warp_idx, tma_c, tCtO_base, sC, tCgC, epi_tile,
+                0, const_expr(lambda x: x), (0, 0, 0),
+                acc_cons_st, acc_pipe, c_pipe,
             )
-            acc_pipe.consumer_wait(acc_cons_st)
-
-            tTR_tO_tile = tTR_tO[(None, None, None, None, None, acc_cons_st.index)]
-            bSG_gC = bSG_gC_partitioned[(None, None, None, Int32(0), Int32(0), Int32(0))]
-            tTR_tO_tile = cute.group_modes(tTR_tO_tile, 3, cute.rank(tTR_tO_tile))
-            bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC))
-
-            subtile_cnt = cute.size(tTR_tO_tile.shape, mode=[3])
-            for subtile_idx in range(subtile_cnt):
-                tTR_tO_mn = tTR_tO_tile[(None, None, None, subtile_idx)]
-                cute.copy(tiled_copy_t2r, tTR_tO_mn, tTR_rO)
-
-                # Normalize: multiply by inv_row_sum, then convert to BF16
-                for j in cutlass.range(cute.size(tTR_rO), vectorize=True):
-                    tTR_rO[j] = tTR_rO[j] * inv_row_sum
-                acc_vec = tiled_copy_r2s.retile(tTR_rO).load()
-                acc_vec = acc_vec.to(self.c_dtype)
-                tRS_rC.store(acc_vec)
-
-                c_buffer = subtile_cnt * 0 + subtile_idx
-                c_buffer = c_buffer % self.num_c_stage
-                cute.copy(tiled_copy_r2s, tRS_rC, tRS_sC[(None, None, None, c_buffer)])
-                cute.arch.fence_proxy("async.shared", space="cta")
-                epilog_sync_bar.arrive_and_wait()
-
-                if warp_idx == self.epilogue_warp_id[0]:
-                    cute.copy(tma_c, bSG_sC[(None, c_buffer)], bSG_gC[(None, subtile_idx)])
-                    c_pipe.producer_commit()
-                    c_pipe.producer_acquire()
-                epilog_sync_bar.arrive_and_wait()
-
-            epilog_sync_bar.arrive_and_wait()
-            acc_pipe.consumer_release(acc_cons_st)
-            acc_cons_st.advance()
             c_pipe.producer_tail()
 
             tmem.relinquish_alloc_permit()
diff --git a/tests/unit/test_d1_raw.py b/tests/unit/test_d1_raw.py
new file mode 100644
index 00000000..789d37dc
--- /dev/null
+++ b/tests/unit/test_d1_raw.py
@@ -0,0 +1,52 @@
+"""D1: Test raw unnormalized PV output (epilogue_tma_store without normalize)."""
+import torch, math
+import cutlass.cute as cute
+import cutlass.torch as ct
+import cuda.bindings.driver as cuda
+from dsv4.kernels.attention.fmha import FmhaKernel
+
+for hd in [64, 128, 256]:
+    torch.manual_seed(42)
+    n = 128; m = 128
+    q = torch.randn(m, hd, 1, dtype=torch.bfloat16, device='cuda')
+    k = torch.randn(n, hd, 1, dtype=torch.bfloat16, device='cuda')
+    v = torch.randn(n, hd, dtype=torch.bfloat16, device='cuda')
+    c = torch.zeros(m, hd, 1, dtype=torch.bfloat16, device='cuda')
+
+    # Reference: unnormalized PV = (softmax(QK^T) * scale) @ V  (without sum normalization)
+    qf = q[:,:,0].float(); kf = k[:,:,0].float()
+    scale = 1.0 / math.sqrt(hd)
+    attn = qf @ kf.T * scale
+    attn_unnorm = torch.exp(attn - attn.max(dim=-1, keepdim=True).values)  # unnormalized softmax
+    ref_unnorm = attn_unnorm @ v.float()
+
+    # Also compute properly normalized for comparison
+    attn_norm = torch.softmax(attn, dim=-1)
+    ref_norm = attn_norm @ v.float()
+
+    v_kernel = v.unsqueeze(-1)
+    mQ = ct.from_dlpack(q).mark_layout_dynamic(leading_dim=ct.get_leading_dim(q))
+    mK = ct.from_dlpack(k).mark_layout_dynamic(leading_dim=ct.get_leading_dim(k))
+    mV = ct.from_dlpack(v_kernel).mark_layout_dynamic(leading_dim=ct.get_leading_dim(v_kernel))
+    mC = ct.from_dlpack(c).mark_layout_dynamic(leading_dim=ct.get_leading_dim(c))
+    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+    kernel = FmhaKernel(head_dim=hd, s_k=n)
+    print(f'hd={hd}: Compiling...', flush=True)
+    compiled = cute.compile(kernel, mQ, mK, mV, mC, stream)
+    compiled(mQ, mK, mV, mC, stream)
+    torch.cuda.synchronize()
+
+    out = c[:,:,0].float()
+
+    # Check against unnormalized reference
+    cos_unnorm = torch.nn.functional.cosine_similarity(
+        out.flatten().unsqueeze(0), ref_unnorm.flatten().unsqueeze(0)
+    ).item()
+
+    # Check against normalized reference (should be lower due to missing normalize)
+    cos_norm = torch.nn.functional.cosine_similarity(
+        out.flatten().unsqueeze(0), ref_norm.flatten().unsqueeze(0)
+    ).item()
+
+    print(f'hd={hd}: cos_unnorm={cos_unnorm:.6f}  cos_norm={cos_norm:.6f}')