From 7751eab7114cf9ef856f396502934edca3ba145b Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 23 May 2026 03:35:49 +0000 Subject: [PATCH] D1 fix: P store uses PV A-fragment layout (p_tmem_s.outer) CRITICAL BUG: P was stored using QK C-fragment composition layout, but PV A-fragment reads using p_tmem_s.outer (PV A-operand layout). These layouts match at hd=64 (cos 0.999998) but diverge at hd>64 (cos 0.784 at hd=128). The fix: tStP0 and tScP now use p_tmem_s.outer instead of composition(tStS.layout, (128, p_cols_fp32)). This ensures the softmax writes P in the same layout that the PV GEMM expects. --- dsv4/kernels/attention/fmha.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py index 0cbd2a02..b10a4c1e 100644 --- a/dsv4/kernels/attention/fmha.py +++ b/dsv4/kernels/attention/fmha.py @@ -225,14 +225,14 @@ class FmhaKernel: # P store atoms p_cols_fp32 = self.pv_mma_tiler[2] * self.q_dtype.width // self.qk_acc_dtype.width - tStP_layout = cute.composition(tStS.layout, cute.make_layout((self.pv_mma_tiler[0], p_cols_fp32))) - tStP0 = cute.make_tensor(tStS.iterator + self.tmem_p0_offset, tStP_layout) + # P store must use the PV A-fragment layout (p_tmem_s.outer), not the QK C-fragment layout. + # At hd=64 these match by coincidence; at hd>64 they diverge, causing garbage PV output. + tStP0 = cute.make_tensor(tStS.iterator + self.tmem_p0_offset, p_tmem_s.outer) tmem_store_atom = cute.make_copy_atom(tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(32)), self.qk_acc_dtype) tiled_tmem_store = tcgen05.make_tmem_copy(tmem_store_atom, tStP0) thr_store = tiled_tmem_store.get_slice(sfw_idx) tTMEM_STOREtP = thr_store.partition_D(tStP0) - tScP_layout = cute.composition(tScS.layout, cute.make_layout((self.pv_mma_tiler[0], p_cols_fp32))) - tScP = cute.make_tensor(tScS.iterator, tScP_layout) + tScP = cute.make_tensor(tScS.iterator, p_tmem_s.outer) tTMEM_STOREcP = thr_store.partition_S(tScP) row_max = -Float32.inf