diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index 0cbd2a02..b10a4c1e 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -225,14 +225,14 @@ class FmhaKernel:
 
             # P store atoms
             p_cols_fp32 = self.pv_mma_tiler[2] * self.q_dtype.width // self.qk_acc_dtype.width
-            tStP_layout = cute.composition(tStS.layout, cute.make_layout((self.pv_mma_tiler[0], p_cols_fp32)))
-            tStP0 = cute.make_tensor(tStS.iterator + self.tmem_p0_offset, tStP_layout)
+            # P store must use the PV A-fragment layout (p_tmem_s.outer), not the QK C-fragment layout.
+            # At hd=64 these match by coincidence; at hd>64 they diverge, causing garbage PV output.
+            tStP0 = cute.make_tensor(tStS.iterator + self.tmem_p0_offset, p_tmem_s.outer)
             tmem_store_atom = cute.make_copy_atom(tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(32)), self.qk_acc_dtype)
             tiled_tmem_store = tcgen05.make_tmem_copy(tmem_store_atom, tStP0)
             thr_store = tiled_tmem_store.get_slice(sfw_idx)
             tTMEM_STOREtP = thr_store.partition_D(tStP0)
-            tScP_layout = cute.composition(tScS.layout, cute.make_layout((self.pv_mma_tiler[0], p_cols_fp32)))
-            tScP = cute.make_tensor(tScS.iterator, tScP_layout)
+            tScP = cute.make_tensor(tScS.iterator, p_tmem_s.outer)
             tTMEM_STOREcP = thr_store.partition_S(tScP)
 
             row_max = -Float32.inf