From 8deb987a3fa3f34ee6bef3bb1946cf7adca0a0ba Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Sat, 23 May 2026 05:16:19 +0000
Subject: [PATCH] Fix p_cols_fp32: use pv_mma_tiler[2] (K-dim) not [1] (N-dim)

---
 dsv4/kernels/attention/fmha.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index 47bfee28..d356ac25 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -346,7 +346,7 @@ class FmhaKernel:
 
         # ── P store setup (always define both paths — CuTeDSL scoping) ──
         # TMEM-P: register bridge for P → TMEM
-        p_cols_fp32 = self.pv_mma_tiler[1] * self.q_dtype.width // self.qk_acc_dtype.width
+        p_cols_fp32 = self.pv_mma_tiler[2] * self.q_dtype.width // self.qk_acc_dtype.width
         tStP_layout = cute.composition(tStS.layout, cute.make_layout((self.pv_mma_tiler[0], p_cols_fp32)))
         tStP0 = cute.make_tensor(tStS.iterator + self.tmem_p0_offset, tStP_layout)
         tmem_store_atom = cute.make_copy_atom(