From e397386ba2643297078cc2149cc8026ba67bd20d Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Sat, 23 May 2026 05:18:37 +0000
Subject: [PATCH] Fix TMEM-P offset calc: match Stage C with p_cols_fp32 from pv_mma_tiler[2]

---
 dsv4/kernels/attention/fmha.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index 6f9b3c98..667318a2 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -72,8 +72,10 @@ class FmhaKernel:
             self.tmem_s0_offset = 0
             self.tmem_p0_offset = 32
             s_cols = self.qk_mma_tiler[1]
-            p_cols = self.pv_mma_tiler[1] * self.q_dtype.width // self.qk_acc_dtype.width
-            self.tmem_o0_offset = max(s_cols, p_cols)
+            p_cols_fp32 = self.pv_mma_tiler[2] * self.q_dtype.width // self.qk_acc_dtype.width
+            p_end = self.tmem_p0_offset + p_cols_fp32
+            o_after = max(s_cols, p_end)
+            self.tmem_o0_offset = ((o_after + 31) // 32) * 32
             o_cols = find_tmem_tensor_col_offset(tOtO)
             total = self.tmem_o0_offset + o_cols
         else: