diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
index 23142913..64430b14 100644
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -186,6 +186,12 @@ class FmhaKernel:
         tCrP = pv_mma.make_fragment_A(sP)
         if self.use_smem_p:
             print(f"[SMEM-P DEBUG] tCrP shape: {cute.shape(tCrP)} layout: {tCrP.layout}")
+            # DEBUG (best-effort): print the tCrP-vs-sP iterator offset; only ordinary errors are swallowed.
+            try:
+                offset_elems = tCrP.iterator - sP.iterator
+                print(f"[SMEM-P DEBUG] tCrP iterator offset: {offset_elems}")
+            except Exception:
+                print("[SMEM-P DEBUG] iterator offset not available")
         # tOrP0 always defined as tOrP. The TMEM-P path in the MMA warp applies
         # the p0 column offset inline when constructing the gemm arguments.
         tOrP0 = tOrP