diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py index 446a9340..2b3bfa5b 100644 --- a/dsv4/kernels/attention/fmha.py +++ b/dsv4/kernels/attention/fmha.py @@ -352,14 +352,14 @@ class FmhaKernel: cute.arch.fence_view_async_tmem_store() else: # SMEM-P: write P to sP using coordinate-indexed store. - # tTMEM_LOADcS contains (m, k) coordinates from identity tensor. - # Each element is an (m, k) coordinate pair. - # rP_bf16 has the same shape/layout as tTMEM_LOADcS (BF16 view of FP32 registers). for j0 in range(32): for j1 in range(4): coord = tTMEM_LOADcS[(j0, 0), j1, 0, 0] m_coord = coord[0] k_coord = coord[1] + # DEBUG: print first 8 coords from thread 0 + if sfw_idx == 0 and kt == 0 and j0 < 2 and j1 < 2: + print(f"[SMEM-P] j0={j0} j1={j1} m={m_coord} k={k_coord} P={rP_bf16[(j0, 0), j1, 0, 0]}") k0 = k_coord % 16 k1 = (k_coord // 16) % 4 k2 = k_coord // 64