feat: SMEM-P via gP→TMA→sP path (clean implementation)

This commit is contained in:
2026-05-24 03:42:11 +00:00
parent 18ab507896
commit 248a827d0d

View File

@@ -89,7 +89,7 @@ class FmhaKernel:
cute.size_in_bytes(self.q_dtype, v_s)) * cta
@cute.jit
def __call__(self, q, k, v, c, stream, lse=None):
def __call__(self, q, k, v, c, stream, lse=None, gP=None):
self.q_dtype = q.element_type; self.o_dtype = c.element_type; self.c_dtype = self.o_dtype
self.a_major = LayoutEnum.from_tensor(q).mma_major_mode()
self.b_major = LayoutEnum.from_tensor(k).mma_major_mode()
@@ -113,19 +113,31 @@ class FmhaKernel:
tma_v,mV = cute.nvgpu.make_tiled_tma_atom_B(utils.sm100.cluster_shape_to_tma_atom_B(self.cluster_shape_mn,pv_mma.thr_id),v_fmha,v_s,self.pv_mma_tiler,pv_mma,self.cluster_layout_vmnk.shape)
epi_s = cute.select(self.c_smem_s,mode=[0,1])
tma_c,mC = cpasync.make_tiled_tma_atom(cpasync.CopyBulkTensorTileS2GOp(),c,epi_s,self.epi_tile)
# SMEM-P: build the TMA atom that loads P from GMEM into SMEM (p_smem_s, stage 0 slice). gP is passed by the caller.
if const_expr(self.use_smem_p):
p_s = cute.slice_(self.p_smem_s,(None,None,None,0))
tma_p,gP = cute.nvgpu.make_tiled_tma_atom_A(
utils.sm100.cluster_shape_to_tma_atom_A(self.cluster_shape_mn, pv_mma.thr_id),
gP, p_s, self.qk_mma_tiler, pv_mma, self.cluster_layout_vmnk.shape
)
else:
tma_p = tma_q # dummy, dead code
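# Always pass a valid mLSE tensor to the kernel:
# CuTeDSL doesn't support None parameters in @cute.kernel.
# When the caller supplies no lse, substitute a 1-element, stride-0
# placeholder tensor that reuses c's iterator.
# For normalize=True, mLSE is unused (dead-code-eliminated by the compiler).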
if const_expr(lse is None):
lse = cute.make_tensor(c.iterator, cute.make_layout((1,), stride=(0,)))
self._kernel(qk_mma,pv_mma,tma_q,mQ,tma_k,mK,tma_v,mV,tma_c,mC,self.cluster_layout_vmnk,self.q_smem_s,self.k_smem_s,self.v_smem_s,self.p_tmem_s,self.p_smem_s,self.c_smem_s,self.epi_tile,lse).launch(grid=(1,1,1),block=[self.threads_per_cta,1,1],stream=stream)
self._kernel(qk_mma,pv_mma,tma_q,mQ,tma_k,mK,tma_v,mV,tma_c,mC,tma_p,gP,self.cluster_layout_vmnk,self.q_smem_s,self.k_smem_s,self.v_smem_s,self.p_tmem_s,self.p_smem_s,self.c_smem_s,self.epi_tile,lse).launch(grid=(1,1,1),block=[self.threads_per_cta,1,1],stream=stream)
@cute.kernel
def _kernel(self, qk_mma, pv_mma, tma_q, mQ, tma_k, mK, tma_v, mV, tma_c, mC, cl_vmnk, q_smem_s, k_smem_s, v_smem_s, p_tmem_s, p_smem_s, c_smem_s, epi_tile, mLSE):
def _kernel(self, qk_mma, pv_mma, tma_q, mQ, tma_k, mK, tma_v, mV, tma_c, mC, tma_p, mGP, cl_vmnk, q_smem_s, k_smem_s, v_smem_s, p_tmem_s, p_smem_s, c_smem_s, epi_tile, mLSE):
warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
tidx,_,_ = cute.arch.thread_idx()
if warp_idx == self.tma_warp_id:
cpasync.prefetch_descriptor(tma_q); cpasync.prefetch_descriptor(tma_k); cpasync.prefetch_descriptor(tma_v); cpasync.prefetch_descriptor(tma_c)
if const_expr(self.use_smem_p):
cpasync.prefetch_descriptor(tma_p)
@cute.struct
class SS:
@@ -225,6 +237,12 @@ class FmhaKernel:
cute.arch.fence_view_async_tmem_store()
sh.commit()
softmax_done_bar.arrive_and_wait()
# SMEM-P: TMA load gP → sP after softmax writes gP
if const_expr(self.use_smem_p):
tPgP, tPsP = cpasync.tma_partition(tma_p, 0, cute.nvgpu.OperandMajorMode.M, cute.group_modes(sP,0,3), cute.group_modes(mGP,0,3))
cute.copy(tma_p, tPsP[(None,0,None,0)], tPgP[(None,0,None,0)], tma_bar_ptr=st.s_bar.data_ptr())
cpasync.commit_group()
cpasync.wait_group(0)
pv_mma.set(tcgen05.Field.ACCUMULATE, kt != 0)
if not self.use_smem_p:
# TMEM-P: PV reads P from TMEM
@@ -368,16 +386,17 @@ class FmhaKernel:
cute.copy(tiled_tmem_store, rP_words, tTMEM_STOREtP)
cute.arch.fence_view_async_tmem_store()
else:
# SMEM-P: write P to sP using coordinate-indexed store.
for j0 in range(32):
for j1 in range(4):
coord = tTMEM_LOADcS[(j0, 0), j1, 0, 0]
m_coord = coord[0]
k_coord = coord[1]
k0 = k_coord % 16
k1 = (k_coord // 16) % 4
k2 = k_coord // 64
_sP_nostage[(m_coord, k0), 0, (k1, k2)] = rP_bf16[(j0, 0), j1, 0, 0]
# SMEM-P: write P to gP (global memory); TMA then loads gP→sP.
# rP_bf16 and gP's partition are both derived from the QK C-fragment,
# so they share the same thread→value mapping and an element-wise copy works.
gP_tile = cute.local_tile(mGP, (128, self.s_k), (0, 0))
tCgP = qk_thr.partition_C(gP_tile)
# Copy rP_bf16 → tCgP element-by-element (both 128 values per thread)
rP_flat = cute.make_tensor(rP_bf16.iterator, cute.coalesce(rP_bf16.layout))
gP_flat = cute.make_tensor(tCgP.iterator, cute.coalesce(tCgP.layout))
for idx in cutlass.range(cute.size(rP_flat), vectorize=True):
gP_flat[idx] = rP_flat[idx]
cute.arch.fence_proxy("async.global", space="cta")
cute.arch.fence_proxy("async.shared", space="cta")
if kt > 0:
for i in range(n_corr_tiles):