fix: cute.copy(dst, src) order

This commit is contained in:
2026-05-23 03:51:00 +00:00
parent 6996abcef3
commit 373c395810

View File

@@ -250,7 +250,7 @@ class FmhaKernel:
for j in cutlass.range(cute.size(rP_bf16_reg), vectorize=True):
# TODO: proper element mapping from QK→PV partition
rP_bf16_reg[j] = BFloat16(0.0)
cute.copy(rP_bf16_reg, tCrP_smem)
cute.copy(tCrP_smem, rP_bf16_reg)
cute.arch.fence_proxy("async.shared", space="cta")
si_handle.release()