fix: cute.copy(dst, src) order
This commit is contained in:
@@ -250,7 +250,7 @@ class FmhaKernel:
|
||||
for j in cutlass.range(cute.size(rP_bf16_reg), vectorize=True):
|
||||
# TODO: proper element mapping from QK→PV partition
|
||||
rP_bf16_reg[j] = BFloat16(0.0)
|
||||
- cute.copy(rP_bf16_reg, tCrP_smem)
|
||||
+ cute.copy(tCrP_smem, rP_bf16_reg)
|
||||
cute.arch.fence_proxy("async.shared", space="cta")
|
||||
|
||||
si_handle.release()
|
||||
|
||||
Reference in New Issue
Block a user