fix: tcgen05.wait::ld.sync.aligned (was missing 'sync')

This commit is contained in:
2026-05-28 23:19:03 +00:00
parent 0220e51d18
commit 52809b0ec6

View File

@@ -187,7 +187,7 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) {
: "=f"(tmp[0]),"=f"(tmp[1]),"=f"(tmp[2]),"=f"(tmp[3]),
"=f"(tmp[4]),"=f"(tmp[5]),"=f"(tmp[6]),"=f"(tmp[7])
: "r"(tb + n * 8));
asm volatile("tcgen05.wait::ld.aligned;"); // NOTE: ld not ld.sync — some B200 builds need the dot
asm volatile("tcgen05.wait::ld.sync.aligned;"); // NOTE: the .sync qualifier is required — the instruction is spelled tcgen05.wait::ld.sync.aligned; the previous form without .sync was wrong
if (my_row_active) {
for (int c = 0; c < 8; c++) {
int col = n * 8 + c;