fix: tcgen05.wait::ld.sync.aligned (was missing 'sync')
This commit is contained in:
@@ -187,7 +187,7 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) {
|
||||
: "=f"(tmp[0]),"=f"(tmp[1]),"=f"(tmp[2]),"=f"(tmp[3]),
|
||||
"=f"(tmp[4]),"=f"(tmp[5]),"=f"(tmp[6]),"=f"(tmp[7])
|
||||
: "r"(tb + n * 8));
|
||||
asm volatile("tcgen05.wait::ld.aligned;"); // NOTE: ld not ld.sync — some B200 builds need the dot
|
||||
asm volatile("tcgen05.wait::ld.sync.aligned;"); // NOTE: the .sync qualifier is part of the PTX syntax (tcgen05.wait::ld.sync.aligned); the earlier form without .sync was the bug
|
||||
if (my_row_active) {
|
||||
for (int c = 0; c < 8; c++) {
|
||||
int col = n * 8 + c;
|
||||
|
||||
Reference in New Issue
Block a user