diff --git a/dsv4/kernels/attention/fmha_6warp_multirow.cuh b/dsv4/kernels/attention/fmha_6warp_multirow.cuh index e9567ee6..388bff9d 100644 --- a/dsv4/kernels/attention/fmha_6warp_multirow.cuh +++ b/dsv4/kernels/attention/fmha_6warp_multirow.cuh @@ -187,7 +187,7 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) { : "=f"(tmp[0]),"=f"(tmp[1]),"=f"(tmp[2]),"=f"(tmp[3]), "=f"(tmp[4]),"=f"(tmp[5]),"=f"(tmp[6]),"=f"(tmp[7]) : "r"(tb + n * 8)); - asm volatile("tcgen05.wait::ld.aligned;"); // NOTE: ld not ld.sync — some B200 builds need the dot + asm volatile("tcgen05.wait::ld.sync.aligned;"); // NOTE: the PTX ISA spells this tcgen05.wait::ld.sync.aligned — keep the .sync qualifier, do not shorten it back to wait::ld.aligned if (my_row_active) { for (int c = 0; c < 8; c++) { int col = n * 8 + c;