From 52809b0ec658776991100ffa9c05bb64000321f3 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 28 May 2026 23:19:03 +0000 Subject: [PATCH] fix: tcgen05.wait::ld.sync.aligned (was missing 'sync') --- dsv4/kernels/attention/fmha_6warp_multirow.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsv4/kernels/attention/fmha_6warp_multirow.cuh b/dsv4/kernels/attention/fmha_6warp_multirow.cuh index e9567ee6..388bff9d 100644 --- a/dsv4/kernels/attention/fmha_6warp_multirow.cuh +++ b/dsv4/kernels/attention/fmha_6warp_multirow.cuh @@ -187,7 +187,7 @@ fmha_6warp_multirow_kernel(FmhaMultiRowParams params) { : "=f"(tmp[0]),"=f"(tmp[1]),"=f"(tmp[2]),"=f"(tmp[3]), "=f"(tmp[4]),"=f"(tmp[5]),"=f"(tmp[6]),"=f"(tmp[7]) : "r"(tb + n * 8)); - asm volatile("tcgen05.wait::ld.aligned;"); // NOTE: ld not ld.sync — some B200 builds need the dot + asm volatile("tcgen05.wait::ld.sync.aligned;"); // NOTE: the .sync qualifier is required here (wait::ld.sync.aligned, not wait::ld.aligned) if (my_row_active) { for (int c = 0; c < 8; c++) { int col = n * 8 + c;