diff --git a/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh b/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh index 57e413c6..53804acd 100644 --- a/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh +++ b/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh @@ -106,7 +106,7 @@ __device__ void prefill_read_qk_rows(uint32_t tb, float* sLogits, asm volatile("tcgen05.ld.sync.aligned.32x32b.x8.b32 {%0,%1,%2,%3,%4,%5,%6,%7},[%8];" : "=f"(tmp[0]),"=f"(tmp[1]),"=f"(tmp[2]),"=f"(tmp[3]), "=f"(tmp[4]),"=f"(tmp[5]),"=f"(tmp[6]),"=f"(tmp[7]) - : "r"(sg_off + n * 8)); + : "r"(tb + sg_off + n * 8)); asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory"); int row = warp_row + lane;