From eb69c3bfb90a534da7d2651ace376927bb1122af Mon Sep 17 00:00:00 2001 From: biondizzle Date: Wed, 3 Jun 2026 03:00:57 +0000 Subject: [PATCH] CRITICAL FIX: add missing tb base in QK TMEM read address prefill_read_qk_rows was reading from TMEM address sg_off + n * 8 (as if the base were 0) instead of tb + sg_off + n * 8. This caused garbage QK values, explaining the 0.928 cosine for T=1 and NaN for T>1. --- dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh b/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh index 57e413c6..53804acd 100644 --- a/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh +++ b/dsv4/kernels/attention/fmha_mixed_fp8_prefill.cuh @@ -106,7 +106,7 @@ __device__ void prefill_read_qk_rows(uint32_t tb, float* sLogits, asm volatile("tcgen05.ld.sync.aligned.32x32b.x8.b32 {%0,%1,%2,%3,%4,%5,%6,%7},[%8];" : "=f"(tmp[0]),"=f"(tmp[1]),"=f"(tmp[2]),"=f"(tmp[3]), "=f"(tmp[4]),"=f"(tmp[5]),"=f"(tmp[6]),"=f"(tmp[7]) - : "r"(sg_off + n * 8)); + : "r"(tb + sg_off + n * 8)); asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory"); int row = warp_row + lane;