From b50f6a85120e5c83a6fbc03e1af8f1efb0ba3531 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 28 May 2026 07:46:15 +0000 Subject: [PATCH] debug: add TMEM read diagnostic --- dsv4/kernels/attention/fmha_epilogue_sm100.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dsv4/kernels/attention/fmha_epilogue_sm100.cuh b/dsv4/kernels/attention/fmha_epilogue_sm100.cuh index 0c04d696..988bcfba 100644 --- a/dsv4/kernels/attention/fmha_epilogue_sm100.cuh +++ b/dsv4/kernels/attention/fmha_epilogue_sm100.cuh @@ -270,6 +270,9 @@ fmha_decode_tmem( float r2 = u32_to_f32(u2) * inv_sum; float r3 = u32_to_f32(u3) * inv_sum; + // Debug: print first few values from lane 0 + if (lane == 0 && col == 0) printf("[tmem] read: r0=%f r1=%f r2=%f r3=%f inv_sum=%f\n", r0, r1, r2, r3, inv_sum); + // Step 4: Cast to BF16 and write to GMEM int base = col * 128; int d0 = base + lane * 4 + 0;