debug: add TMEM read diagnostic
This commit is contained in:
@@ -270,6 +270,9 @@ fmha_decode_tmem(
|
||||
float r2 = u32_to_f32(u2) * inv_sum;
|
||||
float r3 = u32_to_f32(u3) * inv_sum;
|
||||
|
||||
// Debug: print r0..r3 and inv_sum, but only from lane 0 of column 0
|
||||
if (lane == 0 && col == 0) printf("[tmem] read: r0=%f r1=%f r2=%f r3=%f inv_sum=%f\n", r0, r1, r2, r3, inv_sum);
|
||||
|
||||
// Step 4: Cast to BF16 and write to GMEM
|
||||
int base = col * 128;
|
||||
int d0 = base + lane * 4 + 0;
|
||||
|
||||
Reference in New Issue
Block a user