diag: print FMHA output magnitude for first 3 layers

This commit is contained in:
2026-06-01 01:34:02 +00:00
parent 7fbbdc5204
commit efe63caea9

View File

@@ -353,6 +353,8 @@ def forward_attention(x_normed, w, li, cfg, rope_cos, rope_sin,
# 6. Production FMHA
attn_out = _run_production_fmha(q_heads, all_kv, n_h, hd, T, seq_len, scale, dev, li, w, pfx)
if li < 3:
print(f" L{li} FMHA: |attn_out|={attn_out.abs().max().item():.6f} q_heads_range=[{q_heads.min().item():.3f},{q_heads.max().item():.3f}] all_kv_range=[{all_kv.min().item():.3f},{all_kv.max().item():.3f}] N={seq_len} hd={hd} scale={scale:.6f}", flush=True)
# 7. Inverse RoPE
attn_out = _apply_rope(attn_out, positions, rope_cos, rope_sin, rd, inverse=True)