diag: print FMHA output magnitude, q_heads/all_kv value ranges, and N/hd/scale for the first 3 layers
This commit is contained in:
@@ -353,6 +353,8 @@ def forward_attention(x_normed, w, li, cfg, rope_cos, rope_sin,
|
||||
|
||||
# 6. Production FMHA
|
||||
attn_out = _run_production_fmha(q_heads, all_kv, n_h, hd, T, seq_len, scale, dev, li, w, pfx)
|
||||
if li < 3:
|
||||
print(f" L{li} FMHA: |attn_out|={attn_out.abs().max().item():.6f} q_heads_range=[{q_heads.min().item():.3f},{q_heads.max().item():.3f}] all_kv_range=[{all_kv.min().item():.3f},{all_kv.max().item():.3f}] N={seq_len} hd={hd} scale={scale:.6f}", flush=True)
|
||||
# 7. Inverse RoPE
|
||||
attn_out = _apply_rope(attn_out, positions, rope_cos, rope_sin, rd, inverse=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user