Add compact per-layer residual trace (GROWTH_DIAG), disable verbose ATTN_DIAG
This commit is contained in:
@@ -65,8 +65,8 @@ SKIP_ROUTED_MOE = _args.skip_moe # If True, only use shared expert (debug)
|
||||
INVERSE_ROPE = not _args.no_inverse_rope # If False, skip inverse RoPE on attention output (diagnostic)
|
||||
SKIP_MHC = _args.skip_mhc # If True, bypass mHC and use simple residual connections (diagnostic)
|
||||
MHC_DIAG = False # If True, print per-layer mHC diagnostics (B_l row/col sums, C_l values)
|
||||
GROWTH_DIAG = True # If True, print per-layer residual growth analysis
|
||||
ATTN_DIAG = True # If True, print per-layer attention entropy
|
||||
GROWTH_DIAG = True # If True, print compact per-layer residual trace
|
||||
ATTN_DIAG = False # If True, print per-layer attention entropy (expensive)
|
||||
# When True: applies inverse RoPE at query position → converts absolute→relative
|
||||
# When False: leaves relative position encoding intact for output projection
|
||||
# DSV4 partial RoPE only affects last 64/512 dims; first 448 are always un-RoPE'd
|
||||
@@ -605,6 +605,13 @@ def forward_layer(X_l, w, li, cfg, rope_cos, rope_sin,
|
||||
B_l_ffn, C_l_ffn = ffn_ctx.B_l, ffn_ctx.C_l
|
||||
print(f" L{li} ffn: |X_mid|={X_mid.abs().max().item():.2f} |F_ffn|={F_ffn.abs().max().item():.2f} |B|={B_l_ffn.abs().max().item():.4f} |C|={C_l_ffn.abs().max().item():.4f} |X_next|={X_next.abs().max().item():.2f}", flush=True)
|
||||
|
||||
if GROWTH_DIAG:
|
||||
x_max = X_next.abs().max().item()
|
||||
xi_max = X_l.abs().max().item()
|
||||
fa_max = F_attn.abs().max().item()
|
||||
ff_max = F_ffn.abs().max().item()
|
||||
print(f" L{li}: |X|={xi_max:.1f}→{x_max:.1f} |Fa|={fa_max:.1f} |Ff|={ff_max:.1f}")
|
||||
|
||||
return X_next
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user