Add graph replay debug prints for first 3 steps/layers

This commit is contained in:
2026-06-06 07:19:07 +00:00
parent 16b9a4def2
commit dcb2495a5b

View File

@@ -1877,9 +1877,21 @@ def main():
# Copy X into graph A input buffer (copy_ handles cross-GPU transfer)
graph_decoder.x_in_bufs[li].copy_(X)
# DEBUG: print max |x| of the graph A input buffer so an all-zero input is visible (first 3 steps, first 3 layers; nothing is asserted)
if step < 3 and li < 3:
torch.cuda.synchronize()
print(f" Replay L{li}: x_in |X|={graph_decoder.x_in_bufs[li].abs().max().item():.2f}", flush=True)
# Replay graph A: mHC pre_block + RMSNorm + q_a/q_b/kv projections
graph_decoder.graphs_a[li].replay()
# DEBUG: print max |x| of graph A's outputs (x_normed, q_heads, kv_3d) for the first 3 steps, first 3 layers
if step < 3 and li < 3:
torch.cuda.synchronize()
print(f" Replay L{li} GraphA: x_normed |X|={graph_decoder.x_normed_bufs[li].abs().max().item():.2f} "
f"q_heads |X|={graph_decoder.q_heads_bufs[li].abs().max().item():.2f} "
f"kv_3d |X|={graph_decoder.kv_3d_bufs[li].abs().max().item():.2f}", flush=True)
# ---- Eager attention (NOT captured) ----
# Read graph A outputs from pre-allocated buffers
x_normed = graph_decoder.x_normed_bufs[li]
@@ -1899,11 +1911,21 @@ def main():
# Write F_attn to graph B input buffer
graph_decoder.F_attn_bufs[li].copy_(F_attn)
# DEBUG: print max |x| of F_attn before it feeds graph B (first 3 steps, first 3 layers)
if step < 3 and li < 3:
torch.cuda.synchronize()
print(f" Replay L{li} F_attn |X|={F_attn.abs().max().item():.2f}", flush=True)
# Replay graph B: mHC post_block + FFN + MoE + SE
graph_decoder.graphs_b[li].replay()
# Read output from graph B
X = graph_decoder.x_out_bufs[li]
# DEBUG: print max |x| of graph B's output buffer x_out (first 3 steps, first 3 layers)
if step < 3 and li < 3:
torch.cuda.synchronize()
print(f" Replay L{li} GraphB: x_out |X|={X.abs().max().item():.2f}", flush=True)
# Transfer last layer output to cuda:0 for lm_head graph
graph_decoder.x_lm_in.copy_(X)