Add graph replay debug prints for the first 3 steps and first 3 layers
This commit is contained in:
@@ -1877,9 +1877,21 @@ def main():
|
||||
# Copy X into graph A input buffer (copy_ handles cross-GPU transfer)
|
||||
graph_decoder.x_in_bufs[li].copy_(X)
|
||||
|
||||
# DEBUG: print max |x| of the graph A input buffer to confirm it is non-zero (first 3 steps, first 3 layers)
|
||||
if step < 3 and li < 3:
|
||||
torch.cuda.synchronize()
|
||||
print(f" Replay L{li}: x_in |X|={graph_decoder.x_in_bufs[li].abs().max().item():.2f}", flush=True)
|
||||
|
||||
# Replay graph A: mHC pre_block + RMSNorm + q_a/q_b/kv projections
|
||||
graph_decoder.graphs_a[li].replay()
|
||||
|
||||
# DEBUG: print max |x| of graph A output buffers x_normed, q_heads, kv_3d (first 3 steps, first 3 layers)
|
||||
if step < 3 and li < 3:
|
||||
torch.cuda.synchronize()
|
||||
print(f" Replay L{li} GraphA: x_normed |X|={graph_decoder.x_normed_bufs[li].abs().max().item():.2f} "
|
||||
f"q_heads |X|={graph_decoder.q_heads_bufs[li].abs().max().item():.2f} "
|
||||
f"kv_3d |X|={graph_decoder.kv_3d_bufs[li].abs().max().item():.2f}", flush=True)
|
||||
|
||||
# ---- Eager attention (NOT captured) ----
|
||||
# Read graph A outputs from pre-allocated buffers
|
||||
x_normed = graph_decoder.x_normed_bufs[li]
|
||||
@@ -1899,11 +1911,21 @@ def main():
|
||||
# Write F_attn to graph B input buffer
|
||||
graph_decoder.F_attn_bufs[li].copy_(F_attn)
|
||||
|
||||
# DEBUG: print max |x| of F_attn just copied into the graph B input buffer (first 3 steps, first 3 layers)
|
||||
if step < 3 and li < 3:
|
||||
torch.cuda.synchronize()
|
||||
print(f" Replay L{li} F_attn |X|={F_attn.abs().max().item():.2f}", flush=True)
|
||||
|
||||
# Replay graph B: mHC post_block + FFN + MoE + SE
|
||||
graph_decoder.graphs_b[li].replay()
|
||||
|
||||
# Read output from graph B
|
||||
X = graph_decoder.x_out_bufs[li]
|
||||
|
||||
# DEBUG: print max |x| of the graph B output buffer x_out (first 3 steps, first 3 layers)
|
||||
if step < 3 and li < 3:
|
||||
torch.cuda.synchronize()
|
||||
print(f" Replay L{li} GraphB: x_out |X|={X.abs().max().item():.2f}", flush=True)
|
||||
|
||||
# Transfer last layer output to cuda:0 for lm_head graph
|
||||
graph_decoder.x_lm_in.copy_(X)
|
||||
|
||||
Reference in New Issue
Block a user