diff --git a/single_shot_inference.py b/single_shot_inference.py index 54734263..2cc733e8 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -1877,9 +1877,21 @@ def main(): # Copy X into graph A input buffer (copy_ handles cross-GPU transfer) graph_decoder.x_in_bufs[li].copy_(X) + # DEBUG: check input is non-zero (first 3 steps, first 3 layers) + if step < 3 and li < 3: + torch.cuda.synchronize() + print(f" Replay L{li}: x_in |X|={graph_decoder.x_in_bufs[li].abs().max().item():.2f}", flush=True) + # Replay graph A: mHC pre_block + RMSNorm + q_a/q_b/kv projections graph_decoder.graphs_a[li].replay() + # DEBUG: check graph A output (first 3 steps, first 3 layers) + if step < 3 and li < 3: + torch.cuda.synchronize() + print(f" Replay L{li} GraphA: x_normed |X|={graph_decoder.x_normed_bufs[li].abs().max().item():.2f} " + f"q_heads |X|={graph_decoder.q_heads_bufs[li].abs().max().item():.2f} " + f"kv_3d |X|={graph_decoder.kv_3d_bufs[li].abs().max().item():.2f}", flush=True) + # ---- Eager attention (NOT captured) ---- # Read graph A outputs from pre-allocated buffers x_normed = graph_decoder.x_normed_bufs[li] @@ -1899,11 +1911,21 @@ def main(): # Write F_attn to graph B input buffer graph_decoder.F_attn_bufs[li].copy_(F_attn) + # DEBUG: check F_attn (first 3 steps, first 3 layers) + if step < 3 and li < 3: + torch.cuda.synchronize() + print(f" Replay L{li} F_attn |X|={F_attn.abs().max().item():.2f}", flush=True) + # Replay graph B: mHC post_block + FFN + MoE + SE graph_decoder.graphs_b[li].replay() # Read output from graph B X = graph_decoder.x_out_bufs[li] + + # DEBUG: check graph B output (first 3 steps, first 3 layers) + if step < 3 and li < 3: + torch.cuda.synchronize() + print(f" Replay L{li} GraphB: x_out |X|={X.abs().max().item():.2f}", flush=True) # Transfer last layer output to cuda:0 for lm_head graph graph_decoder.x_lm_in.copy_(X)