diff --git a/single_shot_inference.py b/single_shot_inference.py
index 54734263..2cc733e8 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -1877,9 +1877,21 @@ def main():
                 # Copy X into graph A input buffer (copy_ handles cross-GPU transfer)
                 graph_decoder.x_in_bufs[li].copy_(X)
                 
+                # DEBUG: print max-abs of the graph A input buffer to eyeball that it is non-zero (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li}: x_in |X|={graph_decoder.x_in_bufs[li].abs().max().item():.2f}", flush=True)
+                
                 # Replay graph A: mHC pre_block + RMSNorm + q_a/q_b/kv projections
                 graph_decoder.graphs_a[li].replay()
                 
+                # DEBUG: print max-abs of graph A output buffers x_normed / q_heads / kv_3d (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li} GraphA: x_normed |X|={graph_decoder.x_normed_bufs[li].abs().max().item():.2f} "
+                          f"q_heads |X|={graph_decoder.q_heads_bufs[li].abs().max().item():.2f} "
+                          f"kv_3d |X|={graph_decoder.kv_3d_bufs[li].abs().max().item():.2f}", flush=True)
+                
                 # ---- Eager attention (NOT captured) ----
                 # Read graph A outputs from pre-allocated buffers
                 x_normed = graph_decoder.x_normed_bufs[li]
@@ -1899,11 +1911,21 @@ def main():
                 # Write F_attn to graph B input buffer
                 graph_decoder.F_attn_bufs[li].copy_(F_attn)
                 
+                # DEBUG: check F_attn (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li} F_attn |X|={F_attn.abs().max().item():.2f}", flush=True)
+                
                 # Replay graph B: mHC post_block + FFN + MoE + SE
                 graph_decoder.graphs_b[li].replay()
                 
                 # Read output from graph B
                 X = graph_decoder.x_out_bufs[li]
+                
+                # DEBUG: print max-abs of X as read from graph B's x_out buffer (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li} GraphB: x_out |X|={X.abs().max().item():.2f}", flush=True)
             
             # Transfer last layer output to cuda:0 for lm_head graph
             graph_decoder.x_lm_in.copy_(X)