Add graph-replay debug prints for the first 3 steps and first 3 layers

2026-06-06 07:19:07 +00:00
parent 16b9a4def2
commit dcb2495a5b
1 changed file with 22 additions and 0 deletions
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -1877,9 +1877,21 @@ def main():
                # Copy X into graph A input buffer (copy_ handles cross-GPU transfer)
                graph_decoder.x_in_bufs[li].copy_(X)
                
+                # DEBUG: check input is non-zero (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li}: x_in |X|={graph_decoder.x_in_bufs[li].abs().max().item():.2f}", flush=True)
+                
                # Replay graph A: mHC pre_block + RMSNorm + q_a/q_b/kv projections
                graph_decoder.graphs_a[li].replay()
                
+                # DEBUG: check graph A output (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li} GraphA: x_normed |X|={graph_decoder.x_normed_bufs[li].abs().max().item():.2f} "
+                          f"q_heads |X|={graph_decoder.q_heads_bufs[li].abs().max().item():.2f} "
+                          f"kv_3d |X|={graph_decoder.kv_3d_bufs[li].abs().max().item():.2f}", flush=True)
+                
                # ---- Eager attention (NOT captured) ----
                # Read graph A outputs from pre-allocated buffers
                x_normed = graph_decoder.x_normed_bufs[li]
@@ -1899,11 +1911,21 @@ def main():
                # Write F_attn to graph B input buffer
                graph_decoder.F_attn_bufs[li].copy_(F_attn)
                
+                # DEBUG: check F_attn (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li} F_attn |X|={F_attn.abs().max().item():.2f}", flush=True)
+                
                # Replay graph B: mHC post_block + FFN + MoE + SE
                graph_decoder.graphs_b[li].replay()
                
                # Read output from graph B
                X = graph_decoder.x_out_bufs[li]
+                
+                # DEBUG: check graph B output (first 3 steps, first 3 layers)
+                if step < 3 and li < 3:
+                    torch.cuda.synchronize()
+                    print(f"  Replay L{li} GraphB: x_out |X|={X.abs().max().item():.2f}", flush=True)
            
            # Transfer last layer output to cuda:0 for lm_head graph
            graph_decoder.x_lm_in.copy_(X)