From 48d93a6d2e81b58b007cb20694ff7c2670d4db8d Mon Sep 17 00:00:00 2001 From: biondizzle Date: Mon, 1 Jun 2026 02:41:12 +0000 Subject: [PATCH] diag: MoE input/output diagnostics for first 3 layers --- single_shot_inference.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/single_shot_inference.py b/single_shot_inference.py index 1f47d0c1..9c011392 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -414,8 +414,14 @@ def moe_forward(x, li, moe_runner, se_runner, router, token_id): torch.cuda.synchronize(x.device) if topk_ids.max().item() >= 384 or topk_ids.min().item() < 0: print(f" L{li} BAD topk_ids: min={topk_ids.min().item()} max={topk_ids.max().item()}", flush=True) + if li < 3: + print(f" L{li} MoE input: |x|={x.abs().max().item():.4f} has_nan={torch.isnan(x).any().item()}", flush=True) routed_out = moe_runner.run(x, topk_w, topk_ids) + if li < 3: + print(f" L{li} MoE routed: |out|={routed_out.abs().max().item():.4f} has_nan={torch.isnan(routed_out).any().item()}", flush=True) shared_out = se_runner.run(x) + if li < 3: + print(f" L{li} MoE shared: |out|={shared_out.abs().max().item():.4f} has_nan={torch.isnan(shared_out).any().item()}", flush=True) return routed_out + shared_out # =====================================================================