From fa2d5708c59d75f6838e7d2fd3f00b3aea470760 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 16 May 2026 19:29:42 +0000 Subject: [PATCH] debug: add L1 GEMM and SiLU output debug prints --- cutedsl/moe_pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cutedsl/moe_pipeline.py b/cutedsl/moe_pipeline.py index 1dd2dd73..e90b36c2 100644 --- a/cutedsl/moe_pipeline.py +++ b/cutedsl/moe_pipeline.py @@ -194,7 +194,8 @@ def run_nvfp4_moe( scale_a=l1_scale_a, scale_b=l1_scale_b, expert_offsets=expert_offsets, global_scale_a=l1_global_scale_a, global_scale_b=l1_global_scale_b, - ) # (num_slots, intermediate) BF16 + ) # (num_slots, 2*intermediate) BF16 + print(f" L1 GEMM output: shape={l1_out.shape}, amax={l1_out.abs().amax().item():.4f}", flush=True) # ════════════════════════════════════════════════════════════════ # SiLU(gate) * up (BF16 — nonlinear requires BF16) @@ -204,6 +205,7 @@ def run_nvfp4_moe( gate = l1_out[:, :intermediate_size] up = l1_out[:, intermediate_size:] activated = torch.nn.functional.silu(gate) * up # (num_slots, intermediate) BF16 + print(f" After SiLU(gate)*up: shape={activated.shape}, amax={activated.abs().amax().item():.4f}", flush=True) # ════════════════════════════════════════════════════════════════ # L2: down projection (NVFP4 × NVFP4 → BF16)