diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py index 26472412..be986354 100644 --- a/dsv4/layers/shared_expert.py +++ b/dsv4/layers/shared_expert.py @@ -321,6 +321,10 @@ class Nvfp4SharedExpert: gate = l1_out[:, :self.intermediate_size] up = l1_out[:, self.intermediate_size:] + if torch.isnan(l1_out).any(): + print(f" SE L1 NaN: l1_out nan at {torch.isnan(l1_out).sum().item()} / {l1_out.numel()} positions, shape={l1_out.shape}", flush=True) + if torch.isnan(gate).any() or torch.isnan(up).any(): + print(f" SE gate nan={torch.isnan(gate).any().item()} up nan={torch.isnan(up).any().item()}", flush=True) if self.swiglu_limit is not None: # Match SiluAndMulWithClamp: clamp gate BEFORE silu, clamp up to [-limit, limit] gate = gate.clamp(max=self.swiglu_limit)