diag: NaN detection in shared expert gate/up split
This commit is contained in:
@@ -321,6 +321,10 @@ class Nvfp4SharedExpert:
|
||||
|
||||
gate = l1_out[:, :self.intermediate_size]
|
||||
up = l1_out[:, self.intermediate_size:]
|
||||
if torch.isnan(l1_out).any():
|
||||
print(f" SE L1 NaN: l1_out nan at {torch.isnan(l1_out).sum().item()} / {l1_out.numel()} positions, shape={l1_out.shape}", flush=True)
|
||||
if torch.isnan(gate).any() or torch.isnan(up).any():
|
||||
print(f" SE gate nan={torch.isnan(gate).any().item()} up nan={torch.isnan(up).any().item()}", flush=True)
|
||||
if self.swiglu_limit is not None:
|
||||
# Match SiluAndMulWithClamp: clamp gate from above (max=limit) BEFORE silu; clamp up to [-limit, limit]
|
||||
gate = gate.clamp(max=self.swiglu_limit)
|
||||
|
||||
Reference in New Issue
Block a user