diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py index 930b5674..26472412 100644 --- a/dsv4/layers/shared_expert.py +++ b/dsv4/layers/shared_expert.py @@ -316,6 +316,8 @@ class Nvfp4SharedExpert: self._ensure_initialized() l1_out = self._run_l1(hidden_states) + if l1_out.shape[1] < 2 * self.intermediate_size: + print(f" WARNING: l1_out shape {l1_out.shape} < expected (N, {2*self.intermediate_size})", flush=True) gate = l1_out[:, :self.intermediate_size] up = l1_out[:, self.intermediate_size:]