diff --git a/single_shot_inference.py b/single_shot_inference.py index a9392184..58ed9fcc 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -294,7 +294,14 @@ def forward_layer(x, w, li, cfg, rope_cos, rope_sin): attn_proj = nvfp4_linear(grouped_flat, ob_w, ob_s, ob_s2) # (1, H) # ---- Residual ---- + # Without mHC, values explode. Add RMSNorm as a fallback. x = x + attn_proj + # Emergency: clip to BF16 range to prevent NaN propagation + x = x.clamp(-65504, 65504) + # Per-layer norm (not in real model — mHC handles this) + x_f = x.float() + rms = x_f.pow(2).mean(-1, keepdim=True).add(1e-6).rsqrt() + x = (x_f * rms).bfloat16() # ---- FFN (shared expert only for baseline) ---- # No separate FFN norm in DSV4 — mHC handles it @@ -320,6 +327,10 @@ def forward_layer(x, w, li, cfg, rope_cos, rope_sin): w[f"{se_pre}.down_proj.weight_scale_2"], ) x = x + ffn_out + x = x.clamp(-65504, 65504) + x_f = x.float() + rms = x_f.pow(2).mean(-1, keepdim=True).add(1e-6).rsqrt() + x = (x_f * rms).bfloat16() # Note: for full model, also need routed experts + scaling else: print(f" L{li}: no shared expert weights, skipping FFN")