diff --git a/single_shot_inference.py b/single_shot_inference.py index 6f244374..05d05505 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -405,7 +405,9 @@ def forward_layer(X_l, w, li, cfg, rope_cos, rope_sin, attn_out = attn_out.permute(1, 0, 2) # (T, n_h, hd) # -- Inverse RoPE on attention output (paper §2.3.3) -- - attn_out = apply_inverse_rope(attn_out, positions_dev, rope_cos, rope_sin, hd, rd) + # NOTE: disabling for debugging — check if this is causing issues + # attn_out = apply_inverse_rope(attn_out, positions_dev, rope_cos, rope_sin, hd, rd) + attn_out = attn_out # No inverse RoPE for now # -- Output projection: wo_a (grouped BMM) + wo_b (NVFP4) -- # wo_a: grouped linear, (n_h, hd) → (n_groups, o_rank) via BMM