diff --git a/single_shot_inference.py b/single_shot_inference.py
index 1a61c82e..d7236163 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -702,13 +702,14 @@ def main():
             wt, ws, ws2, isc = get_nvfp4_weight(all_w, pfx, proj)
             if wt is not None and ws is not None:
                 lin = make_nvfp4_linear(in_f, out_f, dev, wt, ws, ws2, isc)
-                # Don't finalize yet — defer JIT compilation to first forward call
-                # This avoids allocating GPU workspace for all 61*4=244 projections upfront
-                # lin.finalize_weights()  # called lazily by Nvfp4Linear.forward()
                 plin[proj] = lin
         if plin:
             prod_lins[li] = plin
-        if (li+1) % 10 == 0: print(f"  Built Nvfp4Linear {li+1}/{n_layers} layers")
+        if (li+1) % 10 == 0: 
+            print(f"  Built Nvfp4Linear {li+1}/{n_layers} layers")
+            # Synchronize GPU (li % NUM_GPUS) so asynchronous CUDA errors surface here, not later
+            torch.cuda.set_device(li % NUM_GPUS)
+            torch.cuda.synchronize()
 
     # Routers, MoE, shared experts
     routers, moe_runners, se_runners = {}, {}, {}