single_shot: add sync to catch CUDA errors early

2026-05-31 23:17:46 +00:00
parent 0b35c36d23
commit a66fdf6049
1 changed files with 5 additions and 4 deletions
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -702,13 +702,14 @@ def main():
            wt, ws, ws2, isc = get_nvfp4_weight(all_w, pfx, proj)
            if wt is not None and ws is not None:
                lin = make_nvfp4_linear(in_f, out_f, dev, wt, ws, ws2, isc)
-                # Don't finalize yet — defer JIT compilation to first forward call
-                # This avoids allocating GPU workspace for all 61*4=244 projections upfront
-                # lin.finalize_weights()  # called lazily by Nvfp4Linear.forward()
                plin[proj] = lin
        if plin:
            prod_lins[li] = plin
-        if (li+1) % 10 == 0: print(f"  Built Nvfp4Linear {li+1}/{n_layers} layers")
+        if (li+1) % 10 == 0: 
+            print(f"  Built Nvfp4Linear {li+1}/{n_layers} layers")
+            # Sync to catch errors early (note: only GPU li % NUM_GPUS is synchronized, and it remains the current device)
+            torch.cuda.set_device(li % NUM_GPUS)
+            torch.cuda.synchronize()

    # Routers, MoE, shared experts
    routers, moe_runners, se_runners = {}, {}, {}