diff --git a/single_shot_inference.py b/single_shot_inference.py index 1a61c82e..d7236163 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -702,13 +702,14 @@ def main(): wt, ws, ws2, isc = get_nvfp4_weight(all_w, pfx, proj) if wt is not None and ws is not None: lin = make_nvfp4_linear(in_f, out_f, dev, wt, ws, ws2, isc) - # Don't finalize yet — defer JIT compilation to first forward call - # This avoids allocating GPU workspace for all 61*4=244 projections upfront - # lin.finalize_weights() # called lazily by Nvfp4Linear.forward() plin[proj] = lin if plin: prod_lins[li] = plin - if (li+1) % 10 == 0: print(f" Built Nvfp4Linear {li+1}/{n_layers} layers") + if (li+1) % 10 == 0: + print(f" Built Nvfp4Linear {li+1}/{n_layers} layers") + # Sync to catch errors early + torch.cuda.set_device(li % NUM_GPUS) + torch.cuda.synchronize() # Routers, MoE, shared experts routers, moe_runners, se_runners = {}, {}, {}