From a66fdf6049009a8541b3e7567a4dbac8de7a607b Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 31 May 2026 23:17:46 +0000 Subject: [PATCH] single_shot: add sync to catch CUDA errors early --- single_shot_inference.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/single_shot_inference.py b/single_shot_inference.py index 1a61c82e..d7236163 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -702,13 +702,14 @@ def main(): wt, ws, ws2, isc = get_nvfp4_weight(all_w, pfx, proj) if wt is not None and ws is not None: lin = make_nvfp4_linear(in_f, out_f, dev, wt, ws, ws2, isc) - # Don't finalize yet — defer JIT compilation to first forward call - # This avoids allocating GPU workspace for all 61*4=244 projections upfront - # lin.finalize_weights() # called lazily by Nvfp4Linear.forward() plin[proj] = lin if plin: prod_lins[li] = plin - if (li+1) % 10 == 0: print(f" Built Nvfp4Linear {li+1}/{n_layers} layers") + if (li+1) % 10 == 0: + print(f" Built Nvfp4Linear {li+1}/{n_layers} layers") + # Sync to catch errors early + torch.cuda.set_device(li % NUM_GPUS) + torch.cuda.synchronize() # Routers, MoE, shared experts routers, moe_runners, se_runners = {}, {}, {}