single_shot: add sync to catch CUDA errors early

This commit is contained in:
2026-05-31 23:17:46 +00:00
parent 0b35c36d23
commit a66fdf6049

View File

@@ -702,13 +702,14 @@ def main():
wt, ws, ws2, isc = get_nvfp4_weight(all_w, pfx, proj)
if wt is not None and ws is not None:
lin = make_nvfp4_linear(in_f, out_f, dev, wt, ws, ws2, isc)
# Don't finalize yet — defer JIT compilation to first forward call
# This avoids allocating GPU workspace for all 61*4=244 projections upfront
# lin.finalize_weights() # called lazily by Nvfp4Linear.forward()
plin[proj] = lin
if plin:
prod_lins[li] = plin
if (li+1) % 10 == 0: print(f" Built Nvfp4Linear {li+1}/{n_layers} layers")
if (li+1) % 10 == 0:
print(f" Built Nvfp4Linear {li+1}/{n_layers} layers")
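# Sync to catch asynchronous CUDA errors early (every 10 layers).
# NOTE(review): synchronize() without an argument waits only on the current
# device, i.e. GPU li % NUM_GPUS selected just below, and set_device leaves
# that GPU as the current device afterwards — confirm both are intended.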
torch.cuda.set_device(li % NUM_GPUS)
torch.cuda.synchronize()
# Routers, MoE, shared experts
routers, moe_runners, se_runners = {}, {}, {}