single_shot: add sync to catch CUDA errors early
This commit is contained in:
@@ -702,13 +702,14 @@ def main():
|
||||
wt, ws, ws2, isc = get_nvfp4_weight(all_w, pfx, proj)
|
||||
if wt is not None and ws is not None:
|
||||
lin = make_nvfp4_linear(in_f, out_f, dev, wt, ws, ws2, isc)
|
||||
# Don't finalize yet — defer JIT compilation to first forward call
|
||||
# This avoids allocating GPU workspace for all 61*4=244 projections upfront
|
||||
# lin.finalize_weights() # called lazily by Nvfp4Linear.forward()
|
||||
plin[proj] = lin
|
||||
if plin:
|
||||
prod_lins[li] = plin
|
||||
if (li+1) % 10 == 0: print(f" Built Nvfp4Linear {li+1}/{n_layers} layers")
|
||||
if (li+1) % 10 == 0:
|
||||
print(f" Built Nvfp4Linear {li+1}/{n_layers} layers")
|
||||
# Synchronize GPU (li % NUM_GPUS) so pending asynchronous CUDA errors surface here instead of later
|
||||
torch.cuda.set_device(li % NUM_GPUS)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Routers, MoE, shared experts
|
||||
routers, moe_runners, se_runners = {}, {}, {}
|
||||
|
||||
Reference in New Issue
Block a user