diff --git a/single_shot_inference.py b/single_shot_inference.py index b5e3a9a7..bd23b8cd 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -1359,9 +1359,7 @@ def main(): se.set_fused_swiglu(True) # EAGERLY process shared expert weights se._ensure_initialized() - # BF16 fallback for shared expert — dequantize NVFP4 weights to BF16 se._use_runtime_gsa = True - se.enable_bf16_fallback() # sets _fused_swiglu=False, pre-materializes BF16 weights se_runners[li] = se if (li+1) % 10 == 0: print(f" Built {li+1}/{n_layers} MoE layers") torch.cuda.empty_cache()