From f05ee6cd69138d5aaba11d1d4da43002bdbd0eb1 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Wed, 3 Jun 2026 13:48:44 +0000 Subject: [PATCH] =?UTF-8?q?Revert=20SE=20BF16=20fallback=20=E2=80=94=20pro?= =?UTF-8?q?duced=20garbage=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dequantize_nvfp4 path for shared expert made output WORSE (random Chinese tokens, gibberish) vs NVFP4 GEMM which at least produces 'OK'. The SE NVFP4 GEMM is working; the dequant scale computation was likely wrong. Keeping BF16 router gate (which improved output from 'response' loop to 'OK'). --- single_shot_inference.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/single_shot_inference.py b/single_shot_inference.py index b5e3a9a7..bd23b8cd 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -1359,9 +1359,7 @@ def main(): se.set_fused_swiglu(True) # EAGERLY process shared expert weights se._ensure_initialized() - # BF16 fallback for shared expert — dequantize NVFP4 weights to BF16 se._use_runtime_gsa = True - se.enable_bf16_fallback() # sets _fused_swiglu=False, pre-materializes BF16 weights se_runners[li] = se if (li+1) % 10 == 0: print(f" Built {li+1}/{n_layers} MoE layers") torch.cuda.empty_cache()