From f05ee6cd69138d5aaba11d1d4da43002bdbd0eb1 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Wed, 3 Jun 2026 13:48:44 +0000
Subject: [PATCH] =?UTF-8?q?Revert=20SE=20BF16=20fallback=20=E2=80=94=20pro?=
 =?UTF-8?q?duced=20garbage=20output?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dequantize_nvfp4 path for the shared expert (SE) made output WORSE
(random Chinese tokens, gibberish) than the NVFP4 GEMM path, which at
least produces 'OK'. The SE NVFP4 GEMM is working; the dequant scale
computation was likely wrong. Keeping the BF16 router gate (which
improved output from a 'response' loop to 'OK').
---
 single_shot_inference.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/single_shot_inference.py b/single_shot_inference.py
index b5e3a9a7..bd23b8cd 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -1359,9 +1359,7 @@ def main():
         se.set_fused_swiglu(True)
         # EAGERLY process shared expert weights
         se._ensure_initialized()
-        # BF16 fallback for shared expert — dequantize NVFP4 weights to BF16
         se._use_runtime_gsa = True
-        se.enable_bf16_fallback()  # sets _fused_swiglu=False, pre-materializes BF16 weights
         se_runners[li] = se
         if (li+1) % 10 == 0: print(f"  Built {li+1}/{n_layers} MoE layers")
         torch.cuda.empty_cache()