From aafe2eee12aaaf1a54fc3fe08aa062eb74fbd951 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 31 May 2026 04:16:13 +0000 Subject: [PATCH] CRITICAL FIX: FP4 LUT entries were up to 4x too large! E2M1 magnitudes are [0, 0.5, 1, 1.5, 2, 3, 4, 6] NOT [0, 2, 3, 4, 6, 8, 12, 24]. The old LUT entries were between roughly 2.7x and 4x the correct values (the error was not a uniform scale factor, so it could not be undone by a single global rescale), causing every nonzero NVFP4 dequantized weight to be correspondingly too large. This compounded across 61 layers, causing the residual stream to explode and producing gibberish output. This is the root cause of the residual growth and incoherent generation. --- single_shot_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/single_shot_inference.py b/single_shot_inference.py index 8ca6c526..5d61e74c 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -53,7 +53,7 @@ NUM_GPUS = 8 # NVFP4 dequantization — matches checkpoint format exactly # ===================================================================== -FP4_LUT = torch.tensor([0., 2., 3., 4., 6., 8., 12., 24.]) +FP4_LUT = torch.tensor([0., 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]) # E2M1 magnitudes def dequant_nvfp4_weight(weight, weight_scale, weight_scale_2): """Dequantize NVFP4 weight to BF16.