From cf796e37cf9ea55fe6c6cda828614a3ca9f9eebd Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 14 May 2026 14:19:35 +0000 Subject: [PATCH] debug: add weight_scale_2 shape/value logging in weight transform --- src/nvfp4_megamoe_kernel/weight_transform.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/nvfp4_megamoe_kernel/weight_transform.py b/src/nvfp4_megamoe_kernel/weight_transform.py index 6d4e74ad..2c7ed6c2 100644 --- a/src/nvfp4_megamoe_kernel/weight_transform.py +++ b/src/nvfp4_megamoe_kernel/weight_transform.py @@ -75,6 +75,23 @@ def transform_nvfp4_weights_for_mega_moe( l1_weight, l1_weight_scale = l1_tuple l2_weight, l2_weight_scale = l2_tuple + # Debug: verify weight_scale_2 shape and values + if l1_weight_scale_2 is not None: + l1_gs = l1_weight_scale_2 + print(f"[weight_transform] L1 weight_scale_2: shape={l1_gs.shape} dtype={l1_gs.dtype} " + f"min={l1_gs.min().item():.6f} max={l1_gs.max().item():.6f} " + f"numel={l1_gs.numel()}") + # Check if all experts have the same global scale (expected for ModelOpt NVFP4) + if l1_gs.numel() > 1: + unique = l1_gs.flatten().unique() + print(f"[weight_transform] L1 weight_scale_2 unique values: {len(unique)} " + f"samples: {l1_gs.flatten()[:8].tolist()}") + if l2_weight_scale_2 is not None: + l2_gs = l2_weight_scale_2 + print(f"[weight_transform] L2 weight_scale_2: shape={l2_gs.shape} dtype={l2_gs.dtype} " + f"min={l2_gs.min().item():.6f} max={l2_gs.max().item():.6f} " + f"numel={l2_gs.numel()}") + # Fold global scales into block scales # Both L1 and L2 use per-expert global scales (shape (E,1) or (E,)). # The logical_widths branch was wrong: it treated gs as per-projection