debug: add weight_scale_2 shape/value logging in weight transform
This commit is contained in:
@@ -75,6 +75,23 @@ def transform_nvfp4_weights_for_mega_moe(
|
||||
l1_weight, l1_weight_scale = l1_tuple
|
||||
l2_weight, l2_weight_scale = l2_tuple
|
||||
|
||||
# Debug: verify weight_scale_2 shape and values
|
||||
if l1_weight_scale_2 is not None:
|
||||
l1_gs = l1_weight_scale_2
|
||||
print(f"[weight_transform] L1 weight_scale_2: shape={l1_gs.shape} dtype={l1_gs.dtype} "
|
||||
f"min={l1_gs.min().item():.6f} max={l1_gs.max().item():.6f} "
|
||||
f"numel={l1_gs.numel()}")
|
||||
# Log how many distinct global-scale values the experts carry (a single shared value is expected for ModelOpt NVFP4)
|
||||
if l1_gs.numel() > 1:
|
||||
unique = l1_gs.flatten().unique()
|
||||
print(f"[weight_transform] L1 weight_scale_2 unique values: {len(unique)} "
|
||||
f"samples: {l1_gs.flatten()[:8].tolist()}")
|
||||
if l2_weight_scale_2 is not None:
|
||||
l2_gs = l2_weight_scale_2
|
||||
print(f"[weight_transform] L2 weight_scale_2: shape={l2_gs.shape} dtype={l2_gs.dtype} "
|
||||
f"min={l2_gs.min().item():.6f} max={l2_gs.max().item():.6f} "
|
||||
f"numel={l2_gs.numel()}")
|
||||
|
||||
# Fold global scales into block scales
|
||||
# Both L1 and L2 use per-expert global scales (shape (E,1) or (E,)).
|
||||
# The logical_widths branch was wrong: it treated gs as per-projection
|
||||
|
||||
Reference in New Issue
Block a user