Fix activation global scale: don't double-invert input_global_scale_inv
The activation global scale is gs = amax / (6.0 * 448.0). Both the linear kernel and the MoE kernel were computing 1.0 / x on a value x that was already the correct gs, which inverted it a second time and produced garbage quantization. Linear kernel: input_global_scale_inv IS the gs, so use it directly. MoE kernel: w13_input_scale_orig (after undoing the inversion applied by convert_to_nvfp4_moe_kernel_format) IS the gs, so use it directly.
This commit is contained in:
@@ -84,7 +84,11 @@ class CuTeDSLNvFp4LinearKernel(NvFp4LinearKernel):
|
|||||||
if hasattr(layer, 'input_global_scale_inv') and layer.input_global_scale_inv is not None:
|
if hasattr(layer, 'input_global_scale_inv') and layer.input_global_scale_inv is not None:
|
||||||
inv = layer.input_global_scale_inv.data.item()
|
inv = layer.input_global_scale_inv.data.item()
|
||||||
if inv != 0:
|
if inv != 0:
|
||||||
activation_global_scale = 1.0 / inv
|
# input_global_scale_inv = 1.0 / input_global_scale
|
||||||
|
# input_global_scale = 1.0 / nvfp4_global_scale (the dequant scale)
|
||||||
|
# So input_global_scale_inv = nvfp4_global_scale = amax / (6.0 * 448.0)
|
||||||
|
# This is exactly what quantize_activation_nvfp4 expects.
|
||||||
|
activation_global_scale = inv
|
||||||
runner._activation_global_scale = activation_global_scale
|
runner._activation_global_scale = activation_global_scale
|
||||||
|
|
||||||
# Register the runner and store the ID (not the runner itself)
|
# Register the runner and store the ID (not the runner itself)
|
||||||
|
|||||||
@@ -183,20 +183,21 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular):
|
|||||||
self._runner.set_swiglu_limit(float(swiglu_limit))
|
self._runner.set_swiglu_limit(float(swiglu_limit))
|
||||||
|
|
||||||
# Set initial activation global scales from checkpoint input_scale.
|
# Set initial activation global scales from checkpoint input_scale.
|
||||||
# The CuTeDSL runner uses activation_gs = 1.0 / input_scale from the
|
# After undoing the inversion from convert_to_nvfp4_moe_kernel_format,
|
||||||
# checkpoint as the starting value. The warmup step
|
# w13_input_scale_orig = amax / (6.0 * 448.0), which IS the activation
|
||||||
# (compute_activation_global_scales) will override this with an
|
# global scale that quantize_activation_nvfp4 expects.
|
||||||
# empirically computed value before the first inference.
|
# The warmup step (compute_activation_global_scales) will override
|
||||||
|
# this with an empirically computed value before the first inference.
|
||||||
if w13_input_scale_orig is not None:
|
if w13_input_scale_orig is not None:
|
||||||
# input_scale = 448.0 / amax → activation_gs = 1.0 / input_scale = amax / 448.0
|
# w13_input_scale_orig = amax / (6.0 * 448.0) = activation gs
|
||||||
# Mean across experts (they should be similar)
|
# Mean across experts (they should be similar)
|
||||||
mean_l1_gs = float(w13_input_scale_orig.mean().item())
|
mean_l1_gs = float(w13_input_scale_orig.mean().item())
|
||||||
if mean_l1_gs > 0:
|
if mean_l1_gs > 0:
|
||||||
self._runner._l1_activation_global_scale = 1.0 / mean_l1_gs
|
self._runner._l1_activation_global_scale = mean_l1_gs
|
||||||
if w2_input_scale_orig is not None:
|
if w2_input_scale_orig is not None:
|
||||||
mean_l2_gs = float(w2_input_scale_orig.mean().item())
|
mean_l2_gs = float(w2_input_scale_orig.mean().item())
|
||||||
if mean_l2_gs > 0:
|
if mean_l2_gs > 0:
|
||||||
self._runner._l2_activation_global_scale = 1.0 / mean_l2_gs
|
self._runner._l2_activation_global_scale = mean_l2_gs
|
||||||
|
|
||||||
# Note: activation global scale warmup must be done after
|
# Note: activation global scale warmup must be done after
|
||||||
# process_weights_after_loading, before the first inference.
|
# process_weights_after_loading, before the first inference.
|
||||||
|
|||||||
Reference in New Issue
Block a user