diff --git a/vllm/kernels/linear/nvfp4/cutedsl.py b/vllm/kernels/linear/nvfp4/cutedsl.py index 367b60a5..55f58b61 100644 --- a/vllm/kernels/linear/nvfp4/cutedsl.py +++ b/vllm/kernels/linear/nvfp4/cutedsl.py @@ -84,7 +84,11 @@ class CuTeDSLNvFp4LinearKernel(NvFp4LinearKernel): if hasattr(layer, 'input_global_scale_inv') and layer.input_global_scale_inv is not None: inv = layer.input_global_scale_inv.data.item() if inv != 0: - activation_global_scale = 1.0 / inv + # input_global_scale_inv = 1.0 / input_global_scale + # input_global_scale = 1.0 / nvfp4_global_scale (the dequant scale) + # So input_global_scale_inv = nvfp4_global_scale = amax / (6.0 * 448.0) + # This is exactly what quantize_activation_nvfp4 expects. + activation_global_scale = inv runner._activation_global_scale = activation_global_scale # Register the runner and store the ID (not the runner itself) diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py index d8dfa93e..a2b07ad1 100644 --- a/vllm/patches/fused_moe/experts/cutedsl_moe.py +++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py @@ -183,20 +183,21 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular): self._runner.set_swiglu_limit(float(swiglu_limit)) # Set initial activation global scales from checkpoint input_scale. - # The CuTeDSL runner uses activation_gs = 1.0 / input_scale from the - # checkpoint as the starting value. The warmup step - # (compute_activation_global_scales) will override this with an - # empirically computed value before the first inference. + # After undoing the inversion from convert_to_nvfp4_moe_kernel_format, + # w13_input_scale_orig = amax / (6.0 * 448.0), which IS the activation + # global scale that quantize_activation_nvfp4 expects. + # The warmup step (compute_activation_global_scales) will override + # this with an empirically computed value before the first inference. if w13_input_scale_orig is not None: - # input_scale = 448.0 / amax → activation_gs = 1.0 / input_scale = amax / 448.0 + # w13_input_scale_orig = amax / (6.0 * 448.0) = activation gs # Mean across experts (they should be similar) mean_l1_gs = float(w13_input_scale_orig.mean().item()) if mean_l1_gs > 0: - self._runner._l1_activation_global_scale = 1.0 / mean_l1_gs + self._runner._l1_activation_global_scale = mean_l1_gs if w2_input_scale_orig is not None: mean_l2_gs = float(w2_input_scale_orig.mean().item()) if mean_l2_gs > 0: - self._runner._l2_activation_global_scale = 1.0 / mean_l2_gs + self._runner._l2_activation_global_scale = mean_l2_gs # Note: activation global scale warmup must be done after # process_weights_after_loading, before the first inference.