[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 19:43:31 -04:00
parent e6585ddb45
commit 5963b98b46
68 changed files with 2698 additions and 2526 deletions
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -12,6 +12,8 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                  FusedMoEMethodBase)
 from vllm.model_executor.layers.fused_moe import modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig, mxfp4_w4a4_moe_quant_config)
 from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts
 from vllm.model_executor.layers.linear import (LinearBase,
                                               UnquantizedLinearMethod)
@@ -629,10 +631,29 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):

        return tile_tokens_dim

+    def get_fused_moe_quant_config(
+            self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
+
+        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
+            return None
+
+        if self.mxfp4_backend == Mxfp4Backend.TRITON:
+            w1_scale = layer.w13_precision_config
+            w2_scale = layer.w2_precision_config
+        else:
+            w1_scale = layer.w13_weight_scale
+            w2_scale = layer.w2_weight_scale
+
+        return mxfp4_w4a4_moe_quant_config(
+            w1_bias=layer.w13_bias,
+            w2_bias=layer.w2_bias,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+        )
+
    def select_gemm_impl(
        self,
        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
-        moe: FusedMoEConfig,
        layer: torch.nn.Module,
    ) -> mk.FusedMoEPermuteExpertsUnpermute:
        if (prepare_finalize.activation_format ==
@@ -647,11 +668,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                    "gemm1_alpha": layer.gemm1_alpha,
                    "gemm1_beta": layer.gemm1_beta,
                    "gemm1_clamp_limit": layer.gemm1_clamp_limit,
-                    "w13_bias": layer.w13_bias,
-                    "w2_bias": layer.w2_bias,
+                    # TODO(bnell): part of quant_config
                    "max_capture_size": self.max_capture_size,
                }
-                return TrtLlmGenExperts(moe, **kwargs)
+                assert self.moe_quant_config is not None
+                return TrtLlmGenExperts(self.moe, self.moe_quant_config,
+                                        **kwargs)
            else:
                # Use matmul_ogs from triton_kernels here!
                raise NotImplementedError(
@@ -710,8 +732,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
            activation=activation,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
-            w1_scale=layer.w13_weight_scale,
-            w2_scale=layer.w2_weight_scale,
            apply_router_weight_on_input=apply_router_weight_on_input,
        )

@@ -941,10 +961,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                renormalize=renormalize,
                global_num_experts=global_num_experts,
                expert_map=expert_map,
-                w1_bias=layer.w13_bias,
-                w2_bias=layer.w2_bias,
-                w1_precision=self.w13_precision_config,
-                w2_precision=self.w2_precision_config,
+                quant_config=self.moe_quant_config,
                apply_router_weight_on_input=apply_router_weight_on_input,
            )
        else: