[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 19:43:31 -04:00
parent e6585ddb45
commit 5963b98b46
68 changed files with 2698 additions and 2526 deletions
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+    fp8_w8a8_moe_quant_config,
+    nvfp4_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
@@ -140,6 +144,12 @@ def bench_run(
        a_fp8_scale: torch.Tensor,
        num_repeats: int,
    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_fp8_scale,
+        )
+
        for _ in range(num_repeats):
            fused_experts(
                a,
@@ -147,10 +157,7 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
            )

    def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def bench_run(
        device: torch.device,
        num_repeats: int,
    ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
        for _ in range(num_repeats):
            with nvtx.annotate("cutlass_moe_fp4", color="green"):
                cutlass_moe_fp4(
                    a=a,
-                    a1_gscale=a1_gs,
-                    a2_gscale=a2_gs,
                    w1_fp4=w1_fp4,
-                    w1_blockscale=w1_blockscale,
-                    w1_alphas=w1_gs,
                    w2_fp4=w2_fp4,
-                    w2_blockscale=w2_blockscale,
-                    w2_alphas=w2_gs,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    m=m,
                    n=n,
                    k=k,
                    e=num_experts,
-                    device=device,
+                    quant_config=quant_config,
                )

    def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def bench_run(
        e: int,
        device: torch.device,
    ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
+
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return cutlass_moe_fp4(
                a=a,
-                a1_gscale=a1_gs,
                w1_fp4=w1_fp4,
-                w1_blockscale=w1_blockscale,
-                w1_alphas=w1_alphas,
-                a2_gscale=a2_gs,
                w2_fp4=w2_fp4,
-                w2_blockscale=w2_blockscale,
-                w2_alphas=w2_alphas,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                m=m,
                n=n,
                k=k,
                e=num_experts,
-                device=device,
+                quant_config=quant_config,
            )

    def run_triton_from_graph(
@@ -246,16 +258,18 @@ def bench_run(
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
+            quant_config = fp8_w8a8_moe_quant_config(
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_fp8_scale,
+            )
            return fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
            )

    def replay_graph(graph, num_repeats):
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
@@ -96,6 +97,11 @@ def bench_run(
        a_scale: torch.Tensor,
        num_repeats: int,
    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
        for _ in range(num_repeats):
            fused_experts(
                a,
@@ -103,10 +109,7 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
            )

    def run_cutlass_moe(
@@ -125,6 +128,12 @@ def bench_run(
        per_act_token: bool,
        num_repeats: int,
    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
        for _ in range(num_repeats):
            cutlass_moe_fp8(
                a,
@@ -132,14 +141,11 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
-                w1_scale,
-                w2_scale,
                ab_strides1,
                ab_strides2,
                c_strides1,
                c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
            )

    def run_cutlass_from_graph(
@@ -156,6 +162,12 @@ def bench_run(
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
@@ -165,14 +177,11 @@ def bench_run(
                w2_q,
                topk_weights,
                topk_ids,
-                w1_scale,
-                w2_scale,
                ab_strides1,
                ab_strides2,
                c_strides1,
                c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
            )

    def run_triton_from_graph(
@@ -185,6 +194,11 @@ def bench_run(
        w2_scale: torch.Tensor,
        a_scale: torch.Tensor,
    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
@@ -194,10 +208,7 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
            )

    def replay_graph(graph, num_repeats):
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -14,6 +14,10 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm

+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    _get_config_dtype_str,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
@@ -134,43 +138,36 @@ def benchmark_config(
    def run():
        from vllm.model_executor.layers.fused_moe import override_config

+        if use_fp8_w8a8:
+            quant_dtype = torch.float8_e4m3fn
+        elif use_int8_w8a16:
+            quant_dtype = torch.int8
+        else:
+            quant_dtype = None
+
+        quant_config = FusedMoEQuantConfig.make(
+            quant_dtype=quant_dtype,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_quant_shape,
+        )
+
        with override_config(config):
-            if use_deep_gemm:
-                topk_weights, topk_ids, token_expert_indices = fused_topk(
-                    x, input_gating, topk, False
-                )
-                return fused_experts(
-                    x,
-                    w1,
-                    w2,
-                    topk_weights,
-                    topk_ids,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                )
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
+                x, input_gating, topk, renormalize=not use_deep_gemm
+            )
+            return fused_experts(
+                x,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                inplace=True,
+                quant_config=quant_config,
+                allow_deep_gemm=use_deep_gemm,
+            )

    # JIT compilation & warmup
    run()
@@ -414,7 +411,7 @@ class BenchmarkWorker:
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
-        dtype_str = get_config_dtype_str(
+        dtype_str = _get_config_dtype_str(
            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
        )
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -547,7 +544,7 @@ def save_configs(
    block_quant_shape: list[int],
    save_dir: str,
 ) -> None:
-    dtype_str = get_config_dtype_str(
+    dtype_str = _get_config_dtype_str(
        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
    )