[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 19:43:31 -04:00
parent e6585ddb45
commit 5963b98b46
68 changed files with 2698 additions and 2526 deletions
--- a/tests/kernels/moe/test_batched_deepgemm.py
+++ b/tests/kernels/moe/test_batched_deepgemm.py
@@ -6,6 +6,8 @@ import torch

 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts)
+from vllm.model_executor.layers.fused_moe.config import (
+    fp8_w8a8_moe_quant_config)
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
    BatchedPrepareAndFinalize, BatchedTritonExperts)
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
@@ -56,13 +58,18 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
        rank=0,
    )

+    quant_config = fp8_w8a8_moe_quant_config(
+        w1_scale=w1_s,
+        w2_scale=w2_s,
+        per_act_token_quant=False,
+        block_shape=BLOCK_SIZE,
+    )
+
    # triton (reference)
    triton_experts = BatchedTritonExperts(
        max_num_tokens=max_num_tokens,
        num_dispatchers=1,
-        use_fp8_w8a8=True,
-        per_act_token_quant=False,
-        block_shape=BLOCK_SIZE,
+        quant_config=quant_config,
    )
    mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts)

@@ -73,8 +80,6 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=False,
-        w1_scale=w1_s,
-        w2_scale=w2_s,
        global_num_experts=E,
    )

@@ -82,8 +87,7 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
    deepgemm_experts = BatchedDeepGemmExperts(
        max_num_tokens=max_num_tokens,
        num_dispatchers=1,
-        block_shape=BLOCK_SIZE,
-        per_act_token_quant=False,
+        quant_config=quant_config,
    )
    mk_deepgemm = FusedMoEModularKernel(prep_finalize, deepgemm_experts)

@@ -94,8 +98,6 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=False,
-        w1_scale=w1_s,
-        w2_scale=w2_s,
        global_num_experts=E,
    )