[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 19:43:31 -04:00
parent e6585ddb45
commit 5963b98b46
68 changed files with 2698 additions and 2526 deletions
--- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
+++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
@@ -7,10 +7,12 @@ import itertools
 import pytest
 import torch

+from tests.kernels.moe.utils import fused_moe
 from vllm import _custom_ops as ops
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.config import (
+    fp8_w8a8_moe_quant_config)
 from vllm.platforms import current_platform

 if current_platform.get_device_capability() < (9, 0):
@@ -152,11 +154,12 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
            score,
            topk,
            renormalize=False,
-            use_fp8_w8a8=True,  # using fp8
-            per_channel_quant=True,
-            w1_scale=w1_s,
-            w2_scale=w2_s,
-            block_shape=None,  # Not using block quantization
+            quant_config=fp8_w8a8_moe_quant_config(
+                per_act_token_quant=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=None,  # Not using block quantization
+            ),
        )

    # Check results