[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
Author: bnellnm
Date: 2025-09-17 19:43:31 -04:00
Committed by: GitHub
Commit: 5963b98b46
Parent: e6585ddb45
68 changed files with 2698 additions and 2526 deletions

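For readers skimming the diff below: the refactor named in the title moves construction of FusedMoEQuantConfig out of callers and into the FusedMoEMethodBase subclasses themselves. The following is a minimal sketch of that delegation pattern; the method name get_fused_moe_quant_config and the subclass details are illustrative assumptions, not the exact vLLM API.

# Sketch of the delegation pattern named in the commit title.
# Assumption: get_fused_moe_quant_config and the classes below are illustrative,
# not the exact vLLM interfaces.
from abc import ABC, abstractmethod
from typing import Optional

import torch


class FusedMoEQuantConfig:
    # Stand-in for vllm.model_executor.layers.fused_moe.config.FusedMoEQuantConfig.
    def __init__(self, quant_dtype=None, per_act_token_quant: bool = False,
                 block_shape=None):
        self.quant_dtype = quant_dtype
        self.per_act_token_quant = per_act_token_quant
        self.block_shape = block_shape


class FusedMoEMethodBase(ABC):
    @abstractmethod
    def get_fused_moe_quant_config(self, layer) -> Optional[FusedMoEQuantConfig]:
        # Each quantization method builds its own quant config instead of
        # callers assembling one from layer attributes.
        raise NotImplementedError


class UnquantizedFusedMoEMethod(FusedMoEMethodBase):
    def get_fused_moe_quant_config(self, layer) -> Optional[FusedMoEQuantConfig]:
        return None  # no quantization, no config


class Fp8MoEMethod(FusedMoEMethodBase):
    def get_fused_moe_quant_config(self, layer) -> Optional[FusedMoEQuantConfig]:
        # The subclass, not the MoE layer, decides dtype and quantization shape.
        return FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
                                   per_act_token_quant=False,
                                   block_shape=None)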

@@ -12,7 +12,6 @@ import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.config import VllmConfig, current_platform, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
@@ -22,7 +21,8 @@ from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
                                           run_modular_kernel)
 from .modular_kernel_tools.mk_objects import (
     MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
-    MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, expert_info)
+    MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, TestMoEQuantConfig,
+    expert_info)
 from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo,
                                                    parallel_launch_with_config)
@@ -55,7 +55,7 @@ def rank_worker(
     pgi: ProcessGroupInfo,
     vllm_config: VllmConfig,
     cpu_group,
-    config: Config,
+    base_config: Config,
     weights: WeightTensors,
     verbose: bool,
 ):
@@ -63,42 +63,44 @@ def rank_worker(
     # sanity check
     from vllm import envs
-    if config.fused_moe_chunk_size is not None:
-        assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+    if base_config.fused_moe_chunk_size is not None:
+        assert (
+            base_config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)

     # get weights to this device
     weights.to_current_device()

-    Ms = config.Ms
+    Ms = base_config.Ms
     assert isinstance(Ms, list)
-    TOPKs = config.topks
+    TOPKs = base_config.topks
     assert isinstance(TOPKs, list)

     exceptions = []
     count = 0

     for m, topk in product(Ms, TOPKs):
+        # override m and topk
+        config = copy.deepcopy(base_config)
+        config.Ms = m
+        config.topks = topk
         try:
             print(f"Running[{pgi.rank}]: m={m}, topk={topk} ...")
             count = count + 1

-            # override m and topk
-            cfgx = copy.deepcopy(config)
-            cfgx.Ms = m
-            cfgx.topks = topk

             # inputs for rank
-            rank_tensors = RankTensors.make(cfgx, pgi)
+            rank_tensors = RankTensors.make(config, pgi)

             # modular kernel out
-            mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights,
+            mk_out = run_modular_kernel(pgi, vllm_config, config, weights,
                                         rank_tensors)

             with set_current_vllm_config(vllm_config):
-                ref_out = reference_moe_impl(cfgx, weights, rank_tensors)
+                ref_out = reference_moe_impl(config, weights, rank_tensors)

             if config.quant_dtype == "nvfp4":
-                atol = 1e-1
-                rtol = 1e-1
+                atol = 1e-1 if config.K < 4096 else 2e-1
+                rtol = 1e-1 if config.K < 4096 else 2e-1
             else:
                 atol = 3e-2
                 rtol = 3e-2
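The rank_worker change above is essentially a sweep-pattern cleanup: keep one immutable base_config and deep-copy it once per (m, topk) combination, so per-case overrides never leak across iterations. A standalone sketch of that pattern, using hypothetical names (SweepConfig, per_case_configs) and assuming nothing about vLLM's actual Config class:

import copy
from dataclasses import dataclass, field
from itertools import product


@dataclass
class SweepConfig:
    # Hypothetical stand-in for the test Config object.
    Ms: list = field(default_factory=lambda: [32, 64])
    topks: list = field(default_factory=lambda: [4, 1])


def per_case_configs(base_config: SweepConfig):
    # One fresh copy per (m, topk) pair; base_config itself is never mutated.
    for m, topk in product(base_config.Ms, base_config.topks):
        config = copy.deepcopy(base_config)
        config.Ms = m
        config.topks = topk
        yield config


for cfg in per_case_configs(SweepConfig()):
    print(cfg.Ms, cfg.topks)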
@@ -132,7 +134,7 @@ Ms = [32, 64]
 # hidden sizes, making this too large will cause fp4 tests to fail.
 # Also needs to be a multiple of 1024 for deep_gemm.
 Ks = [2048]
-Ns = [2048]
+Ns = [1024]
 TOPKs = [4, 1]
 Es = [32]
 DTYPEs = [torch.bfloat16]
@@ -167,7 +169,7 @@ def is_nyi_config(config: Config) -> bool:
 @meets_multi_gpu_requirements
 def test_modular_kernel_combinations_multigpu(
         k: int, n: int, e: int, dtype: torch.dtype,
-        quant_config: Optional[FusedMoEQuantConfig],
+        quant_config: Optional[TestMoEQuantConfig],
         combination: tuple[mk.FusedMoEPrepareAndFinalize,
                            mk.FusedMoEPermuteExpertsUnpermute],
         fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig):
@@ -208,7 +210,7 @@ def test_modular_kernel_combinations_multigpu(
 @pytest.mark.parametrize("world_size", [1])
 def test_modular_kernel_combinations_singlegpu(
         k: int, n: int, e: int, dtype: torch.dtype,
-        quant_config: Optional[FusedMoEQuantConfig],
+        quant_config: Optional[TestMoEQuantConfig],
         combination: tuple[mk.FusedMoEPrepareAndFinalize,
                            mk.FusedMoEPermuteExpertsUnpermute],
         fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig):