[Feature]: Remove Chunking From FusedMoE (#34086)

Signed-off-by: SouthWest7 <am1ao@qq.com>
Signed-off-by: Southwest <1403572259@qq.com>
Signed-off-by: southwest <am1ao@qq.com>
Signed-off-by: Xinan Miao <1403572259@qq.com>
Co-authored-by: SouthWest7 <am1ao@qq.com>
This commit is contained in:
Xinan Miao
2026-03-13 02:24:38 +08:00
committed by GitHub
parent c973ecdead
commit 2cdf92228c
28 changed files with 152 additions and 523 deletions

View File

@@ -84,12 +84,6 @@ def rank_worker(
set_random_seed(pgi.rank)
# sanity check
-from vllm import envs
-if base_config.fused_moe_chunk_size is not None:
-    assert base_config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE
# get weights to this device
weights.to_current_device()
@@ -162,7 +156,6 @@ Ns = [1024]
TOPKs = [4, 1]
Es = [32]
DTYPEs = [torch.bfloat16]
-FUSED_MOE_CHUNK_SIZES = [None, 16]
def is_nyi_config(config: Config) -> bool:
@@ -185,14 +178,13 @@ def generate_valid_test_cases(
cases = []
total = 0
-for k, n, e, dtype, quant_config, combination, chunk_size in product(
+for k, n, e, dtype, quant_config, combination in product(
Ks,
Ns,
Es,
DTYPEs,
MK_QUANT_CONFIGS,
product(prepare_finalize_types, MK_FUSED_EXPERT_TYPES),
-FUSED_MOE_CHUNK_SIZES,
):
total = total + 1
@@ -206,7 +198,6 @@ def generate_valid_test_cases(
quant_config=quant_config,
prepare_finalize_type=combination[0],
fused_experts_type=combination[1],
-fused_moe_chunk_size=chunk_size,
world_size=world_size,
)
@@ -234,7 +225,6 @@ def generate_valid_test_cases(
quant_config,
combination[0],
combination[1],
-chunk_size,
world_size,
)
)
@@ -245,7 +235,7 @@ def generate_valid_test_cases(
@pytest.mark.parametrize(
-"k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,chunk_size,world_size",
+"k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,world_size",
generate_valid_test_cases(
world_size=2, prepare_finalize_types=MK_MULTI_GPU_PREPARE_FINALIZE_TYPES
),
@@ -259,7 +249,6 @@ def test_modular_kernel_combinations_multigpu(
quant_config: TestMoEQuantConfig | None,
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
fused_experts_type: mk.FusedMoEExperts,
-chunk_size: int | None,
world_size: int,
pytestconfig,
):
@@ -280,7 +269,6 @@ def test_modular_kernel_combinations_multigpu(
quant_config=quant_config,
prepare_finalize_type=prepare_finalize_type,
fused_experts_type=fused_experts_type,
-fused_moe_chunk_size=chunk_size,
world_size=world_size,
)
verbosity = pytestconfig.getoption("verbose")
@@ -288,7 +276,7 @@ def test_modular_kernel_combinations_multigpu(
@pytest.mark.parametrize(
-"k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,chunk_size,world_size",
+"k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,world_size",
generate_valid_test_cases(
world_size=1, prepare_finalize_types=MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES
),
@@ -301,7 +289,6 @@ def test_modular_kernel_combinations_singlegpu(
quant_config: TestMoEQuantConfig | None,
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
fused_experts_type: mk.FusedMoEExperts,
-chunk_size: int | None,
world_size: int,
pytestconfig,
workspace_init,
@@ -318,7 +305,6 @@ def test_modular_kernel_combinations_singlegpu(
quant_config=quant_config,
prepare_finalize_type=prepare_finalize_type,
fused_experts_type=fused_experts_type,
-fused_moe_chunk_size=chunk_size,
world_size=world_size,
)