[Feature]: Remove Chunking From FusedMoE (#34086)

Signed-off-by: SouthWest7 <am1ao@qq.com>
Signed-off-by: Southwest <1403572259@qq.com>
Signed-off-by: southwest <am1ao@qq.com>
Signed-off-by: Xinan Miao <1403572259@qq.com>
Co-authored-by: SouthWest7 <am1ao@qq.com>
2026-03-13 02:24:38 +08:00
parent c973ecdead
commit 2cdf92228c
28 changed files with 152 additions and 523 deletions
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -68,7 +68,6 @@ class Config:
    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
    fused_experts_type: mk.FusedMoEExperts

-    fused_moe_chunk_size: int | None
    world_size: int

    torch_trace_dir_path: str | None = None
@@ -89,7 +88,6 @@ class Config:
        s += f" K={self.K}\n"
        s += f" topk={self.topks}\n"
        s += f" dtype={self.dtype}\n"
-        s += f" fused_moe_chunk_size={self.fused_moe_chunk_size}\n"
        s += " Quant:\n"
        if self.quant_config is not None:
            s += f"     q_dtype={self.quant_dtype}\n"
@@ -152,11 +150,6 @@ class Config:

        vllm_config.parallel_config.all2all_backend = self.all2all_backend()

-        if self.fused_moe_chunk_size is not None:
-            env_dict.update(
-                {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)}
-            )
-
        return vllm_config, env_dict

    def is_fp8_block_quantized(self):
@@ -189,10 +182,6 @@ class Config:
        info = expert_info(self.fused_experts_type)
        return info.blocked_quantization_support

-    def is_fe_supports_chunking(self):
-        info = expert_info(self.fused_experts_type)
-        return info.supports_chunking
-
    def supports_expert_map(self):
        info = expert_info(self.fused_experts_type)
        return info.supports_expert_map
@@ -233,10 +222,6 @@ class Config:
            if not self.is_standard_fused_experts():
                return False, "Mismatched format."

-        use_chunking = self.fused_moe_chunk_size is not None
-        if use_chunking and not self.is_fe_supports_chunking():
-            return False, "Chunking not supported."
-
        # Check quantization sanity
        if (
            int(self.is_per_act_token_quant)