[Feature]: Remove Chunking From FusedMoE (#34086)

Signed-off-by: SouthWest7 <am1ao@qq.com>
Signed-off-by: Southwest <1403572259@qq.com>
Signed-off-by: southwest <am1ao@qq.com>
Signed-off-by: Xinan Miao <1403572259@qq.com>
Co-authored-by: SouthWest7 <am1ao@qq.com>
Author: Xinan Miao
Date: 2026-03-13 02:24:38 +08:00
Committed by GitHub
parent c973ecdead
commit 2cdf92228c
28 changed files with 152 additions and 523 deletions

View File

@@ -68,7 +68,6 @@ class Config:
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
fused_experts_type: mk.FusedMoEExperts
fused_moe_chunk_size: int | None
world_size: int
torch_trace_dir_path: str | None = None
@@ -89,7 +88,6 @@ class Config:
s += f" K={self.K}\n"
s += f" topk={self.topks}\n"
s += f" dtype={self.dtype}\n"
s += f" fused_moe_chunk_size={self.fused_moe_chunk_size}\n"
s += " Quant:\n"
if self.quant_config is not None:
s += f" q_dtype={self.quant_dtype}\n"
@@ -152,11 +150,6 @@ class Config:
vllm_config.parallel_config.all2all_backend = self.all2all_backend()
if self.fused_moe_chunk_size is not None:
env_dict.update(
{"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)}
)
return vllm_config, env_dict
def is_fp8_block_quantized(self):
@@ -189,10 +182,6 @@ class Config:
info = expert_info(self.fused_experts_type)
return info.blocked_quantization_support
def is_fe_supports_chunking(self):
info = expert_info(self.fused_experts_type)
return info.supports_chunking
def supports_expert_map(self):
info = expert_info(self.fused_experts_type)
return info.supports_expert_map
@@ -233,10 +222,6 @@ class Config:
if not self.is_standard_fused_experts():
return False, "Mismatched format."
use_chunking = self.fused_moe_chunk_size is not None
if use_chunking and not self.is_fe_supports_chunking():
return False, "Chunking not supported."
# Check quantization sanity
if (
int(self.is_per_act_token_quant)