[Kernels] Isolate modular kernel code from FusedMoEMethodBase subclasses. (#27123)

Author: bnellnm
Date: 2025-11-04 08:59:45 -05:00
Committed by: GitHub
Parent: e4ee658672
Commit: 938772af03

16 changed files with 271 additions and 311 deletions
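For orientation, below is a minimal sketch of the shape this refactor gives the quant-method hierarchy. Only the names FusedMoE, FusedMoEMethodBase, FusedMoEModularMethod, and FusedMoEModularKernel come from this commit; the inheritance, constructors, and attributes shown are assumptions for illustration, not the actual vLLM implementation.

# Illustrative sketch only: the real vLLM classes are far more involved,
# and the inheritance and constructors here are assumptions.


class FusedMoEModularKernel:
    """Composable MoE kernel built from prepare/finalize and experts parts."""

    def __init__(self, fused_experts):
        # Assumed attribute: the experts implementation, e.g. DeepGemmExperts.
        self.fused_experts = fused_experts


class FusedMoEMethodBase:
    """Base class for per-quantization fused-MoE method implementations."""


class FusedMoEModularMethod(FusedMoEMethodBase):
    """Marks a quant method whose execution path is a modular kernel.

    With a wrapper like this, callers can detect the modular path with a
    single isinstance() check on the quant method itself, instead of
    inspecting the quant method's fused_experts attribute, which is the
    change the diff below makes.
    """

    def __init__(self, fused_experts: FusedMoEModularKernel):
        self.fused_experts = fused_experts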


@@ -13,7 +13,7 @@ import vllm.envs as envs
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
     TritonOrDeepGemmExperts,
@@ -160,8 +160,8 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
     ):
         return False
-    if not isinstance(module.quant_method.fused_experts, FusedMoEModularKernel):
-        # fused_experts could invoke deep_gemm_moe_fp8
+    if not isinstance(module.quant_method, FusedMoEModularMethod):
+        # modular kernels could invoke deep_gemm_moe_fp8
         return True
     mk: FusedMoEModularKernel = module.quant_method.fused_experts
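
Put together, the post-commit check can be read as the sketch below. Only the two isinstance lines in the middle and the imports are taken from this commit's hunks; the early validation and the final test on mk.fused_experts are a hedged reconstruction, not verbatim code from the commit.

import torch

# These imports mirror the import hunk above.
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts,
)


def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
    # Hedged reconstruction of the surrounding function; the quant-config
    # validation that precedes this in the real file is elided.
    if not isinstance(module, FusedMoE):
        return False

    if not isinstance(module.quant_method, FusedMoEModularMethod):
        # Non-modular quant methods may still dispatch to deep_gemm_moe_fp8,
        # so conservatively report that DeepGEMM may be used.
        return True

    # Modular methods expose their kernel directly, so the experts
    # implementation can be inspected without guessing (assumed attribute
    # layout: the kernel's experts live on mk.fused_experts).
    mk: FusedMoEModularKernel = module.quant_method.fused_experts
    return isinstance(mk.fused_experts, (DeepGemmExperts, TritonOrDeepGemmExperts))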