[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (#32414)

This commit is contained in:
Robert Shaw
2026-01-21 08:22:33 -05:00
committed by GitHub
parent e14467be43
commit 42135d6898
82 changed files with 2710 additions and 1563 deletions

View File

@@ -13,6 +13,7 @@ import vllm.envs as envs
from vllm.forward_context import get_forward_context, is_forward_context_available
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
FusedMoEQuantConfig,
)
@@ -22,6 +23,9 @@ from vllm.model_executor.layers.fused_moe.utils import (
count_expert_num_tokens,
disable_inplace,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey,
)
from vllm.utils.math_utils import cdiv
from vllm.v1.worker.ubatching import (
dbo_enabled,
@@ -374,18 +378,51 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
def __init__(
self,
moe_config: FusedMoEConfig,
quant_config: FusedMoEQuantConfig,
max_num_tokens: int | None = None,
num_dispatchers: int | None = None,
):
"""
moe_config: MoE layer configuration.
quant_config: Quantization parameters for this experts instance.
"""
self.quant_config = quant_config
if self.activation_format() == FusedMoEActivationFormat.Standard and (
max_num_tokens is not None or num_dispatchers is not None
):
raise ValueError(
"max_num_tokens and num_dispatchers should only be set for "
"BatchedExperts activation format."
)
elif self.activation_format() == FusedMoEActivationFormat.BatchedExperts and (
max_num_tokens is None or num_dispatchers is None
):
raise ValueError(
"max_num_tokens and num_dispatchers must be set for "
"BatchedExperts activation format."
)
@property
self.moe_config = moe_config
self.quant_config = quant_config
self.max_num_tokens = max_num_tokens
self.num_dispatchers = num_dispatchers
@staticmethod
def expects_unquantized_inputs(
    moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig
) -> bool:
    """
    Whether or not the PrepareFinalize should defer input quantization
    in the prepare step. If True, then the Experts kernel will
    execute the input quantization itself.

    Sample subclasses that override are AITER and FlashInfer CUTLASS.
    """
    # Base implementation: inputs are quantized during prepare, so the
    # experts kernel receives already-quantized activations. Subclasses
    # opt in to deferred (in-kernel) quantization by returning True.
    return False
@staticmethod
@abstractmethod
def activation_formats(
self,
) -> tuple[FusedMoEActivationFormat, FusedMoEActivationFormat]:
def activation_format() -> FusedMoEActivationFormat:
"""
A property which is a tuple of the input and output activation formats
for the 'apply' method.
@@ -435,6 +472,78 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
return E, M, N, K, topk
#
# Various helpers for registering support for various features.
# Used by the oracle to select a particular kernel for a deployment.
#
@staticmethod
def is_supported_config(
    cls: type["FusedMoEPermuteExpertsUnpermute"],
    moe_config: FusedMoEConfig,
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
    activation_format: FusedMoEActivationFormat,
) -> tuple[bool, str | None]:
    """
    Check whether the kernel class `cls` can serve the given deployment.

    Returns (True, None) when every capability probe passes, otherwise
    (False, reason) where reason names the first unsupported feature.
    Used by the kernel-selection oracle to pick an experts implementation.
    """

    def _unsupported(feature: str) -> tuple[bool, str | None]:
        # All rejection messages share the same prefix so the oracle's
        # logs stay uniform.
        return False, f"kernel does not support {feature}"

    # Guard clauses, checked in order; the first failure short-circuits.
    if not cls._supports_current_device():
        return _unsupported("current device")
    if not (moe_config.is_act_and_mul or cls._supports_no_act_and_mul()):
        return _unsupported("no act_and_mul MLP layer")
    if not cls._supports_activation(moe_config.activation):
        return _unsupported(f"{moe_config.activation} activation")
    if not cls._supports_quant_scheme(weight_key, activation_key):
        return _unsupported("quantization scheme")
    if not cls._supports_parallel_config(moe_config.moe_parallel_config):
        return _unsupported("parallel config")
    if cls.activation_format() != activation_format:
        return _unsupported(f"{activation_format.value} activation format")

    return True, None
@staticmethod
@abstractmethod
def _supports_current_device() -> bool:
    """
    Whether the kernel supports the current device type
    (compute capability and current platform).
    """
    raise NotImplementedError
@staticmethod
@abstractmethod
def _supports_no_act_and_mul() -> bool:
    """
    Whether the kernel supports act_and_mul=False, i.e.
    non-gated MoE models like Nemotron-Nano.
    """
    raise NotImplementedError
@staticmethod
@abstractmethod
def _supports_quant_scheme(
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
) -> bool:
    """
    Whether the kernel supports the given weight/activation
    quantization scheme.

    NOTE(review): presumably a key of None means the corresponding
    tensors are unquantized — confirm against callers.
    """
    raise NotImplementedError
@staticmethod
@abstractmethod
def _supports_activation(activation: str) -> bool:
    """
    Whether the kernel supports a particular act function
    (e.g. the value of moe_config.activation).
    """
    raise NotImplementedError
@staticmethod
@abstractmethod
def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
    """
    Whether the kernel supports deployment in expert parallel
    (checked against moe_config.moe_parallel_config by the oracle).
    """
    raise NotImplementedError
#
# Various helpers for accessing quantization parameters from the
# quant_config.
@@ -715,12 +824,12 @@ class FusedMoEModularKernel(torch.nn.Module):
self._post_init_setup()
assert (
prepare_finalize.activation_format == fused_experts.activation_formats[0]
prepare_finalize.activation_format == fused_experts.activation_format()
), (
f"{prepare_finalize.__class__.__name__}."
f"{prepare_finalize.activation_format} == "
f"{fused_experts.__class__.__name__}."
f"{fused_experts.activation_formats[0]}"
f"{fused_experts.activation_format()}"
)
def _post_init_setup(self):