[AMD][ROCm] MoRI EP: a high-performance all2all backend (#28664)
Signed-off-by: Alex Sun <alex.s@amd.com>
@@ -141,7 +141,7 @@ def make_config(args: argparse.Namespace) -> Config:
     quant_config = None
     if args.quant_dtype is not None:
-        quant_config = FusedMoEQuantConfig(
+        quant_config = FusedMoEQuantConfig.make(
             quant_dtype=args.quant_dtype,
             per_act_token_quant=args.per_token_quantized_activations,
             per_out_ch_quant=args.per_channel_quantized_weights,
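Note on the hunk above: the only change is swapping the direct FusedMoEQuantConfig constructor for the FusedMoEQuantConfig.make(...) factory; the keyword arguments are unchanged. A minimal self-contained sketch of the updated call site (build_quant_config is an illustrative wrapper, not code from this commit; args is assumed to be the argparse namespace that make_config receives):

import argparse

from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig


def build_quant_config(args: argparse.Namespace) -> FusedMoEQuantConfig | None:
    # Mirrors the hunk: no quant config unless a quant dtype was requested.
    if args.quant_dtype is None:
        return None
    # Factory-style construction introduced by this commit.
    return FusedMoEQuantConfig.make(
        quant_dtype=args.quant_dtype,
        per_act_token_quant=args.per_token_quantized_activations,
        per_out_ch_quant=args.per_channel_quantized_weights,
    )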
@@ -28,7 +28,13 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     RoutingMethodType,
 )
-from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
+from vllm.utils.import_utils import (
+    has_aiter,
+    has_deep_ep,
+    has_deep_gemm,
+    has_mori,
+    has_pplx,
+)
 
 from .mk_objects import (
     TestMoEQuantConfig,
@@ -211,6 +217,14 @@ class Config:
             or info.backend == "deepep_low_latency"
         )
 
+    def needs_aiter(self):
+        info = expert_info(self.fused_experts_type)
+        return info.needs_aiter
+
+    def needs_mori(self):
+        info = prepare_finalize_info(self.prepare_finalize_type)
+        return info.backend == "mori"
+
     def all2all_backend(self):
         info = prepare_finalize_info(self.prepare_finalize_type)
         return info.backend
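The two new predicates consult different registries: needs_aiter() reads an ExpertInfo flag keyed by the fused-experts implementation, while needs_mori() matches the backend string of the prepare/finalize registration, alongside the existing all2all_backend() accessor. A hedged sketch of a caller branching on them (describe() is illustrative and not part of the diff; cfg stands for a Config instance as extended above):

def describe(cfg) -> str:
    # Illustrative helper, not part of the diff.
    parts = [f"all2all backend: {cfg.all2all_backend()}"]
    if cfg.needs_aiter():
        parts.append("requires ROCm AITER fused-expert kernels")
    if cfg.needs_mori():
        parts.append("requires the MoRI EP all2all backend")
    return "; ".join(parts)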
@@ -278,6 +292,10 @@ class Config:
             return False, "Needs DeepGEMM, but DeepGEMM not available."
         if self.needs_pplx() and not has_pplx():  # noqa: SIM103
             return False, "Needs PPLX, but PPLX not available."
+        if self.needs_aiter() and not has_aiter():  # noqa: SIM103
+            return False, "Needs Aiter, but Aiter not available."
+        if self.needs_mori() and not has_mori():  # noqa: SIM103
+            return False, "Needs MoRI, but MoRI not available."
 
         return True, None
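Each needs_*() predicate above is paired with a has_*() probe imported from vllm.utils.import_utils in the earlier hunks. The probes' implementation is outside this diff; as a rule such helpers reduce to an importability check, roughly as follows (hedged sketch, assuming the MoRI package imports as mori; the real has_mori() may cache its result or validate more):

import importlib.util


def has_mori() -> bool:
    # Sketch of an import probe in the style of vllm.utils.import_utils.
    return importlib.util.find_spec("mori") is not None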
@@ -37,7 +37,13 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import is_deep_gemm_supported
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
-from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
+from vllm.utils.import_utils import (
+    has_aiter,
+    has_deep_ep,
+    has_deep_gemm,
+    has_mori,
+    has_pplx,
+)
 
 
 @dataclass
@@ -66,6 +72,7 @@ class ExpertInfo:
     supports_expert_map: bool
     needs_matching_quant: bool = False
     needs_deep_gemm: bool = False
+    needs_aiter: bool = False
 
 
 PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize, PrepareFinalizeInfo] = {}
@@ -126,6 +133,7 @@ def register_experts(
     supports_expert_map: bool,
     needs_matching_quant: bool = False,
     needs_deep_gemm: bool = False,
+    needs_aiter: bool = False,
 ):
     global EXPERT_INFO
     global MK_FUSED_EXPERT_TYPES
@@ -139,6 +147,7 @@ def register_experts(
         supports_expert_map,
         needs_matching_quant,
         needs_deep_gemm,
+        needs_aiter,
     )
 
     MK_FUSED_EXPERT_TYPES.append(kind)
@@ -218,6 +227,20 @@ if has_deep_ep() and not current_platform.has_device_capability(100):
         backend="deepep_low_latency",
     )
 
+if has_mori():
+    from vllm.model_executor.layers.fused_moe.mori_prepare_finalize import (
+        MoriPrepareAndFinalize,
+    )
+
+    register_prepare_and_finalize(
+        MoriPrepareAndFinalize,
+        standard_format,
+        fp8_types,
+        blocked_quantization_support=True,
+        backend="mori",
+        supports_apply_weight_on_input=False,
+    )
+
 if has_pplx():
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
         PplxPrepareAndFinalize,
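The block above records MoriPrepareAndFinalize under backend="mori" with the standard activation format, fp8 types, blocked-quantization support, and apply-weight-on-input disabled. Since register_prepare_and_finalize fills the PREPARE_FINALIZE_INFO mapping declared earlier in this diff, resolving the classes behind a backend name is a lookup; a sketch written as if it lived in mk_objects.py (kinds_for_backend is illustrative, not from the commit):

def kinds_for_backend(backend: str) -> list:
    # PREPARE_FINALIZE_INFO: registered prepare/finalize class -> its info record.
    return [
        kind
        for kind, info in PREPARE_FINALIZE_INFO.items()
        if info.backend == backend
    ]


# After this hunk, kinds_for_backend("mori") yields [MoriPrepareAndFinalize]
# whenever has_mori() was true at import time.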
@@ -261,6 +284,25 @@ if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability
     )
 else:
     FlashInferCutlassMoEPrepareAndFinalize = None
     FlashInferExperts = None
 
 
+if has_aiter():
+    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        AiterExperts,
+    )
+
+    register_experts(
+        AiterExperts,
+        standard_format,
+        fp8_types,
+        blocked_quantization_support=True,
+        supports_chunking=True,
+        supports_expert_map=True,
+        needs_aiter=True,
+    )
+else:
+    AiterExperts = None
+
 if has_deep_gemm() and is_deep_gemm_supported():
     register_experts(
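This hunk follows the file's guard idiom: import and register a backend only when its dependency is present, otherwise bind the name to None so callers can test availability without re-probing imports. A hypothetical consumer of that fallback, written pytest-style as if in the same module (illustrative only; the harness in this diff reports (False, reason) tuples instead of skipping):

import pytest


# AiterExperts is None exactly when has_aiter() was false at import time,
# so this skips cleanly on hosts without the ROCm AITER package.
@pytest.mark.skipif(AiterExperts is None, reason="AITER not available")
def test_aiter_experts_registered():
    assert AiterExperts in MK_FUSED_EXPERT_TYPES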
@@ -316,6 +358,9 @@ if cutlass_fp8_supported():
         supports_chunking=False,
         supports_expert_map=False,
     )
+else:
+    CutlassBatchedExpertsFp8 = None
+    CutlassExpertsFp8 = None
 
 if cutlass_fp4_supported():
     from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4
@@ -328,6 +373,8 @@ if cutlass_fp4_supported():
         supports_chunking=True,
         supports_expert_map=False,
     )
+else:
+    CutlassExpertsFp4 = None
 
 MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
     None,