[AMD][ROCm] MoRI EP: a high-performance all2all backend (#28664)

Signed-off-by: Alex Sun <alex.s@amd.com>
This commit is contained in:
Alex Sun
2026-01-22 16:33:18 +08:00
committed by GitHub
parent 2b8a38b6d6
commit 49a1262267
16 changed files with 397 additions and 9 deletions

View File

@@ -37,7 +37,13 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import is_deep_gemm_supported
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils.import_utils import (
has_aiter,
has_deep_ep,
has_deep_gemm,
has_mori,
has_pplx,
)
@dataclass
@@ -66,6 +72,7 @@ class ExpertInfo:
supports_expert_map: bool
needs_matching_quant: bool = False
needs_deep_gemm: bool = False
needs_aiter: bool = False
PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize, PrepareFinalizeInfo] = {}
@@ -126,6 +133,7 @@ def register_experts(
supports_expert_map: bool,
needs_matching_quant: bool = False,
needs_deep_gemm: bool = False,
needs_aiter: bool = False,
):
global EXPERT_INFO
global MK_FUSED_EXPERT_TYPES
@@ -139,6 +147,7 @@ def register_experts(
supports_expert_map,
needs_matching_quant,
needs_deep_gemm,
needs_aiter,
)
MK_FUSED_EXPERT_TYPES.append(kind)
@@ -218,6 +227,20 @@ if has_deep_ep() and not current_platform.has_device_capability(100):
backend="deepep_low_latency",
)
# NOTE(review): this span is a diff-hunk excerpt from a scraped commit page —
# the original leading indentation was stripped by the rendering, so the block
# structure below is flattened; the logical nesting is under `if has_mori():`.
if has_mori():
# Lazy import: only pull in the MoRI backend when the mori package is present.
from vllm.model_executor.layers.fused_moe.mori_prepare_finalize import (
MoriPrepareAndFinalize,
)
# Register the MoRI all2all prepare/finalize backend for fp8 types with
# blocked-quantization support; applying weights on input is not supported
# (supports_apply_weight_on_input=False).
register_prepare_and_finalize(
MoriPrepareAndFinalize,
standard_format,
fp8_types,
blocked_quantization_support=True,
backend="mori",
supports_apply_weight_on_input=False,
)
if has_pplx():
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize,
@@ -261,6 +284,25 @@ if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability
)
else:
FlashInferCutlassMoEPrepareAndFinalize = None
FlashInferExperts = None
# NOTE(review): diff-hunk excerpt — indentation was flattened by the page
# scrape; the imports and register call are logically nested under the `if`.
if has_aiter():
# Lazy import: only load the ROCm AITER experts when AITER is available.
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
AiterExperts,
)
# Register AiterExperts for fp8 types with blocked quantization, chunking,
# and expert-map support; needs_aiter marks the AITER runtime requirement
# (matches the new ExpertInfo field added in this commit).
register_experts(
AiterExperts,
standard_format,
fp8_types,
blocked_quantization_support=True,
supports_chunking=True,
supports_expert_map=True,
needs_aiter=True,
)
else:
# AITER unavailable: export a None placeholder, mirroring the
# FlashInferExperts = None pattern used earlier in this diff.
AiterExperts = None
if has_deep_gemm() and is_deep_gemm_supported():
register_experts(
@@ -316,6 +358,9 @@ if cutlass_fp8_supported():
supports_chunking=False,
supports_expert_map=False,
)
else:
CutlassBatchedExpertsFp8 = None
CutlassExpertsFp8 = None
if cutlass_fp4_supported():
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4
@@ -328,6 +373,8 @@ if cutlass_fp4_supported():
supports_chunking=True,
supports_expert_map=False,
)
else:
CutlassExpertsFp4 = None
MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
None,