[AMD][ROCm] MoRI EP: a high-performance all2all backend (#28664)
Signed-off-by: Alex Sun <alex.s@amd.com>
@@ -141,7 +141,7 @@ def make_config(args: argparse.Namespace) -> Config:
     quant_config = None
     if args.quant_dtype is not None:
-        quant_config = FusedMoEQuantConfig(
+        quant_config = FusedMoEQuantConfig.make(
             quant_dtype=args.quant_dtype,
             per_act_token_quant=args.per_token_quantized_activations,
             per_out_ch_quant=args.per_channel_quantized_weights,
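Note on the hunk above: the only change is swapping the direct FusedMoEQuantConfig constructor for the FusedMoEQuantConfig.make(...) factory; the keyword arguments are unchanged. A minimal self-contained sketch of the updated call site (build_quant_config is an illustrative wrapper, not code from this commit; args is assumed to be the argparse namespace that make_config receives):

import argparse

from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig


def build_quant_config(args: argparse.Namespace) -> FusedMoEQuantConfig | None:
    # Mirrors the hunk: no quant config unless a quant dtype was requested.
    if args.quant_dtype is None:
        return None
    # Factory-style construction introduced by this commit.
    return FusedMoEQuantConfig.make(
        quant_dtype=args.quant_dtype,
        per_act_token_quant=args.per_token_quantized_activations,
        per_out_ch_quant=args.per_channel_quantized_weights,
    )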
@@ -28,7 +28,13 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     RoutingMethodType,
 )
-from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
+from vllm.utils.import_utils import (
+    has_aiter,
+    has_deep_ep,
+    has_deep_gemm,
+    has_mori,
+    has_pplx,
+)
 
 from .mk_objects import (
     TestMoEQuantConfig,
@@ -211,6 +217,14 @@ class Config:
             or info.backend == "deepep_low_latency"
         )
 
+    def needs_aiter(self):
+        info = expert_info(self.fused_experts_type)
+        return info.needs_aiter
+
+    def needs_mori(self):
+        info = prepare_finalize_info(self.prepare_finalize_type)
+        return info.backend == "mori"
+
     def all2all_backend(self):
         info = prepare_finalize_info(self.prepare_finalize_type)
         return info.backend
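The two new predicates consult different registries: needs_aiter() reads an ExpertInfo flag keyed by the fused-experts implementation, while needs_mori() matches the backend string of the prepare/finalize registration, alongside the existing all2all_backend() accessor. A hedged sketch of a caller branching on them (describe() is illustrative and not part of the diff; cfg stands for a Config instance as extended above):

def describe(cfg) -> str:
    # Illustrative helper, not part of the diff.
    parts = [f"all2all backend: {cfg.all2all_backend()}"]
    if cfg.needs_aiter():
        parts.append("requires ROCm AITER fused-expert kernels")
    if cfg.needs_mori():
        parts.append("requires the MoRI EP all2all backend")
    return "; ".join(parts)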
@@ -278,6 +292,10 @@ class Config:
             return False, "Needs DeepGEMM, but DeepGEMM not available."
         if self.needs_pplx() and not has_pplx():  # noqa: SIM103
             return False, "Needs PPLX, but PPLX not available."
+        if self.needs_aiter() and not has_aiter():  # noqa: SIM103
+            return False, "Needs Aiter, but Aiter not available."
+        if self.needs_mori() and not has_mori():  # noqa: SIM103
+            return False, "Needs MoRI, but MoRI not available."
 
         return True, None
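Each needs_*() predicate above is paired with a has_*() probe imported from vllm.utils.import_utils in the earlier hunks. The probes' implementation is outside this diff; as a rule such helpers reduce to an importability check, roughly as follows (hedged sketch, assuming the MoRI package imports as mori; the real has_mori() may cache its result or validate more):

import importlib.util


def has_mori() -> bool:
    # Sketch of an import probe in the style of vllm.utils.import_utils.
    return importlib.util.find_spec("mori") is not None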
@@ -37,7 +37,13 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import is_deep_gemm_supported
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
-from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
+from vllm.utils.import_utils import (
+    has_aiter,
+    has_deep_ep,
+    has_deep_gemm,
+    has_mori,
+    has_pplx,
+)
 
 
 @dataclass
@@ -66,6 +72,7 @@ class ExpertInfo:
     supports_expert_map: bool
     needs_matching_quant: bool = False
     needs_deep_gemm: bool = False
+    needs_aiter: bool = False
 
 
 PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize, PrepareFinalizeInfo] = {}
@@ -126,6 +133,7 @@ def register_experts(
     supports_expert_map: bool,
     needs_matching_quant: bool = False,
     needs_deep_gemm: bool = False,
+    needs_aiter: bool = False,
 ):
     global EXPERT_INFO
     global MK_FUSED_EXPERT_TYPES
@@ -139,6 +147,7 @@ def register_experts(
         supports_expert_map,
         needs_matching_quant,
         needs_deep_gemm,
+        needs_aiter,
     )
 
     MK_FUSED_EXPERT_TYPES.append(kind)
@@ -218,6 +227,20 @@ if has_deep_ep() and not current_platform.has_device_capability(100):
         backend="deepep_low_latency",
     )
 
+if has_mori():
+    from vllm.model_executor.layers.fused_moe.mori_prepare_finalize import (
+        MoriPrepareAndFinalize,
+    )
+
+    register_prepare_and_finalize(
+        MoriPrepareAndFinalize,
+        standard_format,
+        fp8_types,
+        blocked_quantization_support=True,
+        backend="mori",
+        supports_apply_weight_on_input=False,
+    )
+
 if has_pplx():
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
         PplxPrepareAndFinalize,
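The block above records MoriPrepareAndFinalize under backend="mori" with the standard activation format, fp8 types, blocked-quantization support, and apply-weight-on-input disabled. Since register_prepare_and_finalize fills the PREPARE_FINALIZE_INFO mapping declared earlier in this diff, resolving the classes behind a backend name is a lookup; a sketch written as if it lived in mk_objects.py (kinds_for_backend is illustrative, not from the commit):

def kinds_for_backend(backend: str) -> list:
    # PREPARE_FINALIZE_INFO: registered prepare/finalize class -> its info record.
    return [
        kind
        for kind, info in PREPARE_FINALIZE_INFO.items()
        if info.backend == backend
    ]


# After this hunk, kinds_for_backend("mori") yields [MoriPrepareAndFinalize]
# whenever has_mori() was true at import time.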
@@ -261,6 +284,25 @@ if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability
     )
 else:
     FlashInferCutlassMoEPrepareAndFinalize = None
     FlashInferExperts = None
 
 
+if has_aiter():
+    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        AiterExperts,
+    )
+
+    register_experts(
+        AiterExperts,
+        standard_format,
+        fp8_types,
+        blocked_quantization_support=True,
+        supports_chunking=True,
+        supports_expert_map=True,
+        needs_aiter=True,
+    )
+else:
+    AiterExperts = None
+
 if has_deep_gemm() and is_deep_gemm_supported():
     register_experts(
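This hunk follows the file's guard idiom: import and register a backend only when its dependency is present, otherwise bind the name to None so callers can test availability without re-probing imports. A hypothetical consumer of that fallback, written pytest-style as if in the same module (illustrative only; the harness in this diff reports (False, reason) tuples instead of skipping):

import pytest


# AiterExperts is None exactly when has_aiter() was false at import time,
# so this skips cleanly on hosts without the ROCm AITER package.
@pytest.mark.skipif(AiterExperts is None, reason="AITER not available")
def test_aiter_experts_registered():
    assert AiterExperts in MK_FUSED_EXPERT_TYPES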
@@ -316,6 +358,9 @@ if cutlass_fp8_supported():
         supports_chunking=False,
         supports_expert_map=False,
     )
+else:
+    CutlassBatchedExpertsFp8 = None
+    CutlassExpertsFp8 = None
 
 if cutlass_fp4_supported():
     from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4
@@ -328,6 +373,8 @@ if cutlass_fp4_supported():
         supports_chunking=True,
         supports_expert_map=False,
     )
+else:
+    CutlassExpertsFp4 = None
 
 MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
     None,