diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-BF16-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-BF16-triton.yaml
new file mode 100644
index 000000000..671f1b49e
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-BF16-triton.yaml
@@ -0,0 +1,5 @@
+model_name: "Qwen/Qwen3-30B-A3B"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
index 53e2fa8a7..dfa67c76e 100644
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
@@ -8,3 +8,4 @@ Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
 Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
 Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
 Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
+Qwen3-30B-A3B-BF16-triton.yaml
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml
new file mode 100644
index 000000000..fe099f9f1
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml
@@ -0,0 +1,7 @@
+model_name: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP16: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-triton.yaml
new file mode 100644
index 000000000..657101180
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-triton.yaml
@@ -0,0 +1,6 @@
+model_name: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml
new file mode 100644
index 000000000..5f4a76b0a
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml
@@ -0,0 +1,7 @@
+model_name: "mistralai/Mixtral-8x7B-v0.1"
+accuracy_threshold: 0.58
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP16: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-triton.yaml
new file mode 100644
index 000000000..886b17616
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-triton.yaml
@@ -0,0 +1,5 @@
+model_name: "mistralai/Mixtral-8x7B-v0.1"
+accuracy_threshold: 0.58
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-fi-cutlass.yaml
new file mode 100644
index 000000000..b15126a45
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-fi-cutlass.yaml
@@ -0,0 +1,7 @@
+model_name: "Qwen/Qwen3-30B-A3B"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP16: "1"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-triton.yaml
new file mode 100644
index 000000000..82286c0e3
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-triton.yaml
@@ -0,0 +1,5 @@
+model_name: "Qwen/Qwen3-30B-A3B"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
index 9d86e432e..95e91fb08 100644
--- a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
+++ b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
@@ -11,3 +11,7 @@ Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml
 Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
 Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
 Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml
+Llama-4-Scout-BF16-fi-cutlass.yaml
+Llama-4-Scout-BF16-triton.yaml
+Mixtral-8x7B-BF16-fi-cutlass.yaml
+Mixtral-8x7B-BF16-triton.yaml
diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
index 2c25ea2c2..6354deded 100644
--- a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
+++ b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
@@ -11,3 +11,5 @@ Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml
 Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
 Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
 Llama-4-Scout-Fp8-ModelOpt-triton.yaml
+Qwen3-30B-A3B-BF16-fi-cutlass.yaml
+Qwen3-30B-A3B-BF16-triton.yaml
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
new file mode 100644
index 000000000..ed182613e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from enum import Enum
+
+import torch
+from torch.nn import Module
+
+import vllm.envs as envs
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    swap_w13_to_w31,
+)
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+
+logger = init_logger(__name__)
+
+
+class UnquantizedMoeBackend(Enum):
+    FLASHINFER_CUTLASS = "FlashInfer CUTLASS"
+    AITER = "ROCm AITER"
+    TRITON = "TRITON"
+    CPU = "CPU"
+    XPU = "XPU"
+
+
+# NOTE(zyongye): "Unsupported" backends are backends that do not
+# conform to the modular kernel format.
+# We call the kernel directly for those backends.
+UNSUPPORTED_BACKEND = [
+    UnquantizedMoeBackend.CPU,
+    UnquantizedMoeBackend.XPU,
+]
+
+
+def select_unquantized_moe_backend(
+    use_ep: bool,
+    use_dp: bool,
+) -> UnquantizedMoeBackend:
+    """
+    Select the primary unquantized MoE backend.
+    Note: Shape-specific fallbacks may still occur at runtime.
+    """
+
+    def _make_log_backend(backend: UnquantizedMoeBackend):
+        return f"Using {backend.value} backend for Unquantized MoE"
+
+    rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+
+    # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUs
+    flashinfer_cutlass_moe_enabled = (
+        has_flashinfer_cutlass_fused_moe()
+        and envs.VLLM_USE_FLASHINFER_MOE_FP16
+        and use_ep
+        and (not use_dp)
+        and current_platform.get_device_capability()[0] >= 9
+    )
+    if current_platform.is_rocm():
+        if rocm_aiter_moe_enabled:
+            backend = UnquantizedMoeBackend.AITER
+        else:
+            backend = UnquantizedMoeBackend.TRITON
+    if current_platform.is_cuda():
+        if flashinfer_cutlass_moe_enabled:
+            backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
+        else:
+            if use_ep and (not use_dp):
+                logger.info_once(
+                    "FlashInfer CUTLASS MoE is available for EP"
+                    " but not enabled, consider setting"
+                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
+                    scope="local",
+                )
+            elif use_dp:
+                logger.info_once(
+                    "FlashInfer CUTLASS MoE is currently not available for DP.",
+                    scope="local",
+                )
+            backend = UnquantizedMoeBackend.TRITON
+    if current_platform.is_xpu():
+        backend = UnquantizedMoeBackend.XPU
+    if current_platform.is_cpu():
+        backend = UnquantizedMoeBackend.CPU
+
+    logger.info_once(_make_log_backend(backend), scope="local")
+    return backend
+
+
+def convert_to_unquantized_kernel_format(
+    unquantized_backend: UnquantizedMoeBackend,
+    layer: Module,
+    w13_weight: torch.Tensor | None = None,
+    w2_weight: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if unquantized_backend == UnquantizedMoeBackend.AITER:
+        w13_weight, w2_weight = rocm_aiter_ops.shuffle_weights(
+            layer.w13_weight.data, layer.w2_weight.data
+        )
+
+    elif unquantized_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS:
+        # Swap halves to arrange as [w3; w1] (kernel expectation)
+        w13_weight = swap_w13_to_w31(layer.w13_weight.data)
+
+    return w13_weight, w2_weight
+
+
+def make_unquantized_moe_kernel(
+    layer: torch.nn.Module,
+    backend: UnquantizedMoeBackend,
+    quant_config: FusedMoEQuantConfig,
+    moe_config: FusedMoEConfig,
+) -> tuple[mk.FusedMoEModularKernel | None, bool]:
+    use_inplace = True
+
+    if backend in UNSUPPORTED_BACKEND:
+        return None, use_inplace
+
+    if backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS:
+        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+            FlashInferExperts,
+        )
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            FlashInferExperts(
+                out_dtype=layer.params_dtype,
+                quant_config=quant_config,
+                tp_rank=moe_config.moe_parallel_config.tp_rank,
+                tp_size=moe_config.moe_parallel_config.tp_size,
+                ep_rank=moe_config.moe_parallel_config.ep_rank,
+                ep_size=moe_config.moe_parallel_config.ep_size,
+            ),
+        )
+        use_inplace = False
+    elif backend == UnquantizedMoeBackend.AITER:
+        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+            AiterExperts,
+        )
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            AiterExperts(quant_config),
+        )
+    elif backend == UnquantizedMoeBackend.TRITON:
+        from vllm.model_executor.layers.fused_moe import TritonExperts
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            TritonExperts(quant_config),
+        )
+    return kernel, use_inplace
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index c31c0223e..40a009e4b 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -4,10 +4,10 @@
 
 import torch
 import torch.nn.functional as F
+from torch.nn import Module
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.fused_moe.config import (
@@ -16,9 +16,6 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     biased_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    FlashInferExperts,
-)
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
@@ -28,19 +25,15 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEPermuteExpertsUnpermute,
     FusedMoEPrepareAndFinalize,
 )
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-    AiterExperts,
-)
-from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    swap_w13_to_w31,
+from vllm.model_executor.layers.fused_moe.oracle.unquantized import (
+    UnquantizedMoeBackend,
+    convert_to_unquantized_kernel_format,
+    make_unquantized_moe_kernel,
+    select_unquantized_moe_backend,
 )
 from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
-from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
 if current_platform.is_cuda_alike():
     from .fused_batched_moe import BatchedTritonExperts
@@ -57,41 +50,13 @@ logger = init_logger(__name__)
 
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
-    # --8<-- [end:unquantized_fused_moe]
-
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
-
-        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
-
-        # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
-        self.flashinfer_cutlass_moe_enabled = (
-            has_flashinfer_cutlass_fused_moe()
-            and envs.VLLM_USE_FLASHINFER_MOE_FP16
-            and self.moe.moe_parallel_config.use_ep
-            and self.moe.moe_parallel_config.dp_size == 1
-            and current_platform.get_device_capability()[0] >= 9
+        self.unquantized_backend = select_unquantized_moe_backend(
+            use_ep=self.moe.moe_parallel_config.use_ep,
+            use_dp=self.moe.moe_parallel_config.dp_size > 1,
         )
-        if self.flashinfer_cutlass_moe_enabled:
-            logger.info_once(
-                "Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod"
-            )
-        else:
-            if (
-                self.moe.moe_parallel_config.use_ep
-                and self.moe.moe_parallel_config.dp_size == 1
-            ):
-                logger.info_once(
-                    "FlashInfer CUTLASS MoE is available for EP"
-                    " but not enabled, consider setting"
-                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
-                    scope="local",
-                )
-            elif self.moe.moe_parallel_config.dp_size > 1:
-                logger.info_once(
-                    "FlashInfer CUTLASS MoE is currently not available for DP.",
-                    scope="local",
-                )
+        self.kernel: mk.FusedMoEModularKernel | None = None
 
     @property
     def supports_eplb(self) -> bool:
@@ -105,7 +70,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         self,
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> FusedMoEPrepareAndFinalize | None:
-        if self.rocm_aiter_moe_enabled:
+        if self.unquantized_backend == UnquantizedMoeBackend.AITER:
             return None
         else:
            return super().maybe_make_prepare_finalize(routing_tables)
@@ -197,6 +162,33 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
 
         return weight
 
+    def _setup_kernel(
+        self,
+        layer: Module,
+        w13: torch.Tensor,
+        w2: torch.Tensor,
+    ) -> None:
+        # Shuffle weights to runtime format.
+        w13, w2 = convert_to_unquantized_kernel_format(
+            self.unquantized_backend,
+            layer=layer,
+            w13_weight=w13,
+            w2_weight=w2,
+        )
+        replace_parameter(layer, "w13_weight", w13)
+        replace_parameter(layer, "w2_weight", w2)
+
+        # Set up the modular kernel for the TP case.
+        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+        assert self.moe_quant_config is not None
+
+        self.kernel, self.use_inplace = make_unquantized_moe_kernel(
+            layer=layer,
+            backend=self.unquantized_backend,
+            quant_config=self.moe_quant_config,
+            moe_config=self.moe,
+        )
+
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         super().process_weights_after_loading(layer)
 
@@ -204,7 +196,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
         layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
 
-        if current_platform.is_xpu():
+        if self.unquantized_backend == UnquantizedMoeBackend.XPU:
             import intel_extension_for_pytorch as ipex
 
             ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
@@ -214,7 +206,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 use_prepack=True,
                 experts_start_id=ep_rank_start,
             )
-        elif current_platform.is_cpu():
+        elif self.unquantized_backend == UnquantizedMoeBackend.CPU:
             from vllm.model_executor.layers.fused_moe import cpu_fused_moe
 
             if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
@@ -246,45 +238,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             else:
                 layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
         elif current_platform.is_cuda_alike():
-            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
-            if self.rocm_aiter_moe_enabled:
-                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
-                    layer.w13_weight.data, layer.w2_weight.data
-                )
-                replace_parameter(layer, "w13_weight", shuffled_w13)
-                replace_parameter(layer, "w2_weight", shuffled_w2)
-
-                self.use_inplace = True
-                self.kernel = mk.FusedMoEModularKernel(
-                    MoEPrepareAndFinalizeNoEP(),
-                    AiterExperts(self.moe_quant_config),
-                    shared_experts=None,
-                )
-
-            elif self.flashinfer_cutlass_moe_enabled:
-                self.use_inplace = False
-                # Swap halves to arrange as [w3; w1] (kernel expectation)
-                w13_weight = swap_w13_to_w31(layer.w13_weight.data)
-                replace_parameter(layer, "w13_weight", w13_weight)
-
-                self.kernel = mk.FusedMoEModularKernel(
-                    MoEPrepareAndFinalizeNoEP(),
-                    FlashInferExperts(
-                        out_dtype=layer.params_dtype,
-                        quant_config=self.moe_quant_config,
-                        tp_rank=self.moe.moe_parallel_config.tp_rank,
-                        tp_size=self.moe.moe_parallel_config.tp_size,
-                        ep_rank=self.moe.moe_parallel_config.ep_rank,
-                        ep_size=self.moe.moe_parallel_config.ep_size,
-                    ),
-                )
-            else:
-                self.use_inplace = True
-                self.kernel = mk.FusedMoEModularKernel(
-                    MoEPrepareAndFinalizeNoEP(),
-                    TritonExperts(self.moe_quant_config),
-                    shared_experts=None,
-                )
+            self._setup_kernel(
+                layer=layer,
+                w13=layer.w13_weight,
+                w2=layer.w2_weight,
+            )
 
     def apply(
         self,
@@ -316,6 +274,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        assert self.kernel
+
         topk_weights, topk_ids = router.select_experts(
             hidden_states=x,
             router_logits=router_logits,