[XPU][2/N] Add unquantized MoE support for XPU (#33659)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
@@ -100,6 +100,9 @@ if HAS_TRITON:
     from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
         TritonOrDeepGemmExperts,
     )
+    from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
+        XPUExperts,
+    )
 
     __all__ += [
         "AiterExperts",
@@ -117,6 +120,7 @@ if HAS_TRITON:
         "DeepGemmExperts",
         "BatchedDeepGemmExperts",
         "TritonOrDeepGemmExperts",
+        "XPUExperts",
     ]
 else:
     # Some model classes directly use the custom ops. Add placeholders
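With these two hunks, `XPUExperts` is re-exported from the package root whenever Triton is available (per the `HAS_TRITON` guard above). The kernel-construction hunk further down relies on exactly this import:

```python
# Available after this commit, under the HAS_TRITON guard:
from vllm.model_executor.layers.fused_moe import XPUExperts
```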
@@ -46,7 +46,6 @@ class UnquantizedMoeBackend(Enum):
 UNSUPPORTED_BACKEND = [
     UnquantizedMoeBackend.FLASHINFER_TRTLLM,
     UnquantizedMoeBackend.CPU,
-    UnquantizedMoeBackend.XPU,
     UnquantizedMoeBackend.TPU,
     UnquantizedMoeBackend.OOT,
 ]
@@ -196,4 +195,14 @@ def make_unquantized_moe_kernel(
                 quant_config=quant_config,
             ),
         )
+    elif backend == UnquantizedMoeBackend.XPU:
+        from vllm.model_executor.layers.fused_moe import XPUExperts
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            XPUExperts(
+                moe_config=moe_config,
+                quant_config=quant_config,
+            ),
+        )
     return kernel, use_inplace
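For orientation, a minimal sketch of what the new branch assembles. The `MoEPrepareAndFinalizeNoEP` import path is an assumption for this sketch (in the hunk the name is already in scope in the module); everything else is taken verbatim from the diff:

```python
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe import XPUExperts
# Import path assumed for this sketch; the diff uses a name already in scope.
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP,
)


def build_xpu_moe_kernel(moe_config, quant_config) -> mk.FusedMoEModularKernel:
    # Mirrors the new UnquantizedMoeBackend.XPU branch: a no-EP
    # prepare/finalize stage wrapped around the XPU expert kernels.
    return mk.FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        XPUExperts(moe_config=moe_config, quant_config=quant_config),
    )
```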
@@ -40,7 +40,7 @@ from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 
-if current_platform.is_cuda_alike():
+if current_platform.is_cuda_alike() or current_platform.is_xpu():
     from .fused_batched_moe import BatchedTritonExperts
     from .fused_moe import TritonExperts
 else:
@@ -71,7 +71,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         self.kernel: mk.FusedMoEModularKernel | None = None
         self._is_monolithic = (
             current_platform.is_cpu()
-            or current_platform.is_xpu()
             or self.unquantized_backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM
         )
 
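The effect of this one-line removal is easiest to see in isolation. A self-contained sketch of the predicate after the change, using hypothetical stand-ins for the platform check and backend enum (not vLLM code):

```python
from enum import Enum


class Backend(Enum):  # hypothetical stand-in for UnquantizedMoeBackend
    TRITON = "triton"
    XPU = "xpu"
    FLASHINFER_TRTLLM = "flashinfer_trtllm"


def is_monolithic(is_cpu: bool, backend: Backend) -> bool:
    # After this diff, XPU no longer forces the monolithic path;
    # only CPU and the FlashInfer-TRTLLM backend do.
    return is_cpu or backend == Backend.FLASHINFER_TRTLLM


assert not is_monolithic(False, Backend.XPU)  # XPU now uses the modular kernel
assert is_monolithic(False, Backend.FLASHINFER_TRTLLM)
```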
@@ -82,8 +81,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         """Select the monolithic implementation based on platform."""
         if current_platform.is_cpu():
             return self.forward_monolithic_cpu
-        elif current_platform.is_xpu():
-            return self.forward_monolithic_xpu
         else:
             return self.forward_monolithic_cuda
 
@@ -256,16 +253,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             )
             layer.w13_weight = Parameter(w13_weights_shuffled, requires_grad=False)
             layer.w2_weight = Parameter(w2_weights_shuffled, requires_grad=False)
-        elif self.unquantized_backend == UnquantizedMoeBackend.XPU:
-            import intel_extension_for_pytorch as ipex
-
-            ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
-            self.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
-                layer.w13_weight,
-                layer.w2_weight,
-                use_prepack=True,
-                experts_start_id=ep_rank_start,
-            )
         elif self.unquantized_backend == UnquantizedMoeBackend.CPU:
             from vllm.model_executor.layers.fused_moe import cpu_fused_moe
 
@@ -297,7 +284,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
             else:
                 self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
-        elif current_platform.is_cuda_alike():
+        elif current_platform.is_cuda_alike() or current_platform.is_xpu():
             self._setup_kernel(
                 layer=layer,
                 w13=layer.w13_weight,
@@ -399,20 +386,3 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             layer.apply_router_weight_on_input,
             layer.activation,
         )
-
-    def forward_monolithic_xpu(
-        self,
-        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        return self.ipex_fusion(
-            x,
-            layer.use_grouped_topk,
-            layer.top_k,
-            router_logits,
-            layer.renormalize,
-            layer.topk_group,
-            layer.num_expert_group,
-            custom_routing_function=layer.custom_routing_function,
-        )
vllm/model_executor/layers/fused_moe/xpu_fused_moe.py (new file, 120 lines)
@@ -0,0 +1,120 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEParallelConfig,
)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceNoOP,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey,
    kFp8StaticTensorSym,
)
from vllm.platforms import current_platform

if current_platform.is_xpu():
    from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe


class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
    @property
    def expects_unquantized_inputs(self) -> bool:
        return True

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_current_device() -> bool:
        return current_platform.is_xpu()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        return False

    @staticmethod
    def _supports_activation(activation: str) -> bool:
        return activation in ["silu", "gelu", "swigluoai"]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        return True

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        # TODO: dispatch based on device.
        SUPPORTED_W_A = [
            (None, None),
            (kFp8StaticTensorSym, None),
        ]
        return (weight_key, activation_key) in SUPPORTED_W_A

    def supports_chunking(self) -> bool:
        return False

    def supports_expert_map(self) -> bool:
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceNoOP()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: str,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        workspace1 = (0,)
        workspace2 = (0,)
        output = (M, K)
        return (workspace1, workspace2, output)
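
    # Annotation (hedged illustration, not part of the committed file): for
    # M=64 tokens and hidden size K=4096, workspace_shapes returns
    #     ((0,), (0,), (64, 4096))
    # i.e. no intermediate workspaces are requested, because the XPU kernel
    # below writes directly into the (M, K) output buffer.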

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        topk = topk_ids.size(-1)
        xpu_fused_moe(
            hidden_states=hidden_states,
            w13=w1,
            w13_scales=a1q_scale,
            w13_bias=self.w1_bias,
            w2=w2,
            w2_scales=a2_scale,
            w2_bias=self.w2_bias,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            n_experts_per_token=topk,
            activation=activation,
            num_experts=self.moe_config.num_local_experts,
            ep_rank=self.moe_config.ep_rank,
            ep_size=self.moe_config.ep_size,
            output=output,
        )
        return
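
A small, hedged usage check against the class above. These are capability probes only; the methods are staticmethods in the file as committed, so no instance or XPU device is needed:

```python
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExperts
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    kFp8StaticTensorSym,
)

# Activation support as declared by _supports_activation.
assert XPUExperts._supports_activation("silu")
assert XPUExperts._supports_activation("swigluoai")
assert not XPUExperts._supports_activation("relu2")

# Quant schemes: unquantized, or FP8 static per-tensor weights with
# unquantized activations.
assert XPUExperts._supports_quant_scheme(None, None)
assert XPUExperts._supports_quant_scheme(kFp8StaticTensorSym, None)
assert not XPUExperts._supports_quant_scheme(None, kFp8StaticTensorSym)
```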