[XPU][2/N] add unquantized moe support for xpu (#33659)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Kunshang Ji
2026-02-04 18:12:25 +08:00
committed by GitHub
parent 4c8d1bf361
commit f79f777803
6 changed files with 139 additions and 34 deletions


@@ -100,6 +100,9 @@ if HAS_TRITON:
     from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
         TritonOrDeepGemmExperts,
     )
+    from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
+        XPUExperts,
+    )
 
     __all__ += [
         "AiterExperts",
@@ -117,6 +120,7 @@ if HAS_TRITON:
         "DeepGemmExperts",
         "BatchedDeepGemmExperts",
         "TritonOrDeepGemmExperts",
+        "XPUExperts",
     ]
 else:
     # Some model classes directly use the custom ops. Add placeholders
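
With this registration, XPU builds can import the expert class from the package root (still behind the same HAS_TRITON guard), which is exactly how the kernel factory in the next file pulls it in:

from vllm.model_executor.layers.fused_moe import XPUExperts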


@@ -46,7 +46,6 @@ class UnquantizedMoeBackend(Enum):
 UNSUPPORTED_BACKEND = [
     UnquantizedMoeBackend.FLASHINFER_TRTLLM,
     UnquantizedMoeBackend.CPU,
-    UnquantizedMoeBackend.XPU,
     UnquantizedMoeBackend.TPU,
     UnquantizedMoeBackend.OOT,
 ]
@@ -196,4 +195,14 @@ def make_unquantized_moe_kernel(
                 quant_config=quant_config,
             ),
         )
+    elif backend == UnquantizedMoeBackend.XPU:
+        from vllm.model_executor.layers.fused_moe import XPUExperts
+
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            XPUExperts(
+                moe_config=moe_config,
+                quant_config=quant_config,
+            ),
+        )
     return kernel, use_inplace
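
A minimal sketch of what this new branch enables; the factory's full signature is not visible in this diff, so the call shape and keyword names below are assumptions:

# Hypothetical call site for the factory above. On XPU the unquantized
# backend now resolves to a modular kernel instead of being rejected via
# UNSUPPORTED_BACKEND.
kernel, use_inplace = make_unquantized_moe_kernel(
    backend=UnquantizedMoeBackend.XPU,
    moe_config=moe_config,      # the layer's fused-MoE config (assumed in scope)
    quant_config=quant_config,  # None on the unquantized path
)
# kernel is mk.FusedMoEModularKernel(MoEPrepareAndFinalizeNoEP(), XPUExperts(...))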


@@ -40,7 +40,7 @@ from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 
-if current_platform.is_cuda_alike():
+if current_platform.is_cuda_alike() or current_platform.is_xpu():
     from .fused_batched_moe import BatchedTritonExperts
     from .fused_moe import TritonExperts
 else:
@@ -71,7 +71,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         self.kernel: mk.FusedMoEModularKernel | None = None
         self._is_monolithic = (
             current_platform.is_cpu()
-            or current_platform.is_xpu()
             or self.unquantized_backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM
         )
@@ -82,8 +81,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         """Select the monolithic implementation based on platform."""
         if current_platform.is_cpu():
             return self.forward_monolithic_cpu
-        elif current_platform.is_xpu():
-            return self.forward_monolithic_xpu
         else:
             return self.forward_monolithic_cuda
@@ -256,16 +253,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             )
             layer.w13_weight = Parameter(w13_weights_shuffled, requires_grad=False)
             layer.w2_weight = Parameter(w2_weights_shuffled, requires_grad=False)
-        elif self.unquantized_backend == UnquantizedMoeBackend.XPU:
-            import intel_extension_for_pytorch as ipex
-
-            ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
-            self.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
-                layer.w13_weight,
-                layer.w2_weight,
-                use_prepack=True,
-                experts_start_id=ep_rank_start,
-            )
         elif self.unquantized_backend == UnquantizedMoeBackend.CPU:
             from vllm.model_executor.layers.fused_moe import cpu_fused_moe
@@ -297,7 +284,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
             else:
                 self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
-        elif current_platform.is_cuda_alike():
+        elif current_platform.is_cuda_alike() or current_platform.is_xpu():
             self._setup_kernel(
                 layer=layer,
                 w13=layer.w13_weight,
@@ -399,20 +386,3 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             layer.apply_router_weight_on_input,
             layer.activation,
         )
-
-    def forward_monolithic_xpu(
-        self,
-        layer: "FusedMoE",  # type: ignore[name-defined]  # noqa: F821
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        return self.ipex_fusion(
-            x,
-            layer.use_grouped_topk,
-            layer.top_k,
-            router_logits,
-            layer.renormalize,
-            layer.topk_group,
-            layer.num_expert_group,
-            custom_routing_function=layer.custom_routing_function,
-        )
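
Net effect of these hunks: the monolithic IPEX GatedMLPMOE path is removed, and XPU takes the same modular-kernel route as CUDA-alike platforms. A condensed sketch of the resulting dispatch (not the verbatim method body; the elided _setup_kernel arguments follow the hunk above):

if current_platform.is_cpu():
    ...  # CPU keeps its monolithic CPUFusedMOE path
elif current_platform.is_cuda_alike() or current_platform.is_xpu():
    # XPU now builds a modular kernel via _setup_kernel instead of the
    # removed ipex.llm.modules.GatedMLPMOE fusion.
    self._setup_kernel(layer=layer, w13=layer.w13_weight, ...)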


@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEParallelConfig,
+)
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceNoOP,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    kFp8StaticTensorSym,
+)
+from vllm.platforms import current_platform
+
+if current_platform.is_xpu():
+    from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe
+
+
+class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
+    @property
+    def expects_unquantized_inputs(self) -> bool:
+        return True
+
+    @staticmethod
+    def activation_format() -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.Standard
+
+    @staticmethod
+    def _supports_current_device() -> bool:
+        return current_platform.is_xpu()
+
+    @staticmethod
+    def _supports_no_act_and_mul() -> bool:
+        return False
+
+    @staticmethod
+    def _supports_activation(activation: str) -> bool:
+        return activation in ["silu", "gelu", "swigluoai"]
+
+    @staticmethod
+    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
+        return True
+
+    @staticmethod
+    def _supports_quant_scheme(
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        # TODO: dispatch based on device.
+        SUPPORTED_W_A = [
+            (None, None),
+            (kFp8StaticTensorSym, None),
+        ]
+        return (weight_key, activation_key) in SUPPORTED_W_A
+
+    def supports_chunking(self) -> bool:
+        return False
+
+    def supports_expert_map(self) -> bool:
+        return True
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        return TopKWeightAndReduceNoOP()
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: str,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        workspace1 = (0,)
+        workspace2 = (0,)
+        output = (M, K)
+        return (workspace1, workspace2, output)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        topk = topk_ids.size(-1)
+        xpu_fused_moe(
+            hidden_states=hidden_states,
+            w13=w1,
+            w13_scales=a1q_scale,
+            w13_bias=self.w1_bias,
+            w2=w2,
+            w2_scales=a2_scale,
+            w2_bias=self.w2_bias,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            n_experts_per_token=topk,
+            activation=activation,
+            num_experts=self.moe_config.num_local_experts,
+            ep_rank=self.moe_config.ep_rank,
+            ep_size=self.moe_config.ep_size,
+            output=output,
+        )
+        return
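
For reference, a minimal usage sketch mirroring the XPU branch of make_unquantized_moe_kernel earlier in this commit. The MoEPrepareAndFinalizeNoEP import path is an assumption based on the modular-kernel layout, and moe_config / quant_config stand in for the layer's configs:

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe import XPUExperts
# Import path assumed; this is the no-expert-parallel prepare/finalize stage.
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP,
)

kernel = mk.FusedMoEModularKernel(
    MoEPrepareAndFinalizeNoEP(),
    XPUExperts(moe_config=moe_config, quant_config=quant_config),
)
# Forward flow: prepare -> XPUExperts.apply (which calls xpu_fused_moe) -> finalize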