# Patch summary: adds XPU to the platform checks shown in the hunks below.
#
# File vllm/model_executor/layers/fused_moe/fused_moe.py (hunks labelled
# `class TritonExperts`):
#   * `_supports_current_device()` now returns True when either
#     `current_platform.is_cuda_alike()` or `current_platform.is_xpu()` holds.
#   * `device_supports_fp8` gains a third alternative, `p.is_xpu()`, next to
#     the existing `is_rocm_on_gfx9` and
#     `p.is_cuda() and p.has_device_capability((8, 9))` alternatives.
#
# File vllm/model_executor/layers/fused_moe/oracle/fp8.py (hunk labelled
# `_get_priority_backends`):
#   * When `current_platform.is_xpu()` is true, `Fp8MoeBackend.XPU` is moved to
#     the front of `_AVAILABLE_BACKENDS` just before the list is returned,
#     i.e. after the pre-existing `_move_to_front(..., Fp8MoeBackend.TRITON)`
#     branch visible in the context lines.
#
# NOTE(review): the CUDA alternative of `device_supports_fp8` is gated on
# `has_device_capability((8, 9))`, while the new XPU alternative has no
# capability check -- confirm that every XPU device reaching this path
# supports FP8.
# NOTE(review): the hunks below have lost their line breaks and indentation
# (everything sits on one physical line); the original newlines need to be
# restored from the upstream change before this patch can be applied.
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ee321f241..469ff27a2 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1940,7 +1940,7 @@ class TritonExperts(mk.FusedMoEExpertsModular): @staticmethod def _supports_current_device() -> bool: - return current_platform.is_cuda_alike() + return current_platform.is_cuda_alike() or current_platform.is_xpu() @staticmethod def _supports_no_act_and_mul() -> bool: @@ -1959,8 +1959,10 @@ class TritonExperts(mk.FusedMoEExpertsModular): else: is_rocm_on_gfx9 = False - device_supports_fp8 = is_rocm_on_gfx9 or ( - p.is_cuda() and p.has_device_capability((8, 9)) + device_supports_fp8 = ( + is_rocm_on_gfx9 + or (p.is_cuda() and p.has_device_capability((8, 9))) + or p.is_xpu() ) if not device_supports_fp8: diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 0ed159b93..c7b012677 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -94,6 +94,11 @@ def _get_priority_backends( else: _move_to_front(_AVAILABLE_BACKENDS, Fp8MoeBackend.TRITON) + if current_platform.is_xpu(): + # XPU platform supports TritonExperts and XPUExpertsFp8, + # move XPU backend to the front. + _move_to_front(_AVAILABLE_BACKENDS, Fp8MoeBackend.XPU) + return _AVAILABLE_BACKENDS