[XPU][1/N] Deprecate ipex and switch to vllm-xpu-kernels for xpu platform (#33379)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Author: Kunshang Ji
Date: 2026-02-03 14:46:10 +08:00
Committed by: GitHub
Parent: bf001da4bf
Commit: e10604480b
18 changed files with 150 additions and 927 deletions
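Every hunk below applies the same two changes to the activation CustomOps: the XPU branch stops importing ipex_ops and instead binds the shared torch.ops._C custom kernels (expected to be registered on XPU by vllm-xpu-kernels), and each forward_xpu delegates to forward_cuda. A minimal sketch of the resulting dispatch pattern, with the class name and the XPU kernel registration assumed rather than taken from this diff:

# Minimal sketch (not part of the diff): the shared dispatch pattern after this
# change. The XPU registration of torch.ops._C.silu_and_mul is assumed to come
# from vllm-xpu-kernels; the class name here is illustrative only.
import torch

from vllm.platforms import current_platform


class _SiluAndMulSketch:
    def __init__(self) -> None:
        if current_platform.is_cuda_alike() or current_platform.is_xpu():
            # Same out-of-place custom op on CUDA/ROCm and XPU; no ipex_ops import.
            self.op = torch.ops._C.silu_and_mul

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        out = torch.empty(x.shape[:-1] + (d,), dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        # XPU now reuses the CUDA code path instead of a separate ipex implementation.
        return self.forward_cuda(x)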


@@ -129,12 +129,8 @@ class SiluAndMul(CustomOp):
     def __init__(self, *, compile_native: bool = True):
         super().__init__(compile_native=compile_native)
-        if current_platform.is_cuda_alike():
+        if current_platform.is_cuda_alike() or current_platform.is_xpu():
             self.op = torch.ops._C.silu_and_mul
-        elif current_platform.is_xpu():
-            from vllm._ipex_ops import ipex_ops
-            self.op = ipex_ops.silu_and_mul
         elif current_platform.is_cpu():
             self._forward_method = self.forward_native
@@ -152,11 +148,7 @@ class SiluAndMul(CustomOp):
         return out
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        d = x.shape[-1] // 2
-        output_shape = x.shape[:-1] + (d,)
-        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        self.op(out, x)
-        return out
+        return self.forward_cuda(x)
 # --8<-- [start:mul_and_silu]
@@ -175,12 +167,8 @@ class MulAndSilu(CustomOp):
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike():
+        if current_platform.is_cuda_alike() or current_platform.is_xpu():
             self.op = torch.ops._C.mul_and_silu
-        elif current_platform.is_xpu():
-            from vllm._ipex_ops import ipex_ops
-            self.op = ipex_ops.silu_and_mul
         elif current_platform.is_cpu():
             self._forward_method = self.forward_native
@@ -196,8 +184,8 @@ class MulAndSilu(CustomOp):
         self.op(out, x)
         return out
-    # TODO implement forward_xpu for MulAndSilu
-    # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        return self.forward_cuda(x)
 # --8<-- [start:gelu_and_mul_sparse]
@@ -278,7 +266,11 @@ class GeluAndMul(CustomOp):
         self.approximate = approximate
         if approximate not in ("none", "tanh"):
             raise ValueError(f"Unknown approximate mode: {approximate}")
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if (
+            current_platform.is_cuda_alike()
+            or current_platform.is_cpu()
+            or current_platform.is_xpu()
+        ):
             if approximate == "none":
                 self.op = torch.ops._C.gelu_and_mul
             elif approximate == "tanh":
@@ -289,13 +281,6 @@ class GeluAndMul(CustomOp):
                     "with torch.compile. For native implementation, fallback to 'none' "
                     "approximation. The custom kernel implementation is unaffected."
                 )
-        elif current_platform.is_xpu():
-            from vllm._ipex_ops import ipex_ops
-            if approximate == "none":
-                self.op = ipex_ops.gelu_and_mul
-            else:
-                self.op = ipex_ops.gelu_tanh_and_mul
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
@@ -314,11 +299,7 @@ class GeluAndMul(CustomOp):
         return out
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        d = x.shape[-1] // 2
-        output_shape = x.shape[:-1] + (d,)
-        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        self.op(out, x)
-        return out
+        return self.forward_cuda(x)
     def extra_repr(self) -> str:
         return f"approximate={repr(self.approximate)}"
@@ -401,12 +382,12 @@ class NewGELU(CustomOp):
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if (
+            current_platform.is_cuda_alike()
+            or current_platform.is_cpu()
+            or current_platform.is_xpu()
+        ):
             self.op = torch.ops._C.gelu_new
-        elif current_platform.is_xpu():
-            from vllm._ipex_ops import ipex_ops
-            self.op = ipex_ops.gelu_new
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
@@ -419,7 +400,7 @@ class NewGELU(CustomOp):
         return out
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        return self.op(x)
+        return self.forward_cuda(x)
 # --8<-- [start:gelu_fast]
@@ -429,12 +410,12 @@ class FastGELU(CustomOp):
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if (
+            current_platform.is_cuda_alike()
+            or current_platform.is_cpu()
+            or current_platform.is_xpu()
+        ):
             self.op = torch.ops._C.gelu_fast
-        elif current_platform.is_xpu():
-            from vllm._ipex_ops import ipex_ops
-            self.op = ipex_ops.gelu_fast
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
@@ -446,7 +427,7 @@ class FastGELU(CustomOp):
         return out
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        return self.op(x)
+        return self.forward_cuda(x)
 # --8<-- [start:quick_gelu]
@@ -457,12 +438,12 @@ class QuickGELU(CustomOp):
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if (
+            current_platform.is_cuda_alike()
+            or current_platform.is_cpu()
+            or current_platform.is_xpu()
+        ):
             self.op = torch.ops._C.gelu_quick
-        elif current_platform.is_xpu():
-            from vllm._ipex_ops import ipex_ops
-            self.op = ipex_ops.gelu_quick
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
@@ -474,12 +455,7 @@ class QuickGELU(CustomOp):
         return out
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        out = torch.empty_like(x)
-        self.op(out, x)
-        return out
-    # TODO implement forward_xpu for QuickGELU
-    # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        return self.forward_cuda(x)
 # --8<-- [start:relu2]
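The call-site API is untouched by these hunks; only the backing kernels move from IPEX to vllm-xpu-kernels. A hypothetical smoke test on an XPU build (device string, dtype, and shapes chosen purely for illustration):

import torch

from vllm.model_executor.layers.activation import SiluAndMul

layer = SiluAndMul()
x = torch.randn(2, 8, dtype=torch.float16, device="xpu")
# CustomOp dispatch routes this through forward_xpu -> forward_cuda,
# which calls the torch.ops._C.silu_and_mul kernel now provided by vllm-xpu-kernels.
y = layer(x)
assert y.shape == (2, 4)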