[Hardware][Intel GPU] Add Intel GPU(XPU) inference backend (#3814)

Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Abhilash Majumder <abhilash.majumder@intel.com> Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-06-18 02:01:25 +08:00
parent 1f12122b17
commit 728c4c8a06
31 changed files with 1998 additions and 24 deletions
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -37,6 +37,15 @@ class SiluAndMul(CustomOp):
        ops.silu_and_mul(out, x)
        return out

+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+

 class GeluAndMul(CustomOp):
    """An activation function for GeGLU.
@@ -71,6 +80,18 @@ class GeluAndMul(CustomOp):
            ops.gelu_tanh_and_mul(out, x)
        return out

+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        if self.approximate == "none":
+            ops.gelu_and_mul(out, x)
+        elif self.approximate == "tanh":
+            ops.gelu_tanh_and_mul(out, x)
+        return out
+
    def extra_repr(self) -> str:
        return f'approximate={repr(self.approximate)}'

@@ -90,6 +111,13 @@ class NewGELU(CustomOp):
        ops.gelu_new(out, x)
        return out

+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+

 class FastGELU(CustomOp):

@@ -105,6 +133,13 @@ class FastGELU(CustomOp):
        ops.gelu_fast(out, x)
        return out

+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+

 class ScaledActivation(nn.Module):
    """An activation function with post-scale parameters.