[Hardware][Intel GPU] Add Intel GPU(XPU) inference backend (#3814)

Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Abhilash Majumder <abhilash.majumder@intel.com> Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-06-18 02:01:25 +08:00
parent 1f12122b17
commit 728c4c8a06
31 changed files with 1998 additions and 24 deletions
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -7,7 +7,7 @@ import torch
 import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_cpu, is_hip, is_tpu
+from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu

 logger = init_logger(__name__)

@@ -19,6 +19,7 @@ class _Backend(enum.Enum):
    TORCH_SDPA = enum.auto()
    FLASHINFER = enum.auto()
    PALLAS = enum.auto()
+    IPEX = enum.auto()


@lru_cache(maxsize=None)
@@ -58,12 +59,17 @@ def get_attn_backend(
            ROCmFlashAttentionBackend)
        return ROCmFlashAttentionBackend
    elif backend == _Backend.TORCH_SDPA:
-        # TODO: make XPU backend available here.
        assert is_cpu(), RuntimeError(
            "Torch SDPA backend is only used for the CPU device.")
        logger.info("Using Torch SDPA backend.")
        from vllm.attention.backends.torch_sdpa import TorchSDPABackend
        return TorchSDPABackend
+    elif backend == _Backend.IPEX:
+        assert is_xpu(), RuntimeError(
+            "IPEX attention backend is only used for the XPU device.")
+        logger.info("Using IPEX attention backend.")
+        from vllm.attention.backends.ipex_attn import IpexAttnBackend
+        return IpexAttnBackend
    elif backend == _Backend.FLASHINFER:
        logger.info("Using Flashinfer backend.")
        logger.warning("Eager mode is required for the Flashinfer backend. "
@@ -107,6 +113,11 @@ def which_attn_to_use(
            logger.info("Cannot use %s backend on CPU.", selected_backend)
        return _Backend.TORCH_SDPA

+    if is_xpu():
+        if selected_backend != _Backend.IPEX:
+            logger.info("Cannot use %s backend on XPU.", selected_backend)
+        return _Backend.IPEX
+
    if is_tpu():
        if selected_backend != _Backend.PALLAS:
            logger.info("Cannot use %s backend on TPU.", selected_backend)