[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257)

This commit is contained in:
Li, Jiang
2024-09-12 00:46:46 +08:00
committed by GitHub
parent 3b7fea770f
commit 0b952af458
18 changed files with 686 additions and 43 deletions

View File

@@ -42,6 +42,13 @@ try:
except Exception:
pass
# Detect the CPU-only build of vllm: its installed package version string
# contains a "cpu" tag. Any failure (package absent, metadata unreadable)
# deliberately leaves the flag False — best-effort probe, never fatal.
is_cpu = False
try:
    import importlib.metadata
    is_cpu = "cpu" in importlib.metadata.version("vllm")
except Exception:
    pass
if is_tpu:
# people might install pytorch built with cuda but run on tpu
# so we need to check tpu first
@@ -53,6 +60,9 @@ elif is_cuda:
elif is_rocm:
from .rocm import RocmPlatform
current_platform = RocmPlatform()
elif is_cpu:
from .cpu import CpuPlatform
current_platform = CpuPlatform()
else:
current_platform = UnspecifiedPlatform()

vllm/platforms/cpu.py — new file, 15 lines
View File

@@ -0,0 +1,15 @@
import torch
from .interface import Platform, PlatformEnum
class CpuPlatform(Platform):
    """Platform backend describing a CPU-only vLLM deployment."""

    _enum = PlatformEnum.CPU

    @staticmethod
    def get_device_name(device_id: int = 0) -> str:
        """Return the device name; every CPU device reports "cpu"."""
        return "cpu"

    @staticmethod
    def inference_mode():
        """Return the context manager that disables autograd for inference."""
        return torch.no_grad()

View File

@@ -1,5 +1,5 @@
import enum
from typing import Tuple
from typing import Optional, Tuple
import torch
@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
CUDA = enum.auto()
ROCM = enum.auto()
TPU = enum.auto()
CPU = enum.auto()
UNSPECIFIED = enum.auto()
@@ -23,9 +24,12 @@ class Platform:
def is_tpu(self) -> bool:
return self._enum == PlatformEnum.TPU
def is_cpu(self) -> bool:
return self._enum == PlatformEnum.CPU
@staticmethod
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
raise NotImplementedError
def get_device_capability(device_id: int = 0) -> Optional[Tuple[int, int]]:
return None
@staticmethod
def get_device_name(device_id: int = 0) -> str:

View File

@@ -1,5 +1,3 @@
from typing import Tuple
import torch
from .interface import Platform, PlatformEnum
@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum
class TpuPlatform(Platform):
_enum = PlatformEnum.TPU
@staticmethod
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
raise RuntimeError("TPU does not have device capability.")
@staticmethod
def inference_mode():
return torch.no_grad()