[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257)

This commit is contained in:
Li, Jiang
2024-09-12 00:46:46 +08:00
committed by GitHub
parent 3b7fea770f
commit 0b952af458
18 changed files with 686 additions and 43 deletions

View File

@@ -42,6 +42,13 @@ try:
except Exception:
pass
# Detect the CPU-only build of vllm: its installed package version string
# contains a "cpu" tag. Any failure (package absent, metadata unreadable)
# deliberately leaves the flag False — best-effort probe, never fatal.
is_cpu = False
try:
    import importlib.metadata
    is_cpu = "cpu" in importlib.metadata.version("vllm")
except Exception:
    pass
if is_tpu:
# people might install pytorch built with cuda but run on tpu
# so we need to check tpu first
@@ -53,6 +60,9 @@ elif is_cuda:
elif is_rocm:
from .rocm import RocmPlatform
current_platform = RocmPlatform()
elif is_cpu:
from .cpu import CpuPlatform
current_platform = CpuPlatform()
else:
current_platform = UnspecifiedPlatform()

vllm/platforms/cpu.py — new file, 15 lines
View File

@@ -0,0 +1,15 @@
import torch
from .interface import Platform, PlatformEnum
class CpuPlatform(Platform):
    """Platform backend describing a CPU-only vLLM deployment."""

    _enum = PlatformEnum.CPU

    @staticmethod
    def get_device_name(device_id: int = 0) -> str:
        """Return the device name; every CPU device reports "cpu"."""
        return "cpu"

    @staticmethod
    def inference_mode():
        """Return the context manager that disables autograd for inference."""
        return torch.no_grad()

View File

@@ -1,5 +1,5 @@
import enum
from typing import Tuple
from typing import Optional, Tuple
import torch
@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
CUDA = enum.auto()
ROCM = enum.auto()
TPU = enum.auto()
CPU = enum.auto()
UNSPECIFIED = enum.auto()
@@ -23,9 +24,12 @@ class Platform:
def is_tpu(self) -> bool:
return self._enum == PlatformEnum.TPU
def is_cpu(self) -> bool:
return self._enum == PlatformEnum.CPU
@staticmethod
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
raise NotImplementedError
def get_device_capability(device_id: int = 0) -> Optional[Tuple[int, int]]:
return None
@staticmethod
def get_device_name(device_id: int = 0) -> str:

View File

@@ -1,5 +1,3 @@
from typing import Tuple
import torch
from .interface import Platform, PlatformEnum
@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum
class TpuPlatform(Platform):
_enum = PlatformEnum.TPU
@staticmethod
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
raise RuntimeError("TPU does not have device capability.")
@staticmethod
def inference_mode():
return torch.no_grad()