[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257)

This commit is contained in:
Li, Jiang
2024-09-12 00:46:46 +08:00
committed by GitHub
parent 3b7fea770f
commit 0b952af458
18 changed files with 686 additions and 43 deletions

View File

@@ -5,7 +5,8 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
import torch
import vllm.envs as envs
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor)
@@ -60,6 +61,8 @@ class CPUExecutor(ExecutorBase):
self.cache_config = _verify_and_get_cache_config(self.cache_config)
self.scheduler_config = _verify_and_get_scheduler_config(
self.scheduler_config)
self.parallel_config = _verify_and_get_parallel_config(
self.parallel_config)
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
@@ -359,6 +362,16 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
return config
def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
    """Coerce the parallel config to the mp distributed executor backend.

    The CPU executor only supports the multiprocessing backend; any other
    explicitly-requested backend is overridden with a warning.
    """
    backend = config.distributed_executor_backend
    # None means "unset" and is left alone; anything other than "mp" is
    # unsupported on CPU and gets replaced.
    if backend not in (None, "mp"):
        logger.warning(
            "%s is not supported on CPU, fallback to mp distributed executor "
            "backend.", backend)
        config.distributed_executor_backend = "mp"
    return config
def _driver_method_invoker(driver, method: str, *args, **kwargs):
return getattr(driver, method)(*args, **kwargs)