[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257)
This commit is contained in:
@@ -5,7 +5,8 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
|
||||
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
|
||||
SchedulerConfig)
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
|
||||
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
|
||||
ResultHandler, WorkerMonitor)
|
||||
@@ -60,6 +61,8 @@ class CPUExecutor(ExecutorBase):
|
||||
self.cache_config = _verify_and_get_cache_config(self.cache_config)
|
||||
self.scheduler_config = _verify_and_get_scheduler_config(
|
||||
self.scheduler_config)
|
||||
self.parallel_config = _verify_and_get_parallel_config(
|
||||
self.parallel_config)
|
||||
|
||||
# Multiprocessing-based executor does not support multi-node setting.
|
||||
# Since it only works for single node, we can use the loopback address
|
||||
@@ -359,6 +362,16 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
|
||||
return config
|
||||
|
||||
|
||||
def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
    """Force the distributed executor backend to ``"mp"`` for the CPU executor.

    When ``config.distributed_executor_backend`` is set to anything other
    than ``"mp"``, a warning is logged and the field is overwritten with
    ``"mp"``. A value of ``None`` or ``"mp"`` is left unchanged.

    Args:
        config: Parallel configuration to check; it is modified in place.

    Returns:
        The same ``config`` object that was passed in.
    """
    if (config.distributed_executor_backend is not None
            and config.distributed_executor_backend != "mp"):
        # Warn before overwriting so the log shows the rejected backend name.
        logger.warning(
            "%s is not supported on CPU, fallback to mp distributed executor "
            "backend.", config.distributed_executor_backend)
        config.distributed_executor_backend = "mp"
    return config
|
||||
|
||||
|
||||
def _driver_method_invoker(driver, method: str, *args, **kwargs):
|
||||
return getattr(driver, method)(*args, **kwargs)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user