[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257)

This commit is contained in:
Li, Jiang
2024-09-12 00:46:46 +08:00
committed by GitHub
parent 3b7fea770f
commit 0b952af458
18 changed files with 686 additions and 43 deletions

View File

@@ -5,7 +5,8 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
import torch
import vllm.envs as envs
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor)
@@ -60,6 +61,8 @@ class CPUExecutor(ExecutorBase):
self.cache_config = _verify_and_get_cache_config(self.cache_config)
self.scheduler_config = _verify_and_get_scheduler_config(
self.scheduler_config)
self.parallel_config = _verify_and_get_parallel_config(
self.parallel_config)
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
@@ -359,6 +362,16 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
return config
def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
    """Coerce the parallel config to the mp distributed executor backend.

    The CPU executor only supports the multiprocessing backend; any other
    explicitly-requested backend is overridden with a warning.
    """
    backend = config.distributed_executor_backend
    # None means "unset" and is left alone; anything other than "mp" is
    # unsupported on CPU and gets replaced.
    if backend not in (None, "mp"):
        logger.warning(
            "%s is not supported on CPU, fallback to mp distributed executor "
            "backend.", backend)
        config.distributed_executor_backend = "mp"
    return config
def _driver_method_invoker(driver, method: str, *args, **kwargs):
return getattr(driver, method)(*args, **kwargs)