[core] platform agnostic executor via collective_rpc (#11256)
Signed-off-by: youkaichao <youkaichao@gmail.com>
@@ -18,9 +18,7 @@ from vllm.engine.async_timeout import asyncio_timeout
 from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
 from vllm.engine.metrics_types import StatLoggerBase
 from vllm.engine.protocol import EngineClient
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutorAsync
-from vllm.executor.ray_utils import initialize_ray_cluster
+from vllm.executor.executor_base import ExecutorBase
 from vllm.inputs import PromptType
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
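
The hunk above is the whole point of the import change: with executor resolution shared between the sync and async engines, the async side only needs the platform-agnostic ExecutorBase for typing, so the device-specific ExecutorAsyncBase and GPUExecutorAsync imports can go. A minimal sketch of the resulting single annotation, with stand-in classes (resolve_executor is a hypothetical name, not vLLM's API):

    # Minimal sketch; ExecutorBase here mimics the class from
    # vllm.executor.executor_base, and resolve_executor is illustrative.
    from typing import Type

    class ExecutorBase:
        """One base class now serves both sync and async engines."""

    def resolve_executor(engine_config: object) -> Type[ExecutorBase]:
        # The async engine used to need Type[ExecutorAsyncBase] here;
        # after this commit one annotation covers both code paths.
        return ExecutorBase

    executor_cls: Type[ExecutorBase] = resolve_executor(object())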
@@ -620,69 +618,9 @@ class AsyncLLMEngine(EngineClient):
         rt.new_requests_event.set()
 
     @classmethod
-    def _get_executor_cls(
-            cls, engine_config: VllmConfig) -> Type[ExecutorAsyncBase]:
-        distributed_executor_backend = (
-            engine_config.parallel_config.distributed_executor_backend)
-        if isinstance(distributed_executor_backend, type):
-            if not issubclass(distributed_executor_backend, ExecutorAsyncBase):
-                raise TypeError(
-                    "distributed_executor_backend must be a subclass of "
-                    f"ExecutorAsyncBase. Got {distributed_executor_backend}.")
-            executor_class = distributed_executor_backend
-        elif engine_config.device_config.device_type == "neuron":
-            from vllm.executor.neuron_executor import NeuronExecutorAsync
-            executor_class = NeuronExecutorAsync
-        elif engine_config.device_config.device_type == "tpu":
-            if distributed_executor_backend == "ray":
-                from vllm.executor.ray_tpu_executor import RayTPUExecutorAsync
-                executor_class = RayTPUExecutorAsync
-            else:
-                assert distributed_executor_backend is None
-                from vllm.executor.tpu_executor import TPUExecutorAsync
-                executor_class = TPUExecutorAsync
-        elif engine_config.device_config.device_type == "cpu":
-            from vllm.executor.cpu_executor import CPUExecutorAsync
-            executor_class = CPUExecutorAsync
-        elif engine_config.device_config.device_type == "hpu":
-            if distributed_executor_backend == "ray":
-                initialize_ray_cluster(engine_config.parallel_config)
-                from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync
-                executor_class = RayHPUExecutorAsync
-            else:
-                from vllm.executor.hpu_executor import HPUExecutorAsync
-                executor_class = HPUExecutorAsync
-        elif engine_config.device_config.device_type == "openvino":
-            assert distributed_executor_backend is None, (
-                "Distributed execution is not supported with "
-                "the OpenVINO backend.")
-            from vllm.executor.openvino_executor import OpenVINOExecutorAsync
-            executor_class = OpenVINOExecutorAsync
-        elif engine_config.device_config.device_type == "xpu":
-            if distributed_executor_backend is None:
-                from vllm.executor.xpu_executor import XPUExecutorAsync
-                executor_class = XPUExecutorAsync
-            elif distributed_executor_backend == "ray":
-                from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync
-                executor_class = RayXPUExecutorAsync
-            elif distributed_executor_backend == "mp":
-                from vllm.executor.multiproc_xpu_executor import (
-                    MultiprocessingXPUExecutorAsync)
-                executor_class = MultiprocessingXPUExecutorAsync
-            else:
-                raise RuntimeError(
-                    "Not supported distributed execution model on XPU device.")
-        elif distributed_executor_backend == "ray":
-            from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
-            executor_class = RayGPUExecutorAsync
-        elif distributed_executor_backend == "mp":
-            from vllm.executor.multiproc_gpu_executor import (
-                MultiprocessingGPUExecutorAsync)
-            executor_class = MultiprocessingGPUExecutorAsync
-        else:
-            from vllm.executor.gpu_executor import GPUExecutorAsync
-            executor_class = GPUExecutorAsync
-        return executor_class
+    def _get_executor_cls(cls,
+                          engine_config: VllmConfig) -> Type[ExecutorBase]:
+        return LLMEngine._get_executor_cls(engine_config)
 
     @classmethod
     def from_engine_args(
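
Worth noting about the 60-odd lines deleted above: besides the per-device branches, the old body also accepted a user-supplied executor class through distributed_executor_backend (the isinstance/issubclass check at the top). That path presumably survives inside the shared LLMEngine._get_executor_cls that the new three-line method delegates to. A hedged sketch of that surviving behavior, with illustrative stubs in place of vLLM's config objects:

    # Stand-ins only: in vLLM these come from VllmConfig/ParallelConfig.
    from typing import Type

    class ExecutorBase:
        """Mimics vllm.executor.executor_base.ExecutorBase."""

    class MyExecutor(ExecutorBase):
        """A user-supplied custom executor."""

    class ParallelConfig:
        # The deleted code read exactly this attribute, so passing a
        # class was already supported; only the resolution site moved.
        distributed_executor_backend: object = MyExecutor

    def get_executor_cls(parallel_config: ParallelConfig) -> Type[ExecutorBase]:
        backend = parallel_config.distributed_executor_backend
        if isinstance(backend, type):
            if not issubclass(backend, ExecutorBase):
                raise TypeError("distributed_executor_backend must be a "
                                f"subclass of ExecutorBase. Got {backend}.")
            return backend
        raise NotImplementedError("string backends elided in this sketch")

    assert get_executor_cls(ParallelConfig()) is MyExecutor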
@@ -700,9 +638,6 @@ class AsyncLLMEngine(EngineClient):
 
         executor_class = cls._get_executor_cls(engine_config)
 
-        if executor_class.uses_ray:
-            initialize_ray_cluster(engine_config.parallel_config)
-
         # Create the async LLM engine.
         engine = cls(
             vllm_config=engine_config,
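
The uses_ray gate deleted above still has to run somewhere before a Ray-backed executor starts; the sketch below assumes (without verifying against the rest of the commit) that it was consolidated next to executor resolution rather than repeated in every from_engine_args:

    # Assumption: ray setup is gated once, beside executor resolution,
    # instead of in each engine's from_engine_args.
    class ExecutorBase:
        uses_ray: bool = False

    class RayExecutor(ExecutorBase):
        uses_ray = True

    def initialize_ray_cluster(parallel_config: object) -> None:
        print("ray cluster initialized")  # placeholder for the real call

    def prepare_executor(executor_cls: type, parallel_config: object) -> type:
        # One gate replaces the per-engine `if executor_class.uses_ray:` block.
        if executor_cls.uses_ray:
            initialize_ray_cluster(parallel_config)
        return executor_cls

    prepare_executor(RayExecutor, object())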
@@ -1242,23 +1177,12 @@ class AsyncLLMEngine(EngineClient):
         self.engine.remove_logger(logger_name=logger_name)
 
     async def start_profile(self) -> None:
-        # using type instead of isinstance to check to avoid capturing
-        # inherited classes
-        if type(self.engine.model_executor) == GPUExecutorAsync:  # noqa: E721
-            self.engine.model_executor.start_profile()
-        else:
-            self.engine.model_executor._run_workers("start_profile")
+        self.engine.start_profile()
 
     async def stop_profile(self) -> None:
-        # using type instead of isinstance to check to avoid capturing
-        # inherited classes
-        if type(self.engine.model_executor) == GPUExecutorAsync:  # noqa: E721
-            self.engine.model_executor.stop_profile()
-        else:
-            self.engine.model_executor._run_workers("stop_profile")
+        self.engine.stop_profile()
 
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
         self.engine.add_lora(lora_request)
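
The profiling hunk above shows the pattern the commit title names: instead of type-checking for GPUExecutorAsync and falling back to _run_workers, the engine calls one executor entry point and lets it fan the call out to every worker. A self-contained sketch of that collective_rpc shape (Worker and UniExecutor are illustrative stand-ins, not vLLM's classes):

    from typing import Any, List

    class Worker:
        def start_profile(self) -> None:
            print("worker: profiler started")

        def stop_profile(self) -> None:
            print("worker: profiler stopped")

    class UniExecutor:
        def __init__(self, workers: List[Worker]) -> None:
            self.workers = workers

        def collective_rpc(self, method: str, *args: Any,
                           **kwargs: Any) -> List[Any]:
            # Invoke the named method on every worker; callers no longer
            # special-case GPUExecutorAsync vs. _run_workers.
            return [getattr(w, method)(*args, **kwargs) for w in self.workers]

        def start_profile(self) -> None:
            self.collective_rpc("start_profile")

        def stop_profile(self) -> None:
            self.collective_rpc("stop_profile")

    executor = UniExecutor([Worker(), Worker()])
    executor.start_profile()
    executor.stop_profile()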