[Core] Allow specifying custom Executor (#6557)
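Extracts executor-class selection out of AsyncLLMEngine.from_engine_args into a new _get_executor_cls classmethod, and allows distributed_executor_backend to be an ExecutorAsyncBase subclass rather than only a string. A user-supplied class is validated with issubclass, Ray is initialized when the class reports uses_ray, and the engine constructor now receives executor_class.uses_ray in place of the old distributed_executor_backend == "ray" check.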
@@ -7,12 +7,13 @@ from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
 from transformers import PreTrainedTokenizer
 
 import vllm.envs as envs
-from vllm.config import DecodingConfig, ModelConfig
+from vllm.config import DecodingConfig, EngineConfig, ModelConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.metrics import StatLoggerBase
+from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.ray_utils import initialize_ray_cluster, ray
 from vllm.inputs import LLMInputs, PromptInputs
 from vllm.logger import init_logger
@@ -385,25 +386,19 @@ class AsyncLLMEngine:
         self._request_tracker: RequestTracker
 
     @classmethod
-    def from_engine_args(
-        cls,
-        engine_args: AsyncEngineArgs,
-        start_engine_loop: bool = True,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
-    ) -> "AsyncLLMEngine":
-        """Creates an async LLM engine from the engine arguments."""
-        # Create the engine configs.
-        engine_config = engine_args.create_engine_config()
-
-        if engine_args.engine_use_ray:
-            from vllm.executor import ray_utils
-            ray_utils.assert_ray_available()
-
+    def _get_executor_cls(
+            cls, engine_config: EngineConfig) -> Type[ExecutorAsyncBase]:
         distributed_executor_backend = (
             engine_config.parallel_config.distributed_executor_backend)
-
-        if engine_config.device_config.device_type == "neuron":
+        if isinstance(distributed_executor_backend, type):
+            if not issubclass(distributed_executor_backend, ExecutorAsyncBase):
+                raise TypeError(
+                    "distributed_executor_backend must be a subclass of "
+                    f"ExecutorAsyncBase. Got {distributed_executor_backend}.")
+            if distributed_executor_backend.uses_ray:  # type: ignore
+                initialize_ray_cluster(engine_config.parallel_config)
+            executor_class = distributed_executor_backend
+        elif engine_config.device_config.device_type == "neuron":
             from vllm.executor.neuron_executor import NeuronExecutorAsync
             executor_class = NeuronExecutorAsync
         elif engine_config.device_config.device_type == "tpu":
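The isinstance(distributed_executor_backend, type) branch above is what lets the backend be a class rather than only a string. A minimal sketch of such a class, assuming GPUExecutorAsync (referenced later in this diff) as the base; the name CustomGPUExecutorAsync is hypothetical:

# Hedged sketch of a custom executor (hypothetical name).
# GPUExecutorAsync already subclasses ExecutorAsyncBase, so the
# issubclass() check added in _get_executor_cls passes, and its
# uses_ray attribute is what from_engine_args forwards to the engine.
from vllm.executor.gpu_executor import GPUExecutorAsync


class CustomGPUExecutorAsync(GPUExecutorAsync):
    """Reuses all GPUExecutorAsync behavior unchanged."""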
@@ -442,9 +437,29 @@ class AsyncLLMEngine:
         else:
             from vllm.executor.gpu_executor import GPUExecutorAsync
             executor_class = GPUExecutorAsync
+        return executor_class
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+    ) -> "AsyncLLMEngine":
+        """Creates an async LLM engine from the engine arguments."""
+        # Create the engine configs.
+        engine_config = engine_args.create_engine_config()
+
+        if engine_args.engine_use_ray:
+            from vllm.executor import ray_utils
+            ray_utils.assert_ray_available()
+
+        executor_class = cls._get_executor_cls(engine_config)
+
         # Create the async LLM engine.
         engine = cls(
-            distributed_executor_backend == "ray",
+            executor_class.uses_ray,
             engine_args.engine_use_ray,
             **engine_config.to_dict(),
             executor_class=executor_class,
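End to end, the refactored from_engine_args path could then be exercised as below; a sketch only, assuming the hypothetical CustomGPUExecutorAsync above and an arbitrary example model:

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# distributed_executor_backend now accepts a Type[ExecutorAsyncBase];
# _get_executor_cls validates it, and from_engine_args passes
# executor_class.uses_ray into the engine constructor.
engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",  # assumed example model
    distributed_executor_backend=CustomGPUExecutorAsync,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)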