[Speculative decoding] Adding configuration object for speculative decoding (#3706)
Co-authored-by: Lily Liu <lilyliupku@gmail.com>
This commit is contained in:
@@ -328,28 +328,27 @@ class AsyncLLMEngine:
|
||||
) -> "AsyncLLMEngine":
|
||||
"""Creates an async LLM engine from the engine arguments."""
|
||||
# Create the engine configs.
|
||||
engine_configs = engine_args.create_engine_configs()
|
||||
parallel_config = engine_configs[2]
|
||||
device_config = engine_configs[4]
|
||||
engine_config = engine_args.create_engine_config()
|
||||
|
||||
if device_config.device_type == "neuron":
|
||||
if engine_config.device_config.device_type == "neuron":
|
||||
raise NotImplementedError("Neuron is not supported for "
|
||||
"async engine yet.")
|
||||
elif parallel_config.worker_use_ray or engine_args.engine_use_ray:
|
||||
initialize_ray_cluster(parallel_config)
|
||||
elif (engine_config.parallel_config.worker_use_ray
|
||||
or engine_args.engine_use_ray):
|
||||
initialize_ray_cluster(engine_config.parallel_config)
|
||||
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
|
||||
executor_class = RayGPUExecutorAsync
|
||||
else:
|
||||
assert parallel_config.world_size == 1, (
|
||||
assert engine_config.parallel_config.world_size == 1, (
|
||||
"Ray is required if parallel_config.world_size > 1.")
|
||||
from vllm.executor.gpu_executor import GPUExecutorAsync
|
||||
executor_class = GPUExecutorAsync
|
||||
# Create the async LLM engine.
|
||||
engine = cls(
|
||||
parallel_config.worker_use_ray,
|
||||
engine_config.parallel_config.worker_use_ray,
|
||||
engine_args.engine_use_ray,
|
||||
*engine_configs,
|
||||
executor_class,
|
||||
**engine_config.to_dict(),
|
||||
executor_class=executor_class,
|
||||
log_requests=not engine_args.disable_log_requests,
|
||||
log_stats=not engine_args.disable_log_stats,
|
||||
max_log_len=engine_args.max_log_len,
|
||||
|
||||
Reference in New Issue
Block a user