Rename servers and change port numbers to reduce confusion (#149)

Authored by Zhuohan Li, committed by GitHub, 2023-06-17 00:13:02 +08:00
parent 311490a720
commit eedb46bf03
10 changed files with 41 additions and 37 deletions


@@ -6,7 +6,7 @@ from cacheflow.logger import init_logger
 from cacheflow.outputs import RequestOutput
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import AsyncServerArgs
-from cacheflow.server.llm_server import LLMServer
+from cacheflow.server.llm_server import LLMEngine
 from cacheflow.server.ray_utils import ray, initialize_cluster
 logger = init_logger(__name__)
@@ -14,26 +14,26 @@ logger = init_logger(__name__)
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
-class AsyncLLMServer:
-    """An asynchronous wrapper for LLMServer.
+class AsyncLLMEngine:
+    """An asynchronous wrapper for LLMEngine.
-    This class is used to wrap the LLMServer class to make it asynchronous. It
+    This class is used to wrap the LLMEngine class to make it asynchronous. It
     uses asyncio to create a background loop that keeps processing incoming
-    requests. The LLMServer is kicked by the generate method when there
+    requests. The LLMEngine is kicked by the generate method when there
     are requests in the waiting queue. The generate method yields the outputs
-    from the LLMServer to the caller.
+    from the LLMEngine to the caller.
-    NOTE: For the comprehensive list of arguments, see `LLMServer`.
+    NOTE: For the comprehensive list of arguments, see `LLMEngine`.
     Args:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
             `parallel_config.worker_use_ray`.
-        server_use_ray: Whether to make LLMServer a Ray actor. If so, the
+        server_use_ray: Whether to make LLMEngine a Ray actor. If so, the
             async frontend will be executed in a separate process as the
             model workers.
         log_requests: Whether to log the requests.
-        *args, *kwargs: Arguments for LLMServer.
+        *args, *kwargs: Arguments for LLMEngine.
     """
     def __init__(self, worker_use_ray: bool, server_use_ray: bool,
                  log_requests: bool = True, *args, **kwargs) -> None:
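For readers skimming the rename, the docstring above describes a pattern rather than showing it: a synchronous engine driven by a background asyncio loop, with generate() streaming results back to the caller. The toy sketch below illustrates only that pattern; the names ToyEngine, ToyAsyncEngine, step(), and queue are invented for illustration and are not part of the cacheflow API touched by this commit.

import asyncio
from typing import AsyncIterator

class ToyEngine:
    """Stand-in for a synchronous engine that is advanced one step at a time."""

    def __init__(self) -> None:
        self.queue: list[str] = []

    def step(self) -> list[str]:
        # Process one scheduling iteration and return any newly finished text.
        done, self.queue = self.queue[:1], self.queue[1:]
        return done

class ToyAsyncEngine:
    """Stand-in for the async wrapper: kicks the engine and yields its outputs."""

    def __init__(self) -> None:
        self.engine = ToyEngine()

    async def generate(self, prompt: str) -> AsyncIterator[str]:
        self.engine.queue.append(prompt)
        while True:
            outputs = self.engine.step()  # "kick" the engine, as in the docstring
            for out in outputs:
                yield out
            if not self.engine.queue:
                break
            await asyncio.sleep(0)  # let other coroutines run between steps

async def main() -> None:
    async for text in ToyAsyncEngine().generate("hello"):
        print(text)

asyncio.run(main())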
@@ -41,11 +41,11 @@ class AsyncLLMServer:
         self.server_use_ray = server_use_ray
         self.log_requests = log_requests
         if not self.server_use_ray:
-            server_class = LLMServer
+            server_class = LLMEngine
         elif self.worker_use_ray:
-            server_class = ray.remote(num_cpus=0)(LLMServer).remote
+            server_class = ray.remote(num_cpus=0)(LLMEngine).remote
         else:
-            server_class = ray.remote(num_gpus=1)(LLMServer).remote
+            server_class = ray.remote(num_gpus=1)(LLMEngine).remote
         self.server = server_class(*args, **kwargs)
         # Request id -> request output.
         self.request_outputs: Dict[str, RequestOutput] = {}
@@ -85,8 +85,8 @@ class AsyncLLMServer:
         """Generate outputs for a request.
         Generate outputs for a request. This method is a coroutine. It adds the
-        request into the waiting queue of the LLMServer and streams the outputs
-        from the LLMServer to the caller.
+        request into the waiting queue of the LLMEngine and streams the outputs
+        from the LLMEngine to the caller.
         Args:
             prompt: The prompt string. Can be None if prompt_token_ids is
@@ -97,7 +97,7 @@ class AsyncLLMServer:
                 use the tokenizer to convert the prompts to token IDs.
         Yields:
-            The output `RequestOutput` objects from the LLMServer for the
+            The output `RequestOutput` objects from the LLMEngine for the
             request.
         """
         # Preprocess the request.
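As a hedged illustration of the streaming behaviour this docstring describes, a caller would typically consume generate() with async for, roughly as below. The exact generate() parameter order and the RequestOutput/CompletionOutput field names are assumptions based on the imports in this file, not confirmed by the diff.

from cacheflow.sampling_params import SamplingParams

async def stream_one_request(engine, prompt: str, request_id: str) -> None:
    # `engine` is assumed to be an already constructed AsyncLLMEngine.
    sampling_params = SamplingParams()  # assumed library defaults
    # Each yielded RequestOutput is assumed to carry the text generated so far.
    async for request_output in engine.generate(prompt, sampling_params, request_id):
        for completion in request_output.outputs:
            print(completion.text)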
@@ -200,7 +200,7 @@ class AsyncLLMServer:
         self.kicking_request_id = None
     @classmethod
-    def from_server_args(cls, server_args: AsyncServerArgs) -> "AsyncLLMServer":
+    def from_server_args(cls, server_args: AsyncServerArgs) -> "AsyncLLMEngine":
         """Creates an async LLM server from the server arguments."""
         # Create the server configs.
         server_configs = server_args.create_server_configs()
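A short usage sketch of the renamed classmethod follows; the AsyncServerArgs `model` field, the model name, and the module path are assumptions for illustration and are not shown in this hunk.

from cacheflow.server.arg_utils import AsyncServerArgs
from cacheflow.server.async_llm_server import AsyncLLMEngine  # module path assumed

# Build the async engine from parsed server arguments, as the classmethod
# above does internally via create_server_configs().
server_args = AsyncServerArgs(model="facebook/opt-125m")  # hypothetical model
engine = AsyncLLMEngine.from_server_args(server_args)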