Rename servers and change port numbers to reduce confusion (#149)

Authored by Zhuohan Li, committed by GitHub, 2023-06-17 00:13:02 +08:00
parent 311490a720
commit eedb46bf03
10 changed files with 41 additions and 37 deletions


@@ -6,7 +6,7 @@ from cacheflow.logger import init_logger
 from cacheflow.outputs import RequestOutput
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import AsyncServerArgs
-from cacheflow.server.llm_server import LLMServer
+from cacheflow.server.llm_server import LLMEngine
 from cacheflow.server.ray_utils import ray, initialize_cluster
 logger = init_logger(__name__)
@@ -14,26 +14,26 @@ logger = init_logger(__name__)
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
-class AsyncLLMServer:
-    """An asynchronous wrapper for LLMServer.
+class AsyncLLMEngine:
+    """An asynchronous wrapper for LLMEngine.
-    This class is used to wrap the LLMServer class to make it asynchronous. It
+    This class is used to wrap the LLMEngine class to make it asynchronous. It
     uses asyncio to create a background loop that keeps processing incoming
-    requests. The LLMServer is kicked by the generate method when there
+    requests. The LLMEngine is kicked by the generate method when there
     are requests in the waiting queue. The generate method yields the outputs
-    from the LLMServer to the caller.
+    from the LLMEngine to the caller.
-    NOTE: For the comprehensive list of arguments, see `LLMServer`.
+    NOTE: For the comprehensive list of arguments, see `LLMEngine`.
     Args:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
             `parallel_config.worker_use_ray`.
-        server_use_ray: Whether to make LLMServer a Ray actor. If so, the
+        server_use_ray: Whether to make LLMEngine a Ray actor. If so, the
             async frontend will be executed in a separate process as the
             model workers.
         log_requests: Whether to log the requests.
-        *args, *kwargs: Arguments for LLMServer.
+        *args, *kwargs: Arguments for LLMEngine.
     """
     def __init__(self, worker_use_ray: bool, server_use_ray: bool,
                  log_requests: bool = True, *args, **kwargs) -> None:
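For readers skimming the rename, the docstring above describes a pattern rather than showing it: a synchronous engine driven by a background asyncio loop, with generate() streaming results back to the caller. The toy sketch below illustrates only that pattern; the names ToyEngine, ToyAsyncEngine, step(), and queue are invented for illustration and are not part of the cacheflow API touched by this commit.

import asyncio
from typing import AsyncIterator

class ToyEngine:
    """Stand-in for a synchronous engine that is advanced one step at a time."""

    def __init__(self) -> None:
        self.queue: list[str] = []

    def step(self) -> list[str]:
        # Process one scheduling iteration and return any newly finished text.
        done, self.queue = self.queue[:1], self.queue[1:]
        return done

class ToyAsyncEngine:
    """Stand-in for the async wrapper: kicks the engine and yields its outputs."""

    def __init__(self) -> None:
        self.engine = ToyEngine()

    async def generate(self, prompt: str) -> AsyncIterator[str]:
        self.engine.queue.append(prompt)
        while True:
            outputs = self.engine.step()  # "kick" the engine, as in the docstring
            for out in outputs:
                yield out
            if not self.engine.queue:
                break
            await asyncio.sleep(0)  # let other coroutines run between steps

async def main() -> None:
    async for text in ToyAsyncEngine().generate("hello"):
        print(text)

asyncio.run(main())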
@@ -41,11 +41,11 @@ class AsyncLLMServer:
         self.server_use_ray = server_use_ray
         self.log_requests = log_requests
         if not self.server_use_ray:
-            server_class = LLMServer
+            server_class = LLMEngine
         elif self.worker_use_ray:
-            server_class = ray.remote(num_cpus=0)(LLMServer).remote
+            server_class = ray.remote(num_cpus=0)(LLMEngine).remote
         else:
-            server_class = ray.remote(num_gpus=1)(LLMServer).remote
+            server_class = ray.remote(num_gpus=1)(LLMEngine).remote
         self.server = server_class(*args, **kwargs)
         # Request id -> request output.
         self.request_outputs: Dict[str, RequestOutput] = {}
@@ -85,8 +85,8 @@ class AsyncLLMServer:
         """Generate outputs for a request.
         Generate outputs for a request. This method is a coroutine. It adds the
-        request into the waiting queue of the LLMServer and streams the outputs
-        from the LLMServer to the caller.
+        request into the waiting queue of the LLMEngine and streams the outputs
+        from the LLMEngine to the caller.
         Args:
             prompt: The prompt string. Can be None if prompt_token_ids is
@@ -97,7 +97,7 @@ class AsyncLLMServer:
                 use the tokenizer to convert the prompts to token IDs.
         Yields:
-            The output `RequestOutput` objects from the LLMServer for the
+            The output `RequestOutput` objects from the LLMEngine for the
             request.
         """
         # Preprocess the request.
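As a hedged illustration of the streaming behaviour this docstring describes, a caller would typically consume generate() with async for, roughly as below. The exact generate() parameter order and the RequestOutput/CompletionOutput field names are assumptions based on the imports in this file, not confirmed by the diff.

from cacheflow.sampling_params import SamplingParams

async def stream_one_request(engine, prompt: str, request_id: str) -> None:
    # `engine` is assumed to be an already constructed AsyncLLMEngine.
    sampling_params = SamplingParams()  # assumed library defaults
    # Each yielded RequestOutput is assumed to carry the text generated so far.
    async for request_output in engine.generate(prompt, sampling_params, request_id):
        for completion in request_output.outputs:
            print(completion.text)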
@@ -200,7 +200,7 @@ class AsyncLLMServer:
         self.kicking_request_id = None
     @classmethod
-    def from_server_args(cls, server_args: AsyncServerArgs) -> "AsyncLLMServer":
+    def from_server_args(cls, server_args: AsyncServerArgs) -> "AsyncLLMEngine":
         """Creates an async LLM server from the server arguments."""
         # Create the server configs.
         server_configs = server_args.create_server_configs()
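A short usage sketch of the renamed classmethod follows; the AsyncServerArgs `model` field, the model name, and the module path are assumptions for illustration and are not shown in this hunk.

from cacheflow.server.arg_utils import AsyncServerArgs
from cacheflow.server.async_llm_server import AsyncLLMEngine  # module path assumed

# Build the async engine from parsed server arguments, as the classmethod
# above does internally via create_server_configs().
server_args = AsyncServerArgs(model="facebook/opt-125m")  # hypothetical model
engine = AsyncLLMEngine.from_server_args(server_args)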