Add docstrings for LLMServer and related classes and examples (#142)
@@ -14,15 +14,30 @@ DeviceID = Tuple[int, Optional[str], int]  # rank, node resource (node IP), device id
 def initialize_cluster(
     parallel_config: ParallelConfig,
     server_use_ray: bool = False,
-    address: Optional[str] = None,
+    ray_server_address: Optional[str] = None,
 ) -> Tuple[str, List[List[DeviceID]]]:
+    """Initialize the distributed cluster, possibly with Ray.
+
+    Args:
+        parallel_config: The configurations for parallel execution.
+        server_use_ray: Whether to use Ray for the async server.
+        ray_server_address: The address of the Ray cluster. If None, uses
+            the default Ray cluster address.
+
+    Returns:
+        A tuple of (`distributed_init_method`, `all_stage_devices`). The
+        `distributed_init_method` is the address for initializing the
+        distributed backend. `all_stage_devices` includes device IDs for
+        each worker in each pipeline stage. Each device ID is a tuple of
+        (rank, node resource, device id).
+    """
     if parallel_config.worker_use_ray or server_use_ray:
         if ray is None:
             raise ImportError(
                 "Ray is not installed. Please install Ray to use distributed "
                 "serving.")
         # Connect to a ray cluster.
-        ray.init(address=address)
+        ray.init(address=ray_server_address)
 
     if not parallel_config.worker_use_ray:
         # Initialize cluster locally.
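For context, a minimal usage sketch of the function documented above. The import paths and the ParallelConfig constructor arguments are assumptions for illustration; only the call signature and the shape of the return tuple come from this diff.

    # Hypothetical usage sketch; the import paths and ParallelConfig's
    # constructor arguments are assumed, not taken from this commit.
    from cacheflow.config import ParallelConfig      # assumed path
    from cacheflow.server import initialize_cluster  # assumed path

    parallel_config = ParallelConfig(
        pipeline_parallel_size=1,  # assumed parameter names
        tensor_parallel_size=1,
        worker_use_ray=False,
    )
    distributed_init_method, all_stage_devices = initialize_cluster(
        parallel_config)
    # all_stage_devices holds one list per pipeline stage; each entry is
    # a DeviceID tuple, e.g. (0, "node:10.0.0.1", 0) for rank 0 on the
    # node resource "node:10.0.0.1", device id 0.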
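The `if ray is None` check above implies that `ray` is soft-imported at the top of the module. A sketch of that common optional-dependency pattern follows; it is an assumption about the surrounding file, which is not shown in this diff.

    # Guarded import implied by the `ray is None` check: if Ray is not
    # installed, the name is bound to None, and distributed serving
    # raises ImportError until Ray is installed.
    try:
        import ray
    except ImportError:
        ray = None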