Add an option to launch cacheflow without ray (#51)
This commit is contained in:
@@ -1,8 +1,12 @@
|
||||
import argparse
|
||||
from typing import List, Tuple
|
||||
from typing import List, Tuple, Optional
|
||||
import random
|
||||
|
||||
import ray
|
||||
import torch
|
||||
try:
|
||||
import ray
|
||||
except ImportError:
|
||||
ray = None
|
||||
|
||||
from cacheflow.master.scheduler import Scheduler
|
||||
from cacheflow.models import get_memory_analyzer
|
||||
@@ -31,6 +35,7 @@ class Server:
|
||||
all_stage_devices: List[List[DeviceID]],
|
||||
gpu_memory: int,
|
||||
cpu_memory: int,
|
||||
use_ray: bool,
|
||||
collect_stats: bool = False,
|
||||
do_memory_analysis: bool = False,
|
||||
):
|
||||
@@ -38,6 +43,10 @@ class Server:
|
||||
self.num_devices_per_node = num_devices_per_node
|
||||
self.world_size = pipeline_parallel_size * tensor_parallel_size
|
||||
|
||||
if not use_ray:
|
||||
assert self.world_size == 1, (
|
||||
"Only support single GPU without Ray.")
|
||||
|
||||
self.memory_analyzer = get_memory_analyzer(
|
||||
model_name=model,
|
||||
block_size=block_size,
|
||||
@@ -72,6 +81,7 @@ class Server:
|
||||
model_path=model_path,
|
||||
use_dummy_weights=use_dummy_weights,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
use_ray=use_ray,
|
||||
)
|
||||
self.controllers.append(controller)
|
||||
|
||||
@@ -105,11 +115,30 @@ class Server:
|
||||
self.scheduler.swapped)
|
||||
|
||||
|
||||
def initialize_ray_cluster(
|
||||
address: str = 'auto',
|
||||
def initialize_cluster(
|
||||
use_ray: bool = False,
|
||||
address: Optional[str] = None,
|
||||
pipeline_parallel_size: int = 1,
|
||||
tensor_parallel_size: int = 1,
|
||||
) -> Tuple[int, int, str, List[List[DeviceID]]]:
|
||||
# Initialize cluster locally.
|
||||
if not use_ray:
|
||||
assert pipeline_parallel_size * tensor_parallel_size == 1, (
|
||||
"Only support single GPU without Ray.")
|
||||
num_nodes = 1
|
||||
num_devices_per_node = torch.cuda.device_count()
|
||||
port = random.randint(10000, 20000)
|
||||
# We need to setup the distributed init method to make sure
|
||||
# the distributed megatron code (e.g., get world size) works correctly.
|
||||
distributed_init_method = f"tcp://localhost:{port}"
|
||||
all_stage_devices = [[(0, None, 0)]]
|
||||
return (num_nodes, num_devices_per_node, distributed_init_method,
|
||||
all_stage_devices)
|
||||
|
||||
assert ray is not None, (
|
||||
"Ray is not installed. Please install Ray to use distributed "
|
||||
"serving.")
|
||||
|
||||
# Connect to a ray cluster.
|
||||
ray.init(address=address)
|
||||
|
||||
@@ -177,6 +206,7 @@ def add_server_arguments(parser: argparse.ArgumentParser):
|
||||
parser.add_argument('--model-path', type=str, default='~/.cacheflow/model_weights',
|
||||
help='model path to download and load the weights')
|
||||
# Parallel arguments
|
||||
parser.add_argument('--use-ray', action='store_true', help='use Ray for distributed serving, will be automatically set when using more than 1 GPU')
|
||||
parser.add_argument('--pipeline-parallel-size', '-pp', type=int, default=1, help='number of pipeline stages')
|
||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1, help='number of tensor parallel replicas')
|
||||
# KV cache arguments
|
||||
@@ -190,3 +220,8 @@ def add_server_arguments(parser: argparse.ArgumentParser):
|
||||
parser.add_argument('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
|
||||
parser.add_argument('--use-dummy-weights', action='store_true', help='use dummy values for model weights')
|
||||
return parser
|
||||
|
||||
def process_server_arguments(args: argparse.Namespace):
|
||||
if args.pipeline_parallel_size * args.tensor_parallel_size > 1:
|
||||
args.use_ray = True
|
||||
return args
|
||||
|
||||
Reference in New Issue
Block a user