Rename servers to engines (#152)
cacheflow/engine/__init__.py        (new file, empty)

cacheflow/engine/arg_utils.py       (new file, 135 lines)
@@ -0,0 +1,135 @@
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
                              SchedulerConfig)


@dataclass
class EngineArgs:
    """Arguments for CacheFlow engine."""
    model: str
    download_dir: Optional[str] = None
    use_np_weights: bool = False
    use_dummy_weights: bool = False
    dtype: str = "auto"
    seed: int = 0
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.95
    max_num_batched_tokens: int = 2560
    max_num_seqs: int = 256
    disable_log_stats: bool = False

    def __post_init__(self):
        self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens)

    @staticmethod
    def add_cli_args(
        parser: argparse.ArgumentParser,
    ) -> argparse.ArgumentParser:
        """Shared CLI arguments for CacheFlow engine."""
        # Model arguments
        parser.add_argument('--model', type=str, default='facebook/opt-125m',
                            help='name or path of the huggingface model to use')
        parser.add_argument('--download-dir', type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                                 'default to the default cache dir of '
                                 'huggingface')
        parser.add_argument('--use-np-weights', action='store_true',
                            help='save a numpy copy of model weights for '
                                 'faster loading. This can increase the disk '
                                 'usage by up to 2x.')
        parser.add_argument('--use-dummy-weights', action='store_true',
                            help='use dummy values for model weights')
        # TODO(woosuk): Support FP32.
        parser.add_argument('--dtype', type=str, default=EngineArgs.dtype,
                            choices=['auto', 'half', 'bfloat16', 'float'],
                            help='data type for model weights and activations. '
                                 'The "auto" option will use FP16 precision '
                                 'for FP32 and FP16 models, and BF16 precision '
                                 'for BF16 models.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray', action='store_true',
                            help='use Ray for distributed serving, will be '
                                 'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size', '-pp', type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size', '-tp', type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
        parser.add_argument('--block-size', type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed', type=int, default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space', type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization', type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the fraction of GPU memory to be used for '
                                 'the model executor')
        parser.add_argument('--max-num-batched-tokens', type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                                 'iteration')
        parser.add_argument('--max-num-seqs', type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--disable-log-stats', action='store_true',
                            help='disable logging statistics')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
        # Initialize the configs.
        model_config = ModelConfig(
            self.model, self.download_dir, self.use_np_weights,
            self.use_dummy_weights, self.dtype, self.seed)
        cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
                                   self.swap_space)
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs)
        return model_config, cache_config, parallel_config, scheduler_config


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous CacheFlow engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False

    @staticmethod
    def add_cli_args(
        parser: argparse.ArgumentParser,
    ) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray', action='store_true',
                            help='use Ray to start the LLM engine in a '
                                 'separate process from the server process')
        parser.add_argument('--disable-log-requests', action='store_true',
                            help='disable logging requests')
        return parser
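
These two dataclasses are the single source of truth for CLI flags and engine configuration. The snippet below is a minimal, illustrative sketch (not part of this commit) of how a frontend script might wire them together; the printed config attributes (`model`, `block_size`, `world_size`) are the ones referenced elsewhere in this diff.

import argparse

from cacheflow.engine.arg_utils import EngineArgs


def main() -> None:
    # Build a parser that carries all shared engine flags.
    parser = argparse.ArgumentParser(description="CacheFlow engine demo")
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    # Collect the parsed flags back into the dataclass and derive the configs.
    engine_args = EngineArgs.from_cli_args(args)
    model_config, cache_config, parallel_config, scheduler_config = (
        engine_args.create_engine_configs())
    print(f"model={model_config.model}, "
          f"block_size={cache_config.block_size}, "
          f"world_size={parallel_config.world_size}")


if __name__ == "__main__":
    main()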

cacheflow/engine/async_llm_engine.py    (new file, 218 lines)
@@ -0,0 +1,218 @@
import asyncio
import time
from typing import Dict, List, Optional

from cacheflow.engine.arg_utils import AsyncEngineArgs
from cacheflow.engine.llm_engine import LLMEngine
from cacheflow.engine.ray_utils import initialize_cluster, ray
from cacheflow.logger import init_logger
from cacheflow.outputs import RequestOutput
from cacheflow.sampling_params import SamplingParams

logger = init_logger(__name__)

TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds


class AsyncLLMEngine:
    """An asynchronous wrapper for LLMEngine.

    This class wraps the LLMEngine class to make it asynchronous. It uses
    asyncio to create a background loop that keeps processing incoming
    requests. The LLMEngine is kicked by the generate method when there
    are requests in the waiting queue. The generate method yields the outputs
    from the LLMEngine to the caller.

    NOTE: For the comprehensive list of arguments, see `LLMEngine`.

    Args:
        worker_use_ray: Whether to use Ray for model workers. Required for
            distributed execution. Should be the same as
            `parallel_config.worker_use_ray`.
        engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
            async frontend will be executed in a separate process from the
            model workers.
        log_requests: Whether to log the requests.
        *args, **kwargs: Arguments for LLMEngine.
    """

    def __init__(self, worker_use_ray: bool, engine_use_ray: bool,
                 log_requests: bool = True, *args, **kwargs) -> None:
        self.worker_use_ray = worker_use_ray
        self.engine_use_ray = engine_use_ray
        self.log_requests = log_requests
        if not self.engine_use_ray:
            engine_class = LLMEngine
        elif self.worker_use_ray:
            engine_class = ray.remote(num_cpus=0)(LLMEngine).remote
        else:
            engine_class = ray.remote(num_gpus=1)(LLMEngine).remote
        self.engine = engine_class(*args, **kwargs)
        # Request id -> request output.
        self.request_outputs: Dict[str, RequestOutput] = {}
        # Request id -> event to notify that there is new output.
        self.request_events: Dict[str, asyncio.Event] = {}
        self.is_engine_running = False
        self.kicking_request_id: Optional[str] = None

    async def engine_step(self, kicking_request_id: Optional[str] = None):
        """Kick the engine to process the waiting requests."""
        self.is_engine_running = True
        self.kicking_request_id = kicking_request_id
        if self.engine_use_ray:
            request_outputs = await self.engine.step.remote()
        else:
            # Yield to the event loop to allow other coroutines to run
            # while is_engine_running is True. This lets the engine add new
            # requests into the queue.
            await asyncio.sleep(0)
            request_outputs = self.engine.step()
        self.is_engine_running = False
        self.kicking_request_id = None

        # Notify the waiting coroutines that there are new outputs ready.
        for request_output in request_outputs:
            request_id = request_output.request_id
            self.request_outputs[request_id] = request_output
            self.request_events[request_id].set()

    async def generate(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        request_id: str,
        prompt_token_ids: Optional[List[int]] = None
    ) -> RequestOutput:
        """Generate outputs for a request.

        This method is a coroutine. It adds the request into the waiting queue
        of the LLMEngine and streams the outputs from the LLMEngine to the
        caller.

        Args:
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters of the request.
            request_id: The unique id of the request.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.

        Yields:
            The output `RequestOutput` objects from the LLMEngine for the
            request.
        """
        # Preprocess the request.
        arrival_time = time.time()

        # Create an event to notify us that there is new output from the
        # cacheflow engine.
        request_event = asyncio.Event()
        self.request_events[request_id] = request_event

        if self.log_requests:
            logger.info(f"Received request {request_id}: "
                        f"prompt: {prompt!r}, "
                        f"sampling params: {sampling_params}, "
                        f"prompt token ids: {prompt_token_ids}.")

        # Add the request into the cacheflow engine's waiting queue.
        if self.engine_use_ray:
            await self.engine.add_request.remote(
                request_id, prompt, sampling_params,
                prompt_token_ids=prompt_token_ids,
                arrival_time=arrival_time)
        else:
            self.engine.add_request(
                request_id, prompt, sampling_params,
                prompt_token_ids=prompt_token_ids,
                arrival_time=arrival_time)

        # The cacheflow engine does not have a background loop that keeps
        # processing incoming requests. Therefore, we need to keep kicking
        # the engine to process the requests.
        while True:
            if request_id not in self.request_events:
                # The request has been aborted.
                return

            # Kick the engine if the engine is not running.
            if not self.is_engine_running:
                await self.engine_step(request_id)

            # Wait for new output. The request_event will be set in engine_step
            # when there is new output available for the sequence group.
            # Added a timeout to prevent deadlock.
            try:
                await asyncio.wait_for(request_event.wait(),
                                       timeout=TIMEOUT_TO_PREVENT_DEADLOCK)
            except asyncio.TimeoutError:
                continue
            # Reset the event to wait for the next output.
            request_event.clear()

            # Decode and return new outputs.
            request_output = self.request_outputs[request_id]
            yield request_output

            # Once finished, release the resources of the sequence group.
            if request_output.finished():
                if self.log_requests:
                    logger.info(f"Finished request {request_id}.")

                del self.request_outputs[request_id]
                del self.request_events[request_id]
                # Kick the engine if the engine is not running. This handles
                # the case where there are still requests in the engine's
                # waiting queue to be executed.
                if not self.is_engine_running:
                    await self.engine_step()
                break

    async def abort(self, request_id: str) -> None:
        """Abort a request.

        Abort a submitted request. If the request is finished or not found,
        this method will be a no-op.

        Args:
            request_id: The unique id of the request.
        """
        if request_id not in self.request_events:
            # The request has already finished or been aborted.
            return

        if self.log_requests:
            logger.info(f"Aborted request {request_id}.")

        if self.engine_use_ray:
            await self.engine.abort_request.remote(request_id)
        else:
            self.engine.abort_request(request_id)

        if request_id in self.request_events:
            del self.request_events[request_id]
        if request_id in self.request_outputs:
            del self.request_outputs[request_id]

        # To prevent deadlock when a request is aborted while the engine is
        # running.
        if self.kicking_request_id == request_id:
            self.is_engine_running = False
            self.kicking_request_id = None

    @classmethod
    def from_engine_args(cls, engine_args: AsyncEngineArgs) -> "AsyncLLMEngine":
        """Creates an async LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_configs = engine_args.create_engine_configs()
        parallel_config = engine_configs[2]
        # Initialize the cluster.
        distributed_init_method, devices = initialize_cluster(
            parallel_config, engine_args.engine_use_ray)
        # Create the async LLM engine.
        engine = cls(engine_args.worker_use_ray,
                     engine_args.engine_use_ray,
                     not engine_args.disable_log_requests,
                     *engine_configs,
                     distributed_init_method, devices,
                     log_stats=not engine_args.disable_log_stats)
        return engine
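
For context, a server frontend would drive this class roughly as in the sketch below. It is illustrative only: the `SamplingParams` keyword arguments (`temperature`, `max_tokens`) are assumed from the rest of the codebase and do not appear in this diff, and the output is just printed rather than streamed over HTTP.

import argparse
import asyncio

from cacheflow.engine.arg_utils import AsyncEngineArgs
from cacheflow.engine.async_llm_engine import AsyncLLMEngine
from cacheflow.sampling_params import SamplingParams


async def main() -> None:
    parser = argparse.ArgumentParser()
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    # Build the engine (and, if configured, the Ray actors) from the CLI flags.
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # `generate` is an async generator: it keeps kicking the engine and yields
    # a RequestOutput every time new tokens are available for this request.
    sampling_params = SamplingParams(temperature=0.8, max_tokens=64)
    async for request_output in engine.generate(
            "Hello, my name is", sampling_params, request_id="demo-0"):
        print(request_output)


if __name__ == "__main__":
    asyncio.run(main())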

cacheflow/engine/llm_engine.py    (new file, 321 lines)
@@ -0,0 +1,321 @@
import time
from typing import Any, List, Optional

from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
                              SchedulerConfig)
from cacheflow.core.scheduler import Scheduler
from cacheflow.engine.arg_utils import EngineArgs
from cacheflow.engine.ray_utils import DeviceID, initialize_cluster, ray
from cacheflow.engine.tokenizer_utils import (detokenize_incrementally,
                                              get_tokenizer)
from cacheflow.logger import init_logger
from cacheflow.outputs import RequestOutput
from cacheflow.sampling_params import SamplingParams
from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus
from cacheflow.utils import Counter
from cacheflow.worker.worker import Worker

logger = init_logger(__name__)


class LLMEngine:
    """An LLM engine that receives requests and generates texts.

    This is the main class for the CacheFlow LLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The `LLM` class wraps this class for offline batched inference and the
    `AsyncLLMEngine` class wraps this class for online serving.

    NOTE: The config arguments are derived from the `EngineArgs` class. For the
    comprehensive list of arguments, see `EngineArgs`.

    Args:
        model_config: The configuration related to the LLM model.
        cache_config: The configuration related to the KV cache memory
            management.
        parallel_config: The configuration related to distributed execution.
        scheduler_config: The configuration related to the request scheduler.
        distributed_init_method: The initialization method for distributed
            execution. See `torch.distributed.init_process_group` for details.
        stage_devices: The list of devices for each stage. Each stage is a list
            of (rank, node_resource, device) tuples.
        log_stats: Whether to log statistics.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        distributed_init_method: str,
        stage_devices: List[List[DeviceID]],
        log_stats: bool,
    ) -> None:
        logger.info(
            "Initializing an LLM engine with config: "
            f"model={model_config.model!r}, "
            f"dtype={model_config.dtype}, "
            f"use_dummy_weights={model_config.use_dummy_weights}, "
            f"download_dir={model_config.download_dir!r}, "
            f"use_np_weights={model_config.use_np_weights}, "
            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
            f"seed={model_config.seed})"
        )
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config
        self.cache_config = cache_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.log_stats = log_stats
        self._verify_args()

        self.tokenizer = get_tokenizer(model_config.model)
        self.seq_counter = Counter()

        # Create the parallel GPU workers.
        self.workers: List[Worker] = []
        assert len(stage_devices) == 1, "Only support one stage for now."
        for rank, node_resource, _ in stage_devices[0]:
            worker_cls = Worker
            if self.parallel_config.worker_use_ray:
                worker_cls = ray.remote(
                    num_cpus=0,
                    num_gpus=1,
                    resources={node_resource: 1e-5},
                )(worker_cls).remote

            worker = worker_cls(
                model_config,
                parallel_config,
                scheduler_config,
                rank,
                distributed_init_method,
            )
            self.workers.append(worker)
        # Profile the memory usage and initialize the cache.
        self._init_cache()

        # Create the scheduler.
        self.scheduler = Scheduler(scheduler_config, cache_config, log_stats)

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)

    def _init_cache(self) -> None:
        """Profiles the memory usage and initializes the KV cache."""
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self._run_workers(
            "profile_num_available_blocks",
            get_all_outputs=True,
            block_size=self.cache_config.block_size,
            gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
            cpu_swap_space=self.cache_config.swap_space_bytes,
        )

        # Since we use a shared centralized controller, we take the minimum
        # number of blocks across all workers to make sure all the memory
        # operators can be applied to all workers.
        num_gpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)
        # FIXME(woosuk): Change to debug log.
        logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                    f'# CPU blocks: {num_cpu_blocks}')
        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        # Initialize the cache.
        self._run_workers("init_cache_engine", cache_config=self.cache_config)

    @classmethod
    def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_configs = engine_args.create_engine_configs()
        parallel_config = engine_configs[2]
        # Initialize the cluster.
        distributed_init_method, devices = initialize_cluster(parallel_config)
        # Create the LLM engine.
        engine = cls(*engine_configs, distributed_init_method, devices,
                     log_stats=not engine_args.disable_log_stats)
        return engine

    def add_request(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
    ) -> None:
        """Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters for text generation.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            arrival_time: The arrival time of the request. If None, we use
                the current time.
        """
        if arrival_time is None:
            arrival_time = time.time()
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = self.tokenizer.encode(prompt)

        # Create the sequences.
        block_size = self.cache_config.block_size
        seqs: List[Sequence] = []
        for _ in range(sampling_params.best_of):
            seq_id = next(self.seq_counter)
            seq = Sequence(seq_id, prompt, prompt_token_ids, block_size)
            seqs.append(seq)

        # Create the sequence group.
        seq_group = SequenceGroup(request_id, seqs, sampling_params,
                                  arrival_time)

        # Add the sequence group to the scheduler.
        self.scheduler.add_seq_group(seq_group)

    def abort_request(self, request_id: str) -> None:
        """Aborts a request with the given ID.

        Args:
            request_id: The ID of the request to abort.
        """
        self.scheduler.abort_seq_group(request_id)

    def get_num_unfinished_requests(self) -> int:
        """Gets the number of unfinished requests."""
        return self.scheduler.get_num_unfinished_seq_groups()

    def has_unfinished_requests(self) -> bool:
        """Returns True if there are unfinished requests."""
        return self.scheduler.has_unfinished_seqs()

    def step(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.

        This function performs one decoding iteration of the engine. It first
        schedules the sequences to be executed in the next iteration and the
        token blocks to be swapped in/out/copied. Then, it executes the model
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.
        """
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
        if (not seq_group_metadata_list) and scheduler_outputs.is_empty():
            # Nothing to do.
            return []

        # Execute the model.
        output = self._run_workers(
            "execute_model",
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
            blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
            blocks_to_copy=scheduler_outputs.blocks_to_copy,
        )
        # Update the scheduler with the model outputs.
        seq_groups = self.scheduler.update(output)

        # Decode the sequences.
        self._decode_sequences(seq_groups)
        # Stop the sequences that meet the stopping criteria.
        self._stop_sequences(seq_groups)
        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

        # Create the outputs.
        request_outputs: List[RequestOutput] = []
        for seq_group in seq_groups:
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)
        return request_outputs

    def _decode_sequences(self, seq_groups: List[SequenceGroup]) -> None:
        """Decodes the sequence outputs."""
        for seq_group in seq_groups:
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                new_token, new_output_text = detokenize_incrementally(
                    self.tokenizer,
                    seq.output_tokens,
                    seq.get_last_token_id(),
                    skip_special_tokens=True,
                )
                seq.output_tokens.append(new_token)
                seq.output_text = new_output_text

    def _stop_sequences(self, seq_groups: List[SequenceGroup]) -> None:
        """Stop the finished sequences."""
        for seq_group in seq_groups:
            sampling_params = seq_group.sampling_params
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                # Check if the sequence has generated a stop string.
                stopped = False
                for stop_str in sampling_params.stop:
                    if seq.output_text.endswith(stop_str):
                        # Truncate the output text so that the stop string is
                        # not included in the output.
                        seq.output_text = seq.output_text[:-len(stop_str)]
                        self.scheduler.free_seq(seq,
                                                SequenceStatus.FINISHED_STOPPED)
                        stopped = True
                        break
                if stopped:
                    continue

                # Check if the sequence has reached max_tokens.
                if seq.get_output_len() == sampling_params.max_tokens:
                    self.scheduler.free_seq(
                        seq, SequenceStatus.FINISHED_LENGTH_CAPPED)
                    continue
                # Check if the sequence has generated the EOS token.
                if not sampling_params.ignore_eos:
                    if seq.get_last_token_id() == self.tokenizer.eos_token_id:
                        self.scheduler.free_seq(seq,
                                                SequenceStatus.FINISHED_STOPPED)
                        continue

    def _run_workers(
        self,
        method: str,
        get_all_outputs: bool = False,
        *args,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
        all_outputs = []
        for worker in self.workers:
            executor = getattr(worker, method)
            if self.parallel_config.worker_use_ray:
                executor = executor.remote

            output = executor(*args, **kwargs)
            all_outputs.append(output)

        if self.parallel_config.worker_use_ray:
            all_outputs = ray.get(all_outputs)

        if get_all_outputs:
            return all_outputs

        # Make sure all workers have the same results.
        output = all_outputs[0]
        for other_output in all_outputs[1:]:
            assert output == other_output
        return output
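
The synchronous engine is driven by the `add_request`/`step` pair. A minimal offline-style loop might look like the sketch below; it is illustrative only, and the `SamplingParams` keyword arguments (`temperature`, `max_tokens`) are assumed from the rest of the codebase rather than shown in this diff.

from cacheflow.engine.arg_utils import EngineArgs
from cacheflow.engine.llm_engine import LLMEngine
from cacheflow.sampling_params import SamplingParams


def run_offline_batch(prompts):
    # Create an engine with mostly-default arguments (single GPU, no Ray).
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

    # Enqueue every prompt, then run decoding iterations until the scheduler
    # reports that all sequence groups have finished.
    for i, prompt in enumerate(prompts):
        engine.add_request(str(i), prompt, sampling_params)

    finished = []
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished():
                finished.append(request_output)
    return finished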

cacheflow/engine/ray_utils.py    (new file, 107 lines)
@@ -0,0 +1,107 @@
import random
from typing import List, Optional, Tuple

try:
    import ray
except ImportError:
    ray = None

from cacheflow.config import ParallelConfig

DeviceID = Tuple[int, Optional[str], int]  # rank, node resource (node IP), device id


def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
) -> Tuple[str, List[List[DeviceID]]]:
    """Initialize the distributed cluster, possibly with Ray.

    Args:
        parallel_config: The configurations for parallel execution.
        engine_use_ray: Whether to use Ray for the async engine.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.

    Returns:
        A tuple of (`distributed_init_method`, `all_stage_devices`). The
        `distributed_init_method` is the address for initializing the
        distributed backend. `all_stage_devices` includes device IDs for
        each worker in each pipeline stage. Each device ID is a tuple of
        (rank, node resource, device id).
    """
    if parallel_config.worker_use_ray or engine_use_ray:
        if ray is None:
            raise ImportError(
                "Ray is not installed. Please install Ray to use distributed "
                "serving.")
        # Connect to a ray cluster.
        ray.init(address=ray_address)

    if not parallel_config.worker_use_ray:
        # Initialize cluster locally.
        port = random.randint(10000, 20000)
        # We need to set up the distributed init method to make sure
        # the distributed megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
        all_stage_devices = [[(0, None, 0)]]
        return distributed_init_method, all_stage_devices

    # For now, assume a uniform cluster where each node has the same number
    # of GPUs.
    valid_node_resources = []
    num_devices_per_node = None
    for node in ray.nodes():
        if (not node['Alive']) or node['Resources']['GPU'] <= 0:
            continue
        if num_devices_per_node is None:
            num_devices_per_node = node['Resources']['GPU']
        else:
            assert num_devices_per_node == node['Resources']['GPU'], (
                "The number of GPUs per node is not uniform.")
        for key in node['Resources']:
            if key.startswith('node:'):
                valid_node_resources.append(key)

    # Verify the parallel config.
    num_nodes = len(valid_node_resources)
    if parallel_config.world_size > num_nodes * num_devices_per_node:
        raise ValueError(
            "The number of required GPUs exceeds the total number of "
            "available GPUs.")
    if parallel_config.tensor_parallel_size >= num_devices_per_node:
        if parallel_config.tensor_parallel_size % num_devices_per_node != 0:
            raise ValueError(
                "The tensor parallel size is not divisible by the number of "
                "GPUs per node.")
    else:
        if num_devices_per_node % parallel_config.tensor_parallel_size != 0:
            raise ValueError(
                "The number of GPUs per node is not divisible by the tensor "
                "parallel size.")

    # Assign GPUs to pipeline stages.
    rank = 0
    current_node_id = 0
    current_device_id = 0
    distributed_init_method = None
    all_stage_devices = []

    for _ in range(parallel_config.pipeline_parallel_size):
        stage_devices = []
        for _ in range(parallel_config.tensor_parallel_size):
            node_resource = valid_node_resources[current_node_id]
            stage_devices.append((rank, node_resource, current_device_id))
            if distributed_init_method is None:
                ip = node_resource.split("node:")[-1]
                port = random.randint(10000, 20000)
                distributed_init_method = f"tcp://{ip}:{port}"
            rank += 1
            current_device_id += 1
            if current_device_id >= num_devices_per_node:
                current_node_id += 1
                current_device_id = 0
        all_stage_devices.append(stage_devices)

    return distributed_init_method, all_stage_devices
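
To make the return shape concrete: with `pipeline_parallel_size=1`, `tensor_parallel_size=2`, and Ray workers on a single two-GPU node, the function would return values roughly like the sketch below. The IP address and port are placeholders for illustration only.

# Illustrative return values for a 1-stage, 2-way tensor-parallel setup
# on one node; the node IP and port below are made up.
distributed_init_method = "tcp://192.168.0.10:14321"
all_stage_devices = [
    [  # pipeline stage 0
        (0, "node:192.168.0.10", 0),  # rank 0 -> GPU 0 on this node
        (1, "node:192.168.0.10", 1),  # rank 1 -> GPU 1 on this node
    ],
]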

cacheflow/engine/tokenizer_utils.py    (new file, 87 lines)
@@ -0,0 +1,87 @@
from typing import List, Tuple, Union

from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

from cacheflow.logger import init_logger

logger = init_logger(__name__)

_MODEL_TYPES_WITH_SLOW_TOKENIZER = []


def get_tokenizer(
    model_name: str,
    *args,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Gets a tokenizer for the given model name via Huggingface."""
    config = AutoConfig.from_pretrained(model_name)
    if config.model_type == "llama" and kwargs.get("use_fast", True):
        # The LLaMA fast tokenizer causes protobuf errors in some environments.
        # However, we found that the below LLaMA fast tokenizer works well in
        # most environments.
        model_name = "hf-internal-testing/llama-tokenizer"
        logger.info(
            f"Using the LLaMA fast tokenizer in '{model_name}' to avoid "
            "potential protobuf errors.")
    elif config.model_type in _MODEL_TYPES_WITH_SLOW_TOKENIZER:
        if kwargs.get("use_fast", False):
            raise ValueError(
                f"Cannot use the fast tokenizer for {config.model_type} due to "
                "bugs in the fast tokenizer.")
        logger.info(
            f"Using the slow tokenizer for {config.model_type} due to bugs in "
            "the fast tokenizer. This could potentially lead to performance "
            "degradation.")
        kwargs["use_fast"] = False
    return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)


def detokenize_incrementally(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    prev_output_tokens: List[str],
    new_token_id: int,
    skip_special_tokens: bool,
) -> Tuple[str, str]:
    """Detokenizes the new token in conjunction with the previous output tokens.

    NOTE: This function does not update prev_output_tokens.

    Returns:
        new_token: The new token as a string.
        output_text: The new output text as a string.
    """
    new_token = tokenizer.convert_ids_to_tokens(
        new_token_id, skip_special_tokens=skip_special_tokens)
    output_tokens = prev_output_tokens + [new_token]

    # Convert the tokens to a string.
    # Optimization: If the tokenizer does not have `added_tokens_encoder`,
    # then we can directly use `convert_tokens_to_string`.
    if not getattr(tokenizer, "added_tokens_encoder", {}):
        output_text = tokenizer.convert_tokens_to_string(output_tokens)
        return new_token, output_text

    # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    # NOTE(woosuk): The following code is slow because it runs a for loop over
    # the output_tokens. In Python, running a for loop over a list can be slow
    # even when the loop body is very simple.
    sub_texts = []
    current_sub_text = []
    for token in output_tokens:
        # `output_tokens` holds token strings, so compare against the special
        # token strings rather than their IDs.
        if skip_special_tokens and token in tokenizer.all_special_tokens:
            continue
        if token in tokenizer.added_tokens_encoder:
            if current_sub_text:
                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
                sub_texts.append(sub_text)
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
        sub_texts.append(sub_text)
    output_text = " ".join(sub_texts)
    return new_token, output_text
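
A small sketch of how the incremental detokenization is intended to be used, mirroring `LLMEngine._decode_sequences` above. It simulates a decoding loop by feeding token IDs one at a time; the OPT model name is just an example.

from cacheflow.engine.tokenizer_utils import (detokenize_incrementally,
                                              get_tokenizer)

tokenizer = get_tokenizer("facebook/opt-125m")

# Keep the running list of output token strings between calls, exactly as the
# engine keeps `seq.output_tokens`, and feed each newly sampled token ID.
output_tokens = []
output_text = ""
for new_token_id in tokenizer.encode("Hello world", add_special_tokens=False):
    new_token, output_text = detokenize_incrementally(
        tokenizer, output_tokens, new_token_id, skip_special_tokens=True)
    output_tokens.append(new_token)
print(output_text)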