Support beam search & parallel generation (#7)

2023-03-10 09:58:21 -08:00
parent 04e5acc08e
commit 1a7eb7da61
16 changed files with 660 additions and 161 deletions
--- a/cacheflow/worker/controller.py
+++ b/cacheflow/worker/controller.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Union

 from cacheflow.master.scheduler import Scheduler
+from cacheflow.sequence import SequenceGroupInputs
 from cacheflow.worker.worker import Worker


@@ -14,7 +15,8 @@ class Controller:
        block_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
-        dtype: str = 'half',
+        dtype: str,
+        seed: int,
    ) -> None:
        self.node_id = node_id
        self.num_workers = num_workers
@@ -37,6 +39,7 @@ class Controller:
                num_gpu_blocks=num_gpu_blocks,
                num_cpu_blocks=num_cpu_blocks,
                dtype=dtype,
+                seed=seed,
            )
            self.workers.append(worker)

@@ -49,22 +52,16 @@ class Controller:

    def execute_stage(
        self,
-        prompt_tokens: Dict[int, List[int]],
-        generation_tokens: Dict[int, int],
-        context_lens: Dict[int, int],
-        block_tables: Dict[int, List[int]],
+        input_seq_groups: List[SequenceGroupInputs],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
    ) -> None:
        # FIXME: Support tensor parallelism.
        assert len(self.workers) == 1
        worker = self.workers[0]
        output = worker.execute_stage(
-            prompt_tokens,
-            generation_tokens,
-            context_lens,
-            block_tables,
+            input_seq_groups,
            blocks_to_swap_in,
            blocks_to_swap_out,
            blocks_to_copy,