Implement preemption via recomputation & Refactor scheduling logic (#12)

2023-03-30 14:51:46 -07:00
parent 88c0268a18
commit 7a7929abe8
7 changed files with 277 additions and 124 deletions
--- a/cacheflow/master/block_manager.py
+++ b/cacheflow/master/block_manager.py
@@ -76,7 +76,8 @@ class BlockSpaceManager:
        self.block_tables: Dict[int, BlockTable] = {}

    def can_allocate(self, seq_group: SequenceGroup) -> bool:
-        # NOTE: Here we assume that all sequences in the group have the same prompt.
+        # FIXME(woosuk): Here we assume that all sequences in the group share
+        # the same prompt. This may not be true for preempted sequences.
        seq = seq_group.seqs[0]
        num_required_blocks = len(seq.logical_token_blocks)
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()