Collect system stats in scheduler & Add scripts for experiments (#30)

2023-04-12 15:03:49 -07:00
parent e3cec88aa5
commit 84eee24e20
12 changed files with 830 additions and 3 deletions
--- a/cacheflow/master/scheduler.py
+++ b/cacheflow/master/scheduler.py
@@ -1,6 +1,8 @@
 import enum
+import os
+import pickle
 import time
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from cacheflow.master.block_manager import BlockSpaceManager
 from cacheflow.master.policy import PolicyFactory
@@ -34,12 +36,18 @@ class Scheduler:
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        max_num_batched_tokens: int,
+        max_num_sequences: int,
+        collect_stats: bool,
+        do_memory_analysis: bool = False,
    ) -> None:
        self.controllers = controllers
        self.block_size = block_size
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks
        self.max_num_batched_tokens = max_num_batched_tokens
+        self.max_num_sequences = max_num_sequences
+        self.collect_stats = collect_stats
+        self.do_memory_analysis = do_memory_analysis

        # Instantiate the scheduling policy.
        self.policy = PolicyFactory.get_policy(policy_name='fcfs')
@@ -61,6 +69,9 @@ class Scheduler:
        # Sequence groups in the SWAPPED state.
        self.swapped: List[SequenceGroup] = []

+        # Performance-related statistics.
+        self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
+
    def add_sequence_groups(
        self,
        seq_groups: List[Tuple[SequenceGroup, SamplingParams]],
@@ -123,6 +134,12 @@ class Scheduler:
            if not self.block_manager.can_swap_in(seq_group):
                break

+            # The total number of sequences in the RUNNING state should not
+            # exceed the maximum number of sequences.
+            num_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
+            if len(self.running) + num_seqs > self.max_num_sequences:
+                break
+
            seq_group = self.swapped.pop(0)
            self._swap_in(seq_group, blocks_to_swap_in)
            self._append(seq_group, blocks_to_copy)
@@ -156,12 +173,68 @@ class Scheduler:
                    > self.max_num_batched_tokens):
                    break

+                # The total number of sequences in the RUNNING state should not
+                # exceed the maximum number of sequences.
+                num_seqs = seq_group.num_seqs(status=SequenceStatus.WAITING)
+                if len(self.running) + num_seqs > self.max_num_sequences:
+                    break
+
                seq_group = self.waiting.pop(0)
                self._allocate(seq_group)
                self.running.append(seq_group)
                num_batched_tokens += num_prompt_tokens
                prompt_group_ids.append(seq_group.group_id)

+        if self.collect_stats:
+            if self.running or blocks_to_swap_in or blocks_to_swap_out:
+                self.stats.timestamps.append(now - self.stats.start_time)
+                self.stats.input_lens.append(num_batched_tokens)
+                self.stats.swap_out_lens.append(len(blocks_to_swap_out) * self.block_size)
+                self.stats.swap_in_lens.append(len(blocks_to_swap_in) * self.block_size)
+                self.stats.num_preemption.append(len(preempted))
+                self.stats.num_swapped.append(len(self.swapped))
+                self.stats.num_running.append(len(self.running))
+                self.stats.num_waiting.append(len(self.waiting))
+
+                num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
+                num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
+                self.stats.gpu_cache_usage.append(num_used_gpu_blocks / self.num_gpu_blocks)
+                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
+                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
+                self.stats.cpu_cache_usage.append(num_used_cpu_blocks / self.num_cpu_blocks)
+
+                if self.do_memory_analysis:
+                    block_tables = self.block_manager.block_tables
+                    num_logical_blocks = 0
+                    num_logical_tokens = 0
+                    num_physical_blocks = 0
+                    num_physical_tokens = 0
+                    physical_block_numbers = set()
+                    num_reserved_tokens = 0
+                    for seq_group in self.running:
+                        group_id = seq_group.group_id
+                        sampling_params = self.sampling_params[group_id]
+                        max_num_steps = sampling_params.max_num_steps
+                        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                            num_logical_blocks += len(seq.logical_token_blocks)
+                            num_logical_tokens += seq.get_len()
+
+                            seq_id = seq.seq_id
+                            block_table = block_tables[seq_id]
+                            for i, block in enumerate(block_table):
+                                if block.block_number in physical_block_numbers:
+                                    continue
+                                physical_block_numbers.add(block.block_number)
+                                num_physical_blocks += 1
+                                num_physical_tokens += seq.logical_token_blocks[i].num_tokens
+                    
+                    assert num_physical_blocks == num_used_gpu_blocks
+                    self.stats.num_logical_blocks.append(num_logical_blocks)
+                    self.stats.num_logical_tokens.append(num_logical_tokens)
+                    self.stats.num_physical_blocks.append(num_physical_blocks)
+                    self.stats.num_physical_tokens.append(num_physical_tokens)
+                    self.stats.num_reserved_tokens.append(num_reserved_tokens)
+
        return (blocks_to_swap_in,
                blocks_to_swap_out,
                blocks_to_copy,
@@ -381,3 +454,75 @@ class Scheduler:
        blocks_to_swap_out.update(mapping)
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            seq.status = SequenceStatus.SWAPPED
+
+    def reset_stats(self) -> None:
+        self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
+
+    def save_stats(
+        self,
+        output_dir: str,
+    ) -> None:
+        assert self.collect_stats, 'Statistics collection is disabled.'
+        self.stats.save(output_dir)
+
+
+class Stats:
+
+    def __init__(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        self.start_time: float = time.time()
+        self.num_gpu_blocks = num_gpu_blocks
+        self.num_cpu_blocks = num_cpu_blocks
+
+        self.timestamps: List[float] = []
+        self.input_lens: List[int] = []
+        self.swap_out_lens: List[int] = []
+        self.swap_in_lens: List[int] = []
+        self.num_preemption: List[int] = []
+        self.num_waiting: List[int] = []
+        self.num_running: List[int] = []
+        self.num_swapped: List[int] = []
+        self.gpu_cache_usage: List[float] = []
+        self.cpu_cache_usage: List[float] = []
+
+        self.num_logical_blocks: List[int] = []
+        self.num_logical_tokens: List[int] = []
+        self.num_physical_blocks: List[int] = []
+        self.num_physical_tokens: List[int] = []
+        self.num_reserved_tokens: List[int] = []
+
+    def reset(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        self.__init__(num_gpu_blocks, num_cpu_blocks)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'start_time': self.start_time,
+            'num_gpu_blocks': self.num_gpu_blocks,
+            'num_cpu_blocks': self.num_cpu_blocks,
+            'timestamps': self.timestamps,
+            'input_lens': self.input_lens,
+            'swap_out_lens': self.swap_out_lens,
+            'swap_in_lens': self.swap_in_lens,
+            'num_preemption': self.num_preemption,
+            'num_waiting': self.num_waiting,
+            'num_running': self.num_running,
+            'num_swapped': self.num_swapped,
+            'gpu_cache_usage': self.gpu_cache_usage,
+            'cpu_cache_usage': self.cpu_cache_usage,
+            'num_logical_blocks': self.num_logical_blocks,
+            'num_logical_tokens': self.num_logical_tokens,
+            'num_physical_blocks': self.num_physical_blocks,
+            'num_physical_tokens': self.num_physical_tokens,
+            'num_reserved_tokens': self.num_reserved_tokens,
+        }
+
+    def save(self, output_dir: str) -> None:
+        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
+            pickle.dump(self.to_dict(), f)