Collect system stats in scheduler & Add scripts for experiments (#30)

This commit is contained in:
Woosuk Kwon
2023-04-12 15:03:49 -07:00
committed by GitHub
parent e3cec88aa5
commit 84eee24e20
12 changed files with 830 additions and 3 deletions

View File

@@ -1,6 +1,8 @@
import enum
import os
import pickle
import time
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple
from cacheflow.master.block_manager import BlockSpaceManager
from cacheflow.master.policy import PolicyFactory
@@ -34,12 +36,18 @@ class Scheduler:
num_gpu_blocks: int,
num_cpu_blocks: int,
max_num_batched_tokens: int,
max_num_sequences: int,
collect_stats: bool,
do_memory_analysis: bool = False,
) -> None:
self.controllers = controllers
self.block_size = block_size
self.num_gpu_blocks = num_gpu_blocks
self.num_cpu_blocks = num_cpu_blocks
self.max_num_batched_tokens = max_num_batched_tokens
self.max_num_sequences = max_num_sequences
self.collect_stats = collect_stats
self.do_memory_analysis = do_memory_analysis
# Instantiate the scheduling policy.
self.policy = PolicyFactory.get_policy(policy_name='fcfs')
@@ -61,6 +69,9 @@ class Scheduler:
# Sequence groups in the SWAPPED state.
self.swapped: List[SequenceGroup] = []
# Performance-related statistics.
self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
def add_sequence_groups(
self,
seq_groups: List[Tuple[SequenceGroup, SamplingParams]],
@@ -123,6 +134,12 @@ class Scheduler:
if not self.block_manager.can_swap_in(seq_group):
break
# The total number of sequences in the RUNNING state should not
# exceed the maximum number of sequences.
num_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
if len(self.running) + num_seqs > self.max_num_sequences:
break
seq_group = self.swapped.pop(0)
self._swap_in(seq_group, blocks_to_swap_in)
self._append(seq_group, blocks_to_copy)
@@ -156,12 +173,68 @@ class Scheduler:
> self.max_num_batched_tokens):
break
# The total number of sequences in the RUNNING state should not
# exceed the maximum number of sequences.
num_seqs = seq_group.num_seqs(status=SequenceStatus.WAITING)
if len(self.running) + num_seqs > self.max_num_sequences:
break
seq_group = self.waiting.pop(0)
self._allocate(seq_group)
self.running.append(seq_group)
num_batched_tokens += num_prompt_tokens
prompt_group_ids.append(seq_group.group_id)
if self.collect_stats:
if self.running or blocks_to_swap_in or blocks_to_swap_out:
self.stats.timestamps.append(now - self.stats.start_time)
self.stats.input_lens.append(num_batched_tokens)
self.stats.swap_out_lens.append(len(blocks_to_swap_out) * self.block_size)
self.stats.swap_in_lens.append(len(blocks_to_swap_in) * self.block_size)
self.stats.num_preemption.append(len(preempted))
self.stats.num_swapped.append(len(self.swapped))
self.stats.num_running.append(len(self.running))
self.stats.num_waiting.append(len(self.waiting))
num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
self.stats.gpu_cache_usage.append(num_used_gpu_blocks / self.num_gpu_blocks)
num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
self.stats.cpu_cache_usage.append(num_used_cpu_blocks / self.num_cpu_blocks)
if self.do_memory_analysis:
block_tables = self.block_manager.block_tables
num_logical_blocks = 0
num_logical_tokens = 0
num_physical_blocks = 0
num_physical_tokens = 0
physical_block_numbers = set()
num_reserved_tokens = 0
for seq_group in self.running:
group_id = seq_group.group_id
sampling_params = self.sampling_params[group_id]
max_num_steps = sampling_params.max_num_steps
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
num_logical_blocks += len(seq.logical_token_blocks)
num_logical_tokens += seq.get_len()
seq_id = seq.seq_id
block_table = block_tables[seq_id]
for i, block in enumerate(block_table):
if block.block_number in physical_block_numbers:
continue
physical_block_numbers.add(block.block_number)
num_physical_blocks += 1
num_physical_tokens += seq.logical_token_blocks[i].num_tokens
assert num_physical_blocks == num_used_gpu_blocks
self.stats.num_logical_blocks.append(num_logical_blocks)
self.stats.num_logical_tokens.append(num_logical_tokens)
self.stats.num_physical_blocks.append(num_physical_blocks)
self.stats.num_physical_tokens.append(num_physical_tokens)
self.stats.num_reserved_tokens.append(num_reserved_tokens)
return (blocks_to_swap_in,
blocks_to_swap_out,
blocks_to_copy,
@@ -381,3 +454,75 @@ class Scheduler:
blocks_to_swap_out.update(mapping)
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
seq.status = SequenceStatus.SWAPPED
def reset_stats(self) -> None:
    """Throw away everything recorded so far and begin a fresh record.

    Delegates to ``self.stats.reset`` with this scheduler's GPU and CPU
    block counts.
    """
    recorder = self.stats
    recorder.reset(
        self.num_gpu_blocks,
        self.num_cpu_blocks,
    )
def save_stats(
    self,
    output_dir: str,
) -> None:
    """Write the collected statistics into ``output_dir``.

    Args:
        output_dir: Directory handed to ``self.stats.save``.

    Raises:
        AssertionError: If statistics collection is disabled
            (``self.collect_stats`` is falsy).
    """
    # Raise explicitly instead of using an `assert` statement: asserts are
    # stripped under `python -O`, which would let a scheduler that never
    # collected anything silently write an empty stats file.  The exception
    # type and message are kept so existing callers see no difference.
    if not self.collect_stats:
        raise AssertionError('Statistics collection is disabled.')
    self.stats.save(output_dir)
class Stats:
    """Container for the performance statistics recorded by the scheduler.

    Holds the cache geometry (GPU/CPU block counts), the creation time, and
    a set of lists that the scheduler appends to while it runs.  The record
    can be exported as a plain dictionary or pickled to disk.
    """

    def __init__(
        self,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
    ) -> None:
        """Create an empty record.

        Args:
            num_gpu_blocks: Total number of GPU cache blocks.
            num_cpu_blocks: Total number of CPU cache blocks.
        """
        # Wall-clock creation time; the scheduler stores each timestamp as
        # an offset from this value.
        self.start_time: float = time.time()
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks

        # Offsets from `start_time` at which a sample was recorded.
        self.timestamps: List[float] = []
        # Number of batched tokens per sample.
        self.input_lens: List[int] = []
        # Swapped-out / swapped-in block counts multiplied by the block size.
        self.swap_out_lens: List[int] = []
        self.swap_in_lens: List[int] = []
        # Number of preempted entries per sample.
        self.num_preemption: List[int] = []
        # Lengths of the scheduler's waiting / running / swapped queues.
        self.num_waiting: List[int] = []
        self.num_running: List[int] = []
        self.num_swapped: List[int] = []
        # Fraction of GPU / CPU cache blocks in use.
        self.gpu_cache_usage: List[float] = []
        self.cpu_cache_usage: List[float] = []

        # Filled only when the scheduler's memory analysis is enabled.
        self.num_logical_blocks: List[int] = []
        self.num_logical_tokens: List[int] = []
        self.num_physical_blocks: List[int] = []
        self.num_physical_tokens: List[int] = []
        self.num_reserved_tokens: List[int] = []

    def reset(
        self,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
    ) -> None:
        """Clear every recorded series and restart the clock.

        Re-runs ``__init__``, so ``start_time`` is refreshed and all lists
        are replaced by new empty ones.

        Args:
            num_gpu_blocks: Total number of GPU cache blocks.
            num_cpu_blocks: Total number of CPU cache blocks.
        """
        self.__init__(num_gpu_blocks, num_cpu_blocks)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a dictionary keyed by attribute name.

        The lists are not copied: the returned dictionary refers to the
        same list objects as this instance.
        """
        return {
            'start_time': self.start_time,
            'num_gpu_blocks': self.num_gpu_blocks,
            'num_cpu_blocks': self.num_cpu_blocks,
            'timestamps': self.timestamps,
            'input_lens': self.input_lens,
            'swap_out_lens': self.swap_out_lens,
            'swap_in_lens': self.swap_in_lens,
            'num_preemption': self.num_preemption,
            'num_waiting': self.num_waiting,
            'num_running': self.num_running,
            'num_swapped': self.num_swapped,
            'gpu_cache_usage': self.gpu_cache_usage,
            'cpu_cache_usage': self.cpu_cache_usage,
            'num_logical_blocks': self.num_logical_blocks,
            'num_logical_tokens': self.num_logical_tokens,
            'num_physical_blocks': self.num_physical_blocks,
            'num_physical_tokens': self.num_physical_tokens,
            'num_reserved_tokens': self.num_reserved_tokens,
        }

    def save(self, output_dir: str) -> None:
        """Pickle ``to_dict()`` into ``<output_dir>/stats.pkl``.

        The directory (including missing parents) is created when it does
        not exist yet, so a fresh experiment directory can be passed in.

        Args:
            output_dir: Destination directory.  An empty string means the
                current working directory.

        Raises:
            OSError: If the directory cannot be created or the file cannot
                be written.
        """
        # `os.makedirs('')` raises, while an empty `output_dir` is a valid
        # way to say "current directory" for `os.path.join` below.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
            pickle.dump(self.to_dict(), f)