Collect system stats in scheduler & Add scripts for experiments (#30)
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
import enum
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from cacheflow.master.block_manager import BlockSpaceManager
|
||||
from cacheflow.master.policy import PolicyFactory
|
||||
@@ -34,12 +36,18 @@ class Scheduler:
|
||||
num_gpu_blocks: int,
|
||||
num_cpu_blocks: int,
|
||||
max_num_batched_tokens: int,
|
||||
max_num_sequences: int,
|
||||
collect_stats: bool,
|
||||
do_memory_analysis: bool = False,
|
||||
) -> None:
|
||||
self.controllers = controllers
|
||||
self.block_size = block_size
|
||||
self.num_gpu_blocks = num_gpu_blocks
|
||||
self.num_cpu_blocks = num_cpu_blocks
|
||||
self.max_num_batched_tokens = max_num_batched_tokens
|
||||
self.max_num_sequences = max_num_sequences
|
||||
self.collect_stats = collect_stats
|
||||
self.do_memory_analysis = do_memory_analysis
|
||||
|
||||
# Instantiate the scheduling policy.
|
||||
self.policy = PolicyFactory.get_policy(policy_name='fcfs')
|
||||
@@ -61,6 +69,9 @@ class Scheduler:
|
||||
# Sequence groups in the SWAPPED state.
|
||||
self.swapped: List[SequenceGroup] = []
|
||||
|
||||
# Performance-related statistics.
|
||||
self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
|
||||
|
||||
def add_sequence_groups(
|
||||
self,
|
||||
seq_groups: List[Tuple[SequenceGroup, SamplingParams]],
|
||||
@@ -123,6 +134,12 @@ class Scheduler:
|
||||
if not self.block_manager.can_swap_in(seq_group):
|
||||
break
|
||||
|
||||
# The total number of sequences in the RUNNING state should not
|
||||
# exceed the maximum number of sequences.
|
||||
num_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
|
||||
if len(self.running) + num_seqs > self.max_num_sequences:
|
||||
break
|
||||
|
||||
seq_group = self.swapped.pop(0)
|
||||
self._swap_in(seq_group, blocks_to_swap_in)
|
||||
self._append(seq_group, blocks_to_copy)
|
||||
@@ -156,12 +173,68 @@ class Scheduler:
|
||||
> self.max_num_batched_tokens):
|
||||
break
|
||||
|
||||
# The total number of sequences in the RUNNING state should not
|
||||
# exceed the maximum number of sequences.
|
||||
num_seqs = seq_group.num_seqs(status=SequenceStatus.WAITING)
|
||||
if len(self.running) + num_seqs > self.max_num_sequences:
|
||||
break
|
||||
|
||||
seq_group = self.waiting.pop(0)
|
||||
self._allocate(seq_group)
|
||||
self.running.append(seq_group)
|
||||
num_batched_tokens += num_prompt_tokens
|
||||
prompt_group_ids.append(seq_group.group_id)
|
||||
|
||||
if self.collect_stats:
|
||||
if self.running or blocks_to_swap_in or blocks_to_swap_out:
|
||||
self.stats.timestamps.append(now - self.stats.start_time)
|
||||
self.stats.input_lens.append(num_batched_tokens)
|
||||
self.stats.swap_out_lens.append(len(blocks_to_swap_out) * self.block_size)
|
||||
self.stats.swap_in_lens.append(len(blocks_to_swap_in) * self.block_size)
|
||||
self.stats.num_preemption.append(len(preempted))
|
||||
self.stats.num_swapped.append(len(self.swapped))
|
||||
self.stats.num_running.append(len(self.running))
|
||||
self.stats.num_waiting.append(len(self.waiting))
|
||||
|
||||
num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
|
||||
num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
|
||||
self.stats.gpu_cache_usage.append(num_used_gpu_blocks / self.num_gpu_blocks)
|
||||
num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
|
||||
num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
|
||||
self.stats.cpu_cache_usage.append(num_used_cpu_blocks / self.num_cpu_blocks)
|
||||
|
||||
if self.do_memory_analysis:
|
||||
block_tables = self.block_manager.block_tables
|
||||
num_logical_blocks = 0
|
||||
num_logical_tokens = 0
|
||||
num_physical_blocks = 0
|
||||
num_physical_tokens = 0
|
||||
physical_block_numbers = set()
|
||||
num_reserved_tokens = 0
|
||||
for seq_group in self.running:
|
||||
group_id = seq_group.group_id
|
||||
sampling_params = self.sampling_params[group_id]
|
||||
max_num_steps = sampling_params.max_num_steps
|
||||
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
|
||||
num_logical_blocks += len(seq.logical_token_blocks)
|
||||
num_logical_tokens += seq.get_len()
|
||||
|
||||
seq_id = seq.seq_id
|
||||
block_table = block_tables[seq_id]
|
||||
for i, block in enumerate(block_table):
|
||||
if block.block_number in physical_block_numbers:
|
||||
continue
|
||||
physical_block_numbers.add(block.block_number)
|
||||
num_physical_blocks += 1
|
||||
num_physical_tokens += seq.logical_token_blocks[i].num_tokens
|
||||
|
||||
assert num_physical_blocks == num_used_gpu_blocks
|
||||
self.stats.num_logical_blocks.append(num_logical_blocks)
|
||||
self.stats.num_logical_tokens.append(num_logical_tokens)
|
||||
self.stats.num_physical_blocks.append(num_physical_blocks)
|
||||
self.stats.num_physical_tokens.append(num_physical_tokens)
|
||||
self.stats.num_reserved_tokens.append(num_reserved_tokens)
|
||||
|
||||
return (blocks_to_swap_in,
|
||||
blocks_to_swap_out,
|
||||
blocks_to_copy,
|
||||
@@ -381,3 +454,75 @@ class Scheduler:
|
||||
blocks_to_swap_out.update(mapping)
|
||||
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
|
||||
seq.status = SequenceStatus.SWAPPED
|
||||
|
||||
def reset_stats(self) -> None:
|
||||
self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
|
||||
|
||||
def save_stats(
|
||||
self,
|
||||
output_dir: str,
|
||||
) -> None:
|
||||
assert self.collect_stats, 'Statistics collection is disabled.'
|
||||
self.stats.save(output_dir)
|
||||
|
||||
|
||||
class Stats:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_gpu_blocks: int,
|
||||
num_cpu_blocks: int,
|
||||
) -> None:
|
||||
self.start_time: float = time.time()
|
||||
self.num_gpu_blocks = num_gpu_blocks
|
||||
self.num_cpu_blocks = num_cpu_blocks
|
||||
|
||||
self.timestamps: List[float] = []
|
||||
self.input_lens: List[int] = []
|
||||
self.swap_out_lens: List[int] = []
|
||||
self.swap_in_lens: List[int] = []
|
||||
self.num_preemption: List[int] = []
|
||||
self.num_waiting: List[int] = []
|
||||
self.num_running: List[int] = []
|
||||
self.num_swapped: List[int] = []
|
||||
self.gpu_cache_usage: List[float] = []
|
||||
self.cpu_cache_usage: List[float] = []
|
||||
|
||||
self.num_logical_blocks: List[int] = []
|
||||
self.num_logical_tokens: List[int] = []
|
||||
self.num_physical_blocks: List[int] = []
|
||||
self.num_physical_tokens: List[int] = []
|
||||
self.num_reserved_tokens: List[int] = []
|
||||
|
||||
def reset(
|
||||
self,
|
||||
num_gpu_blocks: int,
|
||||
num_cpu_blocks: int,
|
||||
) -> None:
|
||||
self.__init__(num_gpu_blocks, num_cpu_blocks)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
'start_time': self.start_time,
|
||||
'num_gpu_blocks': self.num_gpu_blocks,
|
||||
'num_cpu_blocks': self.num_cpu_blocks,
|
||||
'timestamps': self.timestamps,
|
||||
'input_lens': self.input_lens,
|
||||
'swap_out_lens': self.swap_out_lens,
|
||||
'swap_in_lens': self.swap_in_lens,
|
||||
'num_preemption': self.num_preemption,
|
||||
'num_waiting': self.num_waiting,
|
||||
'num_running': self.num_running,
|
||||
'num_swapped': self.num_swapped,
|
||||
'gpu_cache_usage': self.gpu_cache_usage,
|
||||
'cpu_cache_usage': self.cpu_cache_usage,
|
||||
'num_logical_blocks': self.num_logical_blocks,
|
||||
'num_logical_tokens': self.num_logical_tokens,
|
||||
'num_physical_blocks': self.num_physical_blocks,
|
||||
'num_physical_tokens': self.num_physical_tokens,
|
||||
'num_reserved_tokens': self.num_reserved_tokens,
|
||||
}
|
||||
|
||||
def save(self, output_dir: str) -> None:
|
||||
with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
|
||||
pickle.dump(self.to_dict(), f)
|
||||
|
||||
Reference in New Issue
Block a user