Use runtime profiling to replace manual memory analyzers (#81)

2023-05-19 11:35:44 -06:00
parent 825d8892b5
commit f756799b84
14 changed files with 211 additions and 478 deletions
--- a/cacheflow/worker/controller.py
+++ b/cacheflow/worker/controller.py
@@ -23,23 +23,18 @@ class Controller:
        pipeline_parallel_size: int,
        distributed_init_method: str,
        model_name: str,
-        block_size: int,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
        dtype: str,
        seed: int,
        cache_dir: Optional[str],
        use_dummy_weights: bool,
        use_np_cache: bool,
        max_num_batched_tokens: int,
+        max_num_sequences: int,
        use_ray: bool,
    ) -> None:
        self.stage_id = stage_id
        self.stage_devices = stage_devices
        self.model_name = model_name
-        self.block_size = block_size
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
        self.use_ray = use_ray

        # Which pipeline stage is this node assigned to?
@@ -56,9 +51,6 @@ class Controller:
                worker_cls = Worker
            worker = worker_cls(
                model_name=model_name,
-                block_size=block_size,
-                num_gpu_blocks=num_gpu_blocks,
-                num_cpu_blocks=num_cpu_blocks,
                dtype=dtype,
                seed=seed,
                distributed_init_method=distributed_init_method,
@@ -70,9 +62,44 @@ class Controller:
                use_dummy_weights=use_dummy_weights,
                use_np_cache=use_np_cache,
                max_num_batched_tokens=max_num_batched_tokens,
+                max_num_sequences=max_num_sequences,
            )
            self.workers.append(worker)

+    def get_num_available_blocks(self, block_size: int, cpu_swap_space: int,
+                                 gpu_memory_utilization: float) -> List[Tuple[int, int]]:  # Queries every worker in self.workers; one entry per worker.
+        all_worker_results = []  # Holds direct return values, or Ray futures when use_ray is set.
+        for worker in self.workers:
+            executor = worker.get_num_available_blocks
+            if self.use_ray:
+                executor = executor.remote  # Under Ray the call goes through .remote and yields a future.
+
+            result = executor(
+                block_size,
+                cpu_swap_space,
+                gpu_memory_utilization,
+            )
+            all_worker_results.append(result)
+        if self.use_ray:
+            all_worker_results = ray.get(all_worker_results)  # Block until every worker has answered.
+        return all_worker_results  # Presumably (num_gpu_blocks, num_cpu_blocks) per worker -- TODO confirm in Worker.
+
+    def init_cache_engine(self, block_size: int, num_gpu_blocks: int,
+                          num_cpu_blocks: int):  # Forwards the same cache sizes to every worker; returns nothing.
+        all_worker_futures = []
+        for worker in self.workers:
+            executor = worker.init_cache_engine
+            if self.use_ray:
+                executor = executor.remote  # Under Ray the call goes through .remote and yields a future.
+            future = executor(
+                block_size,
+                num_gpu_blocks,
+                num_cpu_blocks,
+            )
+            all_worker_futures.append(future)  # Without Ray this is just the finished call's return value.
+        if self.use_ray:
+            ray.get(all_worker_futures)  # Wait for all workers to finish; the results are discarded.
+
    def set_next(
        self,
        next_node: Union['Controller', 'Scheduler'],