FastAPI-based working frontend (#10)

2023-03-29 14:48:56 +08:00
parent d359cda5fa
commit 721fa3df15
15 changed files with 536 additions and 146 deletions
--- a/cacheflow/models/memory_analyzer.py
+++ b/cacheflow/models/memory_analyzer.py
@@ -1,9 +1,7 @@
 import torch
 from transformers import AutoConfig

-from cacheflow.models.utils import get_cpu_memory
 from cacheflow.models.utils import get_dtype_size
-from cacheflow.models.utils import get_gpu_memory

 _GiB = 1 << 30

@@ -31,11 +29,15 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
        model_name: str,
        block_size: int,
        dtype: torch.dtype,
+        gpu_memory: int,
+        cpu_memory: int,
        tensor_parallel_size: int,
    ) -> None:
        self.model_name = model_name
        self.block_size = block_size
        self.dtype = dtype
+        self.gpu_memory = gpu_memory
+        self.cpu_memory = cpu_memory
        self.tensor_parallel_size = tensor_parallel_size

        config = AutoConfig.from_pretrained(model_name)
@@ -106,8 +108,7 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
        memory_utilization: float = 0.95,
    ) -> int:
        # NOTE(woosuk): This assumes that the machine has homogeneous GPUs.
-        gpu_memory = get_gpu_memory()
-        usable_memory = int(memory_utilization * gpu_memory)
+        usable_memory = int(memory_utilization * self.gpu_memory)

        param_size = self._get_param_size()
        act_size = self._get_max_act_size(max_num_batched_tokens)
@@ -122,16 +123,15 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
        swap_space: int,
    ) -> int:
        swap_space = swap_space * _GiB
-        cpu_memory = get_cpu_memory()
-        if swap_space > 0.8 * cpu_memory:
+        if swap_space > 0.8 * self.cpu_memory:
            raise ValueError(f'The swap space ({swap_space / _GiB:.2f} GiB) '
                             'takes more than 80% of the available memory '
-                             f'({cpu_memory / _GiB:.2f} GiB).'
+                             f'({self.cpu_memory / _GiB:.2f} GiB).'
                             'Please check the swap space size.')
-        if swap_space > 0.5 * cpu_memory:
+        if swap_space > 0.5 * self.cpu_memory:
            print(f'WARNING: The swap space ({swap_space / _GiB:.2f} GiB) '
                  'takes more than 50% of the available memory '
-                  f'({cpu_memory / _GiB:.2f} GiB).'
+                  f'({self.cpu_memory / _GiB:.2f} GiB).'
                  'This may slow the system performance.')
        max_num_blocks = swap_space // self._get_cache_block_size()
        return max_num_blocks