FastAPI-based working frontend (#10)
This commit is contained in:
@@ -1,9 +1,7 @@
|
||||
import torch
|
||||
from transformers import AutoConfig
|
||||
|
||||
from cacheflow.models.utils import get_cpu_memory
|
||||
from cacheflow.models.utils import get_dtype_size
|
||||
from cacheflow.models.utils import get_gpu_memory
|
||||
|
||||
_GiB = 1 << 30
|
||||
|
||||
@@ -31,11 +29,15 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
|
||||
model_name: str,
|
||||
block_size: int,
|
||||
dtype: torch.dtype,
|
||||
gpu_memory: int,
|
||||
cpu_memory: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
self.model_name = model_name
|
||||
self.block_size = block_size
|
||||
self.dtype = dtype
|
||||
self.gpu_memory = gpu_memory
|
||||
self.cpu_memory = cpu_memory
|
||||
self.tensor_parallel_size = tensor_parallel_size
|
||||
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
@@ -106,8 +108,7 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
|
||||
memory_utilization: float = 0.95,
|
||||
) -> int:
|
||||
# NOTE(woosuk): This assumes that the machine has homogeneous GPUs.
|
||||
gpu_memory = get_gpu_memory()
|
||||
usable_memory = int(memory_utilization * gpu_memory)
|
||||
usable_memory = int(memory_utilization * self.gpu_memory)
|
||||
|
||||
param_size = self._get_param_size()
|
||||
act_size = self._get_max_act_size(max_num_batched_tokens)
|
||||
@@ -122,16 +123,15 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
|
||||
swap_space: int,
|
||||
) -> int:
|
||||
swap_space = swap_space * _GiB
|
||||
cpu_memory = get_cpu_memory()
|
||||
if swap_space > 0.8 * cpu_memory:
|
||||
if swap_space > 0.8 * self.cpu_memory:
|
||||
raise ValueError(f'The swap space ({swap_space / _GiB:.2f} GiB) '
|
||||
'takes more than 80% of the available memory '
|
||||
f'({cpu_memory / _GiB:.2f} GiB).'
|
||||
f'({self.cpu_memory / _GiB:.2f} GiB).'
|
||||
'Please check the swap space size.')
|
||||
if swap_space > 0.5 * cpu_memory:
|
||||
if swap_space > 0.5 * self.cpu_memory:
|
||||
print(f'WARNING: The swap space ({swap_space / _GiB:.2f} GiB) '
|
||||
'takes more than 50% of the available memory '
|
||||
f'({cpu_memory / _GiB:.2f} GiB).'
|
||||
f'({self.cpu_memory / _GiB:.2f} GiB).'
|
||||
'This may slow the system performance.')
|
||||
max_num_blocks = swap_space // self._get_cache_block_size()
|
||||
return max_num_blocks
|
||||
|
||||
Reference in New Issue
Block a user