[Core] Consolidate GB constant and enable float GB arguments (#7416)

This commit is contained in:
Cyrus Leung
2024-08-13 05:14:14 +08:00
committed by GitHub
parent 6aa33cb2dd
commit 4ddc4743d7
6 changed files with 21 additions and 21 deletions

View File

@@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
from vllm.tracing import is_otel_installed
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH,
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
cuda_device_count_stateless, get_cpu_memory, is_cpu,
is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
print_warning_once)
@@ -27,7 +27,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
_GB = 1 << 30
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
_PP_SUPPORTED_MODELS = [
@@ -492,7 +491,7 @@ class CacheConfig:
self,
block_size: int,
gpu_memory_utilization: float,
swap_space: int,
swap_space: float,
cache_dtype: str,
num_gpu_blocks_override: Optional[int] = None,
sliding_window: Optional[int] = None,
@@ -501,7 +500,7 @@ class CacheConfig:
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.swap_space_bytes = swap_space * _GB
self.swap_space_bytes = swap_space * GiB_bytes
self.num_gpu_blocks_override = num_gpu_blocks_override
self.cache_dtype = cache_dtype
self.sliding_window = sliding_window
@@ -561,9 +560,9 @@ class CacheConfig:
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
"allocated for the swap space.")
msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
"is allocated for the swap space.")
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory: