[Core] Consolidate GB constant and enable float GB arguments (#7416)
This commit is contained in:
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
from vllm.tracing import is_otel_installed
|
||||
from vllm.transformers_utils.config import get_config, get_hf_text_config
|
||||
-from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH,
|
||||
+from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
|
||||
cuda_device_count_stateless, get_cpu_memory, is_cpu,
|
||||
is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
|
||||
print_warning_once)
|
||||
@@ -27,7 +27,6 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
-_GB = 1 << 30
|
||||
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
|
||||
|
||||
_PP_SUPPORTED_MODELS = [
|
||||
@@ -492,7 +491,7 @@ class CacheConfig:
|
||||
self,
|
||||
block_size: int,
|
||||
gpu_memory_utilization: float,
|
||||
-swap_space: int,
|
||||
+swap_space: float,
|
||||
cache_dtype: str,
|
||||
num_gpu_blocks_override: Optional[int] = None,
|
||||
sliding_window: Optional[int] = None,
|
||||
@@ -501,7 +500,7 @@ class CacheConfig:
|
||||
) -> None:
|
||||
self.block_size = block_size
|
||||
self.gpu_memory_utilization = gpu_memory_utilization
|
||||
-self.swap_space_bytes = swap_space * _GB
|
||||
+self.swap_space_bytes = swap_space * GiB_bytes
|
||||
self.num_gpu_blocks_override = num_gpu_blocks_override
|
||||
self.cache_dtype = cache_dtype
|
||||
self.sliding_window = sliding_window
|
||||
@@ -561,9 +560,9 @@ class CacheConfig:
|
||||
num_gpus_per_node = parallel_config.tensor_parallel_size
|
||||
cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
|
||||
|
||||
-msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
|
||||
-f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
|
||||
-"allocated for the swap space.")
|
||||
+msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
|
||||
+f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
|
||||
+"is allocated for the swap space.")
|
||||
if cpu_memory_usage > 0.7 * total_cpu_memory:
|
||||
raise ValueError("Too large swap space. " + msg)
|
||||
elif cpu_memory_usage > 0.4 * total_cpu_memory:
|
||||
|
||||
Reference in New Issue
Block a user