diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 3bd0b6609..b1a16cfca 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -247,7 +247,6 @@ def test_deep_sleep_async():
 
 @requires_fp8
 def test_deep_sleep_fp8_kvcache():
-    GiB_bytes = 1 << 30
     model = "Qwen/Qwen2-0.5B"
 
     used_bytes_baseline = current_platform.get_current_memory_usage()
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 067799a44..318efc82a 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import math
 from dataclasses import field
 from typing import TYPE_CHECKING, Any, Literal
 
@@ -10,7 +11,7 @@ from pydantic.dataclasses import dataclass
 from vllm.config.utils import config
 from vllm.logger import init_logger
 from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import get_cpu_memory
+from vllm.utils.mem_utils import format_gib, get_cpu_memory
 
 if TYPE_CHECKING:
     from vllm.config.parallel import ParallelConfig
@@ -214,7 +215,7 @@ class CacheConfig:
         self,
         parallel_config: ParallelConfig,
     ) -> None:
-        swap_space_bytes = self.swap_space * GiB_bytes
+        swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
         total_cpu_memory = get_cpu_memory()
         # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
         # group are in the same node. However, the GPUs may span multiple nodes.
@@ -222,8 +223,8 @@ class CacheConfig:
         cpu_memory_usage = swap_space_bytes * num_gpus_per_node
 
         msg = (
-            f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
-            f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
+            f"{format_gib(cpu_memory_usage)} GiB out of the "
+            f"{format_gib(total_cpu_memory)} GiB total CPU memory "
             "is allocated for the swap space."
         )
         if cpu_memory_usage > 0.7 * total_cpu_memory:
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 67bdf5e15..f22a14a1d 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -20,6 +20,7 @@ from vllm.logger import init_logger
 from vllm.utils.cache import CacheInfo, LRUCache
 from vllm.utils.jsontree import json_count_leaves, json_map_leaves, json_reduce_leaves
 from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
+from vllm.utils.mem_utils import format_gib
 
 from .inputs import (
     MultiModalBatchedField,
@@ -130,9 +131,9 @@ class MultiModalCache:
         if debug:
             leaf_count = json_count_leaves(value)
             logger.debug(
-                "Calculated size of %s to be %.2f GiB (%d leaves)",
+                "Calculated size of %s to be %s GiB (%d leaves)",
                 type(value),
-                size / GiB_bytes,
+                format_gib(size),
                 leaf_count,
             )
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 9fbcb7678..c3adc0036 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -140,6 +140,7 @@ class CpuPlatform(Platform):
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         from vllm.utils.mem_constants import GiB_bytes
+        from vllm.utils.mem_utils import format_gib
 
         kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
         node_dir = "/sys/devices/system/node"
@@ -153,10 +154,9 @@ class CpuPlatform(Platform):
             free_cpu_memory = psutil.virtual_memory().total // num_numa_nodes
             DEFAULT_CPU_MEM_UTILIZATION = 0.5
             kv_cache_space = int(free_cpu_memory * DEFAULT_CPU_MEM_UTILIZATION)
-            kv_cache_space_gib = kv_cache_space / GiB_bytes
             logger.warning_once(
-                "VLLM_CPU_KVCACHE_SPACE not set. Using "
-                f"{kv_cache_space_gib:.2f} GiB for KV cache."
+                "VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache.",
+                format_gib(kv_cache_space),
             )
         else:
             kv_cache_space *= GiB_bytes
diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index 2cda55796..12d1541ad 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -11,11 +11,15 @@ import psutil
 import torch
 import torch.types
 
-from .mem_constants import GiB_bytes
+from .mem_constants import GiB_bytes, MiB_bytes
 
 
-def format_gib(b: int) -> float:
-    return round(b / GiB_bytes, 2)
+def format_mib(b: int) -> str:
+    return f"{round(b / MiB_bytes, 2)}"
+
+
+def format_gib(b: int) -> str:
+    return f"{round(b / GiB_bytes, 2)}"
 
 
 @cache
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 1480a1f79..7f900bd9e 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -14,7 +14,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.hashing import sha256_cbor, xxhash_cbor
 from vllm.utils.math_utils import cdiv
-from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import format_gib
 from vllm.v1.kv_cache_interface import (
     ChunkedLocalAttentionSpec,
     FullAttentionSpec,
@@ -633,9 +633,9 @@ def _check_enough_kv_cache_memory(
 
         raise ValueError(
             f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
+            f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
+            f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
             f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
             f"when initializing the engine. "
             f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
@@ -1441,10 +1441,10 @@ def _auto_fit_max_model_len(
     vllm_config.model_config.max_model_len = auto_fit_max
     logger.info_once(
         "Auto-fit max_model_len: reduced from %d to %d to fit in "
-        "available GPU memory (%.2f GiB available for KV cache)",
+        "available GPU memory (%s GiB available for KV cache)",
         original_max,
         auto_fit_max,
-        min_available_memory / GiB_bytes,
+        format_gib(min_available_memory),
         scope="local",
     )
 
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 204635635..9c170d465 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -14,8 +14,7 @@ from vllm.config.compilation import CUDAGraphMode
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model_loader
-from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
@@ -165,8 +164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         self.model_memory_usage = m.consumed_memory
         logger.info(
-            "Model loading took %.4f GiB and %.6f seconds",
-            m.consumed_memory / GiB_bytes,
+            "Model loading took %s GiB and %.6f seconds",
+            format_gib(m.consumed_memory),
             time_after_load - time_before_load,
         )
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9e8aaeb26..07d5c282c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -93,8 +93,7 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.math_utils import cdiv, round_up
-from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
 from vllm.utils.nvtx_pytorch_hooks import PytHooks
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
@@ -3899,8 +3898,8 @@ class GPUModelRunner(
             logger.error(combined_msg)
             raise e
         logger.info_once(
-            "Model loading took %.4f GiB memory and %.6f seconds",
-            self.model_memory_usage / GiB_bytes,
+            "Model loading took %s GiB memory and %.6f seconds",
+            format_gib(self.model_memory_usage),
             time_after_load - time_before_load,
             scope="local",
         )
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index a1cbafe2d..8cdd77098 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -125,7 +125,7 @@ class Worker(WorkerBase):
         used_bytes = total - free_bytes_after_sleep
         assert freed_bytes >= 0, "Memory usage increased after sleeping."
         logger.info(
-            "Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
+            "Sleep mode freed %s GiB memory, %s GiB memory is still in use.",
             format_gib(freed_bytes),
             format_gib(used_bytes),
         )
@@ -342,19 +342,19 @@ class Worker(WorkerBase):
 
         unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
         logger.debug(
-            "Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
+            "Initial free memory: %s GiB; Requested memory: %f (util), %s GiB",
             format_gib(self.init_snapshot.free_memory),
             self.cache_config.gpu_memory_utilization,
             format_gib(self.requested_memory),
         )
         logger.debug(
-            "Free memory after profiling: %f GiB (total), %f GiB (within requested)",
+            "Free memory after profiling: %s GiB (total), %s GiB (within requested)",
             format_gib(free_gpu_memory),
             format_gib(free_gpu_memory - unrequested_memory),
         )
         logger.debug(profile_result)
         logger.info_once(
-            "Available KV cache memory: %f GiB",
+            "Available KV cache memory: %s GiB",
             format_gib(self.available_kv_cache_memory_bytes),
             scope="local",
         )
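
Note: the following is a minimal standalone sketch (illustration only, not part of the patch) of how the format_gib / format_mib helpers in vllm/utils/mem_utils.py behave after this change. The byte constants are inlined here for the sake of a self-contained example; the real module imports GiB_bytes and MiB_bytes from vllm.utils.mem_constants.

    # Standalone sketch of the helpers added above (constants inlined for illustration).
    MiB_bytes = 1 << 20
    GiB_bytes = 1 << 30


    def format_mib(b: int) -> str:
        # Render a byte count as a two-decimal MiB figure, returned as a string
        # so call sites can use a plain %s placeholder in lazy log formatting.
        return f"{round(b / MiB_bytes, 2)}"


    def format_gib(b: int) -> str:
        # Same idea, but in GiB.
        return f"{round(b / GiB_bytes, 2)}"


    if __name__ == "__main__":
        print(format_gib(3 * GiB_bytes + 512 * MiB_bytes))  # 3.5
        print(format_mib(256 * MiB_bytes))  # 256.0

Returning a string (instead of the previous float) is what lets every call site in this diff replace %.2f / %.4f / %f format specifiers with a plain %s while keeping the logger's lazy %-style formatting.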