[refactor] refactor memory constants usage (#31865)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
This commit is contained in:
Ning Xie
2026-01-08 02:37:31 +08:00
committed by GitHub
parent f347ac6c34
commit c907d22158
9 changed files with 33 additions and 30 deletions

View File

@@ -247,7 +247,6 @@ def test_deep_sleep_async():
@requires_fp8
def test_deep_sleep_fp8_kvcache():
GiB_bytes = 1 << 30
model = "Qwen/Qwen2-0.5B"
used_bytes_baseline = current_platform.get_current_memory_usage()

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal
@@ -10,7 +11,7 @@ from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import get_cpu_memory
from vllm.utils.mem_utils import format_gib, get_cpu_memory
if TYPE_CHECKING:
from vllm.config.parallel import ParallelConfig
@@ -214,7 +215,7 @@ class CacheConfig:
self,
parallel_config: ParallelConfig,
) -> None:
swap_space_bytes = self.swap_space * GiB_bytes
swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
total_cpu_memory = get_cpu_memory()
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
# group are in the same node. However, the GPUs may span multiple nodes.
@@ -222,8 +223,8 @@ class CacheConfig:
cpu_memory_usage = swap_space_bytes * num_gpus_per_node
msg = (
f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
f"{format_gib(cpu_memory_usage)} GiB out of the "
f"{format_gib(total_cpu_memory)} GiB total CPU memory "
"is allocated for the swap space."
)
if cpu_memory_usage > 0.7 * total_cpu_memory:

View File

@@ -20,6 +20,7 @@ from vllm.logger import init_logger
from vllm.utils.cache import CacheInfo, LRUCache
from vllm.utils.jsontree import json_count_leaves, json_map_leaves, json_reduce_leaves
from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
from vllm.utils.mem_utils import format_gib
from .inputs import (
MultiModalBatchedField,
@@ -130,9 +131,9 @@ class MultiModalCache:
if debug:
leaf_count = json_count_leaves(value)
logger.debug(
"Calculated size of %s to be %.2f GiB (%d leaves)",
"Calculated size of %s to be %s GiB (%d leaves)",
type(value),
size / GiB_bytes,
format_gib(size),
leaf_count,
)

View File

@@ -140,6 +140,7 @@ class CpuPlatform(Platform):
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import format_gib
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
node_dir = "/sys/devices/system/node"
@@ -153,10 +154,9 @@ class CpuPlatform(Platform):
free_cpu_memory = psutil.virtual_memory().total // num_numa_nodes
DEFAULT_CPU_MEM_UTILIZATION = 0.5
kv_cache_space = int(free_cpu_memory * DEFAULT_CPU_MEM_UTILIZATION)
kv_cache_space_gib = kv_cache_space / GiB_bytes
logger.warning_once(
"VLLM_CPU_KVCACHE_SPACE not set. Using "
f"{kv_cache_space_gib:.2f} GiB for KV cache."
"VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache.",
format_gib(kv_cache_space),
)
else:
kv_cache_space *= GiB_bytes

View File

@@ -11,11 +11,15 @@ import psutil
import torch
import torch.types
from .mem_constants import GiB_bytes
from .mem_constants import GiB_bytes, MiB_bytes
def format_gib(b: int) -> float:
    """Convert a byte count to GiB, rounded to two decimal places.

    Args:
        b: Size in bytes.

    Returns:
        ``b`` divided by ``GiB_bytes``, rounded to two decimals, as a number.
    """
    gib = b / GiB_bytes
    return round(gib, 2)
def format_mib(b: int) -> str:
    """Render a byte count as a MiB figure in string form.

    Args:
        b: Size in bytes.

    Returns:
        ``b`` divided by ``MiB_bytes`` and rounded to two decimal places,
        formatted as text. No unit suffix is appended; callers add their own.
    """
    mib = round(b / MiB_bytes, 2)
    return f"{mib}"
def format_gib(b: int) -> str:
    """Render a byte count as a GiB figure in string form.

    Args:
        b: Size in bytes.

    Returns:
        ``b`` divided by ``GiB_bytes`` and rounded to two decimal places,
        formatted as text. No unit suffix is appended; callers add their own.
    """
    gib = round(b / GiB_bytes, 2)
    return f"{gib}"
@cache

View File

@@ -14,7 +14,7 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils.hashing import sha256_cbor, xxhash_cbor
from vllm.utils.math_utils import cdiv
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import format_gib
from vllm.v1.kv_cache_interface import (
ChunkedLocalAttentionSpec,
FullAttentionSpec,
@@ -633,9 +633,9 @@ def _check_enough_kv_cache_memory(
raise ValueError(
f"To serve at least one request with the models's max seq len "
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
f"when initializing the engine. "
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
@@ -1441,10 +1441,10 @@ def _auto_fit_max_model_len(
vllm_config.model_config.max_model_len = auto_fit_max
logger.info_once(
"Auto-fit max_model_len: reduced from %d to %d to fit in "
"available GPU memory (%.2f GiB available for KV cache)",
"available GPU memory (%s GiB available for KV cache)",
original_max,
auto_fit_max,
min_available_memory / GiB_bytes,
format_gib(min_available_memory),
scope="local",
)

View File

@@ -14,8 +14,7 @@ from vllm.config.compilation import CUDAGraphMode
from vllm.forward_context import set_forward_context
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model_loader
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
@@ -165,8 +164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.model_memory_usage = m.consumed_memory
logger.info(
"Model loading took %.4f GiB and %.6f seconds",
m.consumed_memory / GiB_bytes,
"Model loading took %s GiB and %.6f seconds",
format_gib(m.consumed_memory),
time_after_load - time_before_load,
)

View File

@@ -93,8 +93,7 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.utils.jsontree import json_map_leaves
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
from vllm.utils.nvtx_pytorch_hooks import PytHooks
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import (
@@ -3899,8 +3898,8 @@ class GPUModelRunner(
logger.error(combined_msg)
raise e
logger.info_once(
"Model loading took %.4f GiB memory and %.6f seconds",
self.model_memory_usage / GiB_bytes,
"Model loading took %s GiB memory and %.6f seconds",
format_gib(self.model_memory_usage),
time_after_load - time_before_load,
scope="local",
)

View File

@@ -125,7 +125,7 @@ class Worker(WorkerBase):
used_bytes = total - free_bytes_after_sleep
assert freed_bytes >= 0, "Memory usage increased after sleeping."
logger.info(
"Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
"Sleep mode freed %s GiB memory, %s GiB memory is still in use.",
format_gib(freed_bytes),
format_gib(used_bytes),
)
@@ -342,19 +342,19 @@ class Worker(WorkerBase):
unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
logger.debug(
"Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
"Initial free memory: %s GiB; Requested memory: %f (util), %s GiB",
format_gib(self.init_snapshot.free_memory),
self.cache_config.gpu_memory_utilization,
format_gib(self.requested_memory),
)
logger.debug(
"Free memory after profiling: %f GiB (total), %f GiB (within requested)",
"Free memory after profiling: %s GiB (total), %s GiB (within requested)",
format_gib(free_gpu_memory),
format_gib(free_gpu_memory - unrequested_memory),
)
logger.debug(profile_result)
logger.info_once(
"Available KV cache memory: %f GiB",
"Available KV cache memory: %s GiB",
format_gib(self.available_kv_cache_memory_bytes),
scope="local",
)