[refactor] refactor memory constants usage (#31865)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
This commit is contained in:
@@ -247,7 +247,6 @@ def test_deep_sleep_async():
|
||||
|
||||
@requires_fp8
|
||||
def test_deep_sleep_fp8_kvcache():
|
||||
GiB_bytes = 1 << 30
|
||||
model = "Qwen/Qwen2-0.5B"
|
||||
used_bytes_baseline = current_platform.get_current_memory_usage()
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from dataclasses import field
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
@@ -10,7 +11,7 @@ from pydantic.dataclasses import dataclass
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import get_cpu_memory
|
||||
from vllm.utils.mem_utils import format_gib, get_cpu_memory
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
@@ -214,7 +215,7 @@ class CacheConfig:
|
||||
self,
|
||||
parallel_config: ParallelConfig,
|
||||
) -> None:
|
||||
swap_space_bytes = self.swap_space * GiB_bytes
|
||||
swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
|
||||
total_cpu_memory = get_cpu_memory()
|
||||
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
|
||||
# group are in the same node. However, the GPUs may span multiple nodes.
|
||||
@@ -222,8 +223,8 @@ class CacheConfig:
|
||||
cpu_memory_usage = swap_space_bytes * num_gpus_per_node
|
||||
|
||||
msg = (
|
||||
f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
|
||||
f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
|
||||
f"{format_gib(cpu_memory_usage)} GiB out of the "
|
||||
f"{format_gib(total_cpu_memory)} GiB total CPU memory "
|
||||
"is allocated for the swap space."
|
||||
)
|
||||
if cpu_memory_usage > 0.7 * total_cpu_memory:
|
||||
|
||||
@@ -20,6 +20,7 @@ from vllm.logger import init_logger
|
||||
from vllm.utils.cache import CacheInfo, LRUCache
|
||||
from vllm.utils.jsontree import json_count_leaves, json_map_leaves, json_reduce_leaves
|
||||
from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
|
||||
from vllm.utils.mem_utils import format_gib
|
||||
|
||||
from .inputs import (
|
||||
MultiModalBatchedField,
|
||||
@@ -130,9 +131,9 @@ class MultiModalCache:
|
||||
if debug:
|
||||
leaf_count = json_count_leaves(value)
|
||||
logger.debug(
|
||||
"Calculated size of %s to be %.2f GiB (%d leaves)",
|
||||
"Calculated size of %s to be %s GiB (%d leaves)",
|
||||
type(value),
|
||||
size / GiB_bytes,
|
||||
format_gib(size),
|
||||
leaf_count,
|
||||
)
|
||||
|
||||
|
||||
@@ -140,6 +140,7 @@ class CpuPlatform(Platform):
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import format_gib
|
||||
|
||||
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
|
||||
node_dir = "/sys/devices/system/node"
|
||||
@@ -153,10 +154,9 @@ class CpuPlatform(Platform):
|
||||
free_cpu_memory = psutil.virtual_memory().total // num_numa_nodes
|
||||
DEFAULT_CPU_MEM_UTILIZATION = 0.5
|
||||
kv_cache_space = int(free_cpu_memory * DEFAULT_CPU_MEM_UTILIZATION)
|
||||
kv_cache_space_gib = kv_cache_space / GiB_bytes
|
||||
logger.warning_once(
|
||||
"VLLM_CPU_KVCACHE_SPACE not set. Using "
|
||||
f"{kv_cache_space_gib:.2f} GiB for KV cache."
|
||||
"VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache.",
|
||||
format_gib(kv_cache_space),
|
||||
)
|
||||
else:
|
||||
kv_cache_space *= GiB_bytes
|
||||
|
||||
@@ -11,11 +11,15 @@ import psutil
|
||||
import torch
|
||||
import torch.types
|
||||
|
||||
from .mem_constants import GiB_bytes
|
||||
from .mem_constants import GiB_bytes, MiB_bytes
|
||||
|
||||
|
||||
def format_gib(b: int) -> float:
|
||||
return round(b / GiB_bytes, 2)
|
||||
def format_mib(b: int) -> str:
|
||||
return f"{round(b / MiB_bytes, 2)}"
|
||||
|
||||
|
||||
def format_gib(b: int) -> str:
|
||||
return f"{round(b / GiB_bytes, 2)}"
|
||||
|
||||
|
||||
@cache
|
||||
|
||||
@@ -14,7 +14,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import sha256_cbor, xxhash_cbor
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import format_gib
|
||||
from vllm.v1.kv_cache_interface import (
|
||||
ChunkedLocalAttentionSpec,
|
||||
FullAttentionSpec,
|
||||
@@ -633,9 +633,9 @@ def _check_enough_kv_cache_memory(
|
||||
|
||||
raise ValueError(
|
||||
f"To serve at least one request with the models's max seq len "
|
||||
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
|
||||
f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
|
||||
f"cache is needed, which is larger than the available KV cache "
|
||||
f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
|
||||
f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
|
||||
f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
|
||||
f"when initializing the engine. "
|
||||
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
||||
@@ -1441,10 +1441,10 @@ def _auto_fit_max_model_len(
|
||||
vllm_config.model_config.max_model_len = auto_fit_max
|
||||
logger.info_once(
|
||||
"Auto-fit max_model_len: reduced from %d to %d to fit in "
|
||||
"available GPU memory (%.2f GiB available for KV cache)",
|
||||
"available GPU memory (%s GiB available for KV cache)",
|
||||
original_max,
|
||||
auto_fit_max,
|
||||
min_available_memory / GiB_bytes,
|
||||
format_gib(min_available_memory),
|
||||
scope="local",
|
||||
)
|
||||
|
||||
|
||||
@@ -14,8 +14,7 @@ from vllm.config.compilation import CUDAGraphMode
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.model_loader import get_model_loader
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import DeviceMemoryProfiler
|
||||
from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
@@ -165,8 +164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
|
||||
self.model_memory_usage = m.consumed_memory
|
||||
logger.info(
|
||||
"Model loading took %.4f GiB and %.6f seconds",
|
||||
m.consumed_memory / GiB_bytes,
|
||||
"Model loading took %s GiB and %.6f seconds",
|
||||
format_gib(m.consumed_memory),
|
||||
time_after_load - time_before_load,
|
||||
)
|
||||
|
||||
|
||||
@@ -93,8 +93,7 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
||||
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
||||
from vllm.utils.jsontree import json_map_leaves
|
||||
from vllm.utils.math_utils import cdiv, round_up
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import DeviceMemoryProfiler
|
||||
from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
|
||||
from vllm.utils.nvtx_pytorch_hooks import PytHooks
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import (
|
||||
@@ -3899,8 +3898,8 @@ class GPUModelRunner(
|
||||
logger.error(combined_msg)
|
||||
raise e
|
||||
logger.info_once(
|
||||
"Model loading took %.4f GiB memory and %.6f seconds",
|
||||
self.model_memory_usage / GiB_bytes,
|
||||
"Model loading took %s GiB memory and %.6f seconds",
|
||||
format_gib(self.model_memory_usage),
|
||||
time_after_load - time_before_load,
|
||||
scope="local",
|
||||
)
|
||||
|
||||
@@ -125,7 +125,7 @@ class Worker(WorkerBase):
|
||||
used_bytes = total - free_bytes_after_sleep
|
||||
assert freed_bytes >= 0, "Memory usage increased after sleeping."
|
||||
logger.info(
|
||||
"Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
|
||||
"Sleep mode freed %s GiB memory, %s GiB memory is still in use.",
|
||||
format_gib(freed_bytes),
|
||||
format_gib(used_bytes),
|
||||
)
|
||||
@@ -342,19 +342,19 @@ class Worker(WorkerBase):
|
||||
|
||||
unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
|
||||
logger.debug(
|
||||
"Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
|
||||
"Initial free memory: %s GiB; Requested memory: %f (util), %s GiB",
|
||||
format_gib(self.init_snapshot.free_memory),
|
||||
self.cache_config.gpu_memory_utilization,
|
||||
format_gib(self.requested_memory),
|
||||
)
|
||||
logger.debug(
|
||||
"Free memory after profiling: %f GiB (total), %f GiB (within requested)",
|
||||
"Free memory after profiling: %s GiB (total), %s GiB (within requested)",
|
||||
format_gib(free_gpu_memory),
|
||||
format_gib(free_gpu_memory - unrequested_memory),
|
||||
)
|
||||
logger.debug(profile_result)
|
||||
logger.info_once(
|
||||
"Available KV cache memory: %f GiB",
|
||||
"Available KV cache memory: %s GiB",
|
||||
format_gib(self.available_kv_cache_memory_bytes),
|
||||
scope="local",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user