[Log] add log about gpu worker init snapshot and requested memory (#29493)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
This commit is contained in:
Ning Xie
2026-01-07 01:32:55 +08:00
committed by GitHub
parent 22dffca982
commit 6f5e653383
3 changed files with 59 additions and 42 deletions

View File

@@ -14,6 +14,10 @@ import torch.types
from .mem_constants import GiB_bytes
def format_gib(b: int) -> float:
    """Convert a byte count into units of ``GiB_bytes``, rounded to 2 decimals.

    Args:
        b: Size in bytes.

    Returns:
        ``b / GiB_bytes`` rounded to two decimal places.
    """
    gib = b / GiB_bytes
    return round(gib, 2)
@cache
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
@@ -146,6 +150,18 @@ class MemorySnapshot:
auto_measure=False,
)
def __repr__(self) -> str:
    """Return a one-line, comma-separated summary of this snapshot.

    The six memory counters are rendered through ``format_gib`` with a
    ``GiB`` suffix; ``timestamp`` and ``auto_measure`` are shown as-is.
    """
    parts = [
        f"torch_peak={format_gib(self.torch_peak)}GiB",
        f"free_memory={format_gib(self.free_memory)}GiB",
        f"total_memory={format_gib(self.total_memory)}GiB",
        f"cuda_memory={format_gib(self.cuda_memory)}GiB",
        f"torch_memory={format_gib(self.torch_memory)}GiB",
        f"non_torch_memory={format_gib(self.non_torch_memory)}GiB",
        f"timestamp={self.timestamp}",
        f"auto_measure={self.auto_measure}",
    ]
    return ", ".join(parts)
@dataclass
class MemoryProfilingResult:
@@ -168,12 +184,12 @@ class MemoryProfilingResult:
return (
f"Memory profiling takes {self.profile_time:.2f} seconds. "
f"Total non KV cache memory: "
f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
f"{format_gib(self.non_kv_cache_memory)}GiB; "
f"torch peak memory increase: "
f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
f"{format_gib(self.torch_peak_increase)}GiB; "
f"non-torch forward increase memory: "
f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB."
f"{format_gib(self.non_torch_increase)}GiB; "
f"weights memory: {format_gib(self.weights_memory)}GiB."
)

View File

@@ -40,8 +40,7 @@ from vllm.platforms import current_platform
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
@@ -132,9 +131,9 @@ class Worker(WorkerBase):
used_bytes = total - free_bytes_after_sleep
assert freed_bytes >= 0, "Memory usage increased after sleeping."
logger.info(
"Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.",
freed_bytes / GiB_bytes,
used_bytes / GiB_bytes,
"Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
format_gib(freed_bytes),
format_gib(used_bytes),
)
def wake_up(self, tags: list[str] | None = None) -> None:
@@ -239,6 +238,10 @@ class Worker(WorkerBase):
# take current memory snapshot
self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
self.requested_memory = request_memory(init_snapshot, self.cache_config)
logger.debug("worker init memory snapshot: %r", self.init_snapshot)
logger.debug(
"worker requested memory: %sGiB", format_gib(self.requested_memory)
)
else:
raise RuntimeError(f"Not support device type: {self.device_config.device}")
@@ -293,15 +296,14 @@ class Worker(WorkerBase):
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
GiB = lambda b: b / GiB_bytes
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
# still need a profile run which compiles the model for
# max_num_batched_tokens
self.model_runner.profile_run()
msg = (
f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} "
f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the "
"gpu_memory_utilization config. Only use kv_cache_memory_bytes "
@@ -333,8 +335,8 @@ class Worker(WorkerBase):
# GPU did not change their memory usage during the profiling.
assert self.init_snapshot.free_memory > free_gpu_memory, (
"Error in memory profiling. "
f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
f"current free memory {GiB(free_gpu_memory)} GiB. "
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, "
f"current free memory {format_gib(free_gpu_memory)} GiB. "
"This happens when other processes sharing the same container "
"release GPU memory while vLLM is profiling during initialization. "
"To fix this, ensure consistent GPU memory allocation or "
@@ -346,21 +348,20 @@ class Worker(WorkerBase):
unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
logger.debug(
"Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB",
GiB(self.init_snapshot.free_memory),
"Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
format_gib(self.init_snapshot.free_memory),
self.cache_config.gpu_memory_utilization,
GiB(self.requested_memory),
format_gib(self.requested_memory),
)
logger.debug(
"Free memory after profiling: %.2f GiB (total), "
"%.2f GiB (within requested)",
GiB(free_gpu_memory),
GiB(free_gpu_memory - unrequested_memory),
"Free memory after profiling: %f GiB (total), %f GiB (within requested)",
format_gib(free_gpu_memory),
format_gib(free_gpu_memory - unrequested_memory),
)
logger.debug(profile_result)
logger.info_once(
"Available KV cache memory: %.2f GiB",
GiB(self.available_kv_cache_memory_bytes),
"Available KV cache memory: %f GiB",
format_gib(self.available_kv_cache_memory_bytes),
scope="local",
)
gc.collect()
@@ -467,7 +468,6 @@ class Worker(WorkerBase):
# CUDAGraph memory size and may not utilize all gpu memory.
# Users may want fine-grained control to specify kv cache
# memory size.
GiB = lambda b: round(b / GiB_bytes, 2)
# empirically observed that the memory profiling may
# slightly underestimate the memory consumption.
@@ -492,24 +492,24 @@ class Worker(WorkerBase):
msg = (
f"Free memory on device "
f"({GiB(self.init_snapshot.free_memory)}/"
f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. "
f"({format_gib(self.init_snapshot.free_memory)}/"
f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. "
f"Desired GPU memory utilization is "
f"({self.cache_config.gpu_memory_utilization}, "
f"{GiB(self.requested_memory)} GiB). "
f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
f"for peak activation, {GiB(self.non_torch_memory)} GiB "
f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
f"{format_gib(self.requested_memory)} GiB). "
f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} "
f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB "
f"for peak activation, {format_gib(self.non_torch_memory)} GiB "
f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} "
f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
f"config with `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_requested_limit}` "
f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
f"into requested memory, or `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_gpu_limit}` "
f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
f"utilize gpu memory. Current kv cache memory in use is "
f"{GiB(self.available_kv_cache_memory_bytes)} GiB."
f"{format_gib(self.available_kv_cache_memory_bytes)} GiB."
)
logger.debug(msg)

View File

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from collections import defaultdict
from dataclasses import dataclass, field
@@ -15,8 +16,7 @@ from vllm.model_executor.models.utils import extract_layer_index
from vllm.multimodal.cache import processor_only_cache_from_config
from vllm.multimodal.registry import MultiModalRegistry
from vllm.platforms import current_platform
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import MemorySnapshot
from vllm.utils.mem_utils import MemorySnapshot, format_gib
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
@@ -250,22 +250,23 @@ def gather_mm_placeholders(
return placeholders[is_embed]
def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float:
def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int:
"""
Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that.
"""
requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization
requested_memory = math.ceil(
init_snapshot.total_memory * cache_config.gpu_memory_utilization
)
if init_snapshot.free_memory < requested_memory:
GiB = lambda b: round(b / GiB_bytes, 2)
raise ValueError(
f"Free memory on device {init_snapshot.device_} "
f"({GiB(init_snapshot.free_memory)}/"
f"{GiB(init_snapshot.total_memory)} GiB) on startup "
f"({format_gib(init_snapshot.free_memory)}/"
f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
f"is less than desired GPU memory utilization "
f"({cache_config.gpu_memory_utilization}, "
f"{GiB(requested_memory)} GiB). Decrease GPU memory "
f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
f"utilization or reduce GPU memory used by other processes."
)