[Log] add log about gpu worker init snapshot and requested memory (#29493)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
This commit is contained in:
Ning Xie
2026-01-07 01:32:55 +08:00
committed by GitHub
parent 22dffca982
commit 6f5e653383
3 changed files with 59 additions and 42 deletions

View File

@@ -14,6 +14,10 @@ import torch.types
from .mem_constants import GiB_bytes from .mem_constants import GiB_bytes
def format_gib(b: int) -> float:
    """Return `b` divided by `GiB_bytes`, rounded to two decimal places.

    Intended for log and error messages that report memory sizes; callers
    append the "GiB" unit themselves.
    """
    gib = b / GiB_bytes
    return round(gib, 2)
@cache @cache
def get_max_shared_memory_bytes(gpu: int = 0) -> int: def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes.""" """Returns the maximum shared memory per thread block in bytes."""
@@ -146,6 +150,18 @@ class MemorySnapshot:
auto_measure=False, auto_measure=False,
) )
def __repr__(self) -> str:
    """Return a comma-separated, human-readable summary of this snapshot.

    Each memory field is passed through `format_gib` and suffixed with
    "GiB"; `timestamp` and `auto_measure` are rendered as-is.
    """
    # Memory fields, listed in the order they appear in the output.
    memory_fields = (
        "torch_peak",
        "free_memory",
        "total_memory",
        "cuda_memory",
        "torch_memory",
        "non_torch_memory",
    )
    parts = [
        f"{name}={format_gib(getattr(self, name))}GiB" for name in memory_fields
    ]
    parts.append(f"timestamp={self.timestamp}")
    parts.append(f"auto_measure={self.auto_measure}")
    return ", ".join(parts)
@dataclass @dataclass
class MemoryProfilingResult: class MemoryProfilingResult:
@@ -168,12 +184,12 @@ class MemoryProfilingResult:
return ( return (
f"Memory profiling takes {self.profile_time:.2f} seconds. " f"Memory profiling takes {self.profile_time:.2f} seconds. "
f"Total non KV cache memory: " f"Total non KV cache memory: "
f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " f"{format_gib(self.non_kv_cache_memory)}GiB; "
f"torch peak memory increase: " f"torch peak memory increase: "
f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " f"{format_gib(self.torch_peak_increase)}GiB; "
f"non-torch forward increase memory: " f"non-torch forward increase memory: "
f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " f"{format_gib(self.non_torch_increase)}GiB; "
f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB." f"weights memory: {format_gib(self.weights_memory)}GiB."
) )

View File

@@ -40,8 +40,7 @@ from vllm.platforms import current_platform
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
@@ -132,9 +131,9 @@ class Worker(WorkerBase):
used_bytes = total - free_bytes_after_sleep used_bytes = total - free_bytes_after_sleep
assert freed_bytes >= 0, "Memory usage increased after sleeping." assert freed_bytes >= 0, "Memory usage increased after sleeping."
logger.info( logger.info(
"Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.", "Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
freed_bytes / GiB_bytes, format_gib(freed_bytes),
used_bytes / GiB_bytes, format_gib(used_bytes),
) )
def wake_up(self, tags: list[str] | None = None) -> None: def wake_up(self, tags: list[str] | None = None) -> None:
@@ -239,6 +238,10 @@ class Worker(WorkerBase):
# take current memory snapshot # take current memory snapshot
self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
self.requested_memory = request_memory(init_snapshot, self.cache_config) self.requested_memory = request_memory(init_snapshot, self.cache_config)
logger.debug("worker init memory snapshot: %r", self.init_snapshot)
logger.debug(
"worker requested memory: %sGiB", format_gib(self.requested_memory)
)
else: else:
raise RuntimeError(f"Not support device type: {self.device_config.device}") raise RuntimeError(f"Not support device type: {self.device_config.device}")
@@ -293,15 +296,14 @@ class Worker(WorkerBase):
You may limit the usage of GPU memory You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter. by adjusting the `gpu_memory_utilization` parameter.
""" """
GiB = lambda b: b / GiB_bytes
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes: if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
# still need a profile run which compiles the model for # still need a profile run which compiles the model for
# max_num_batched_tokens # max_num_batched_tokens
self.model_runner.profile_run() self.model_runner.profile_run()
msg = ( msg = (
f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} " f"Initial free memory {format_gib(self.init_snapshot.free_memory)} "
f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for " f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and " "KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the " "skipped memory profiling. This does not respect the "
"gpu_memory_utilization config. Only use kv_cache_memory_bytes " "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
@@ -333,8 +335,8 @@ class Worker(WorkerBase):
# GPU did not change their memory usage during the profiling. # GPU did not change their memory usage during the profiling.
assert self.init_snapshot.free_memory > free_gpu_memory, ( assert self.init_snapshot.free_memory > free_gpu_memory, (
"Error in memory profiling. " "Error in memory profiling. "
f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, " f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, "
f"current free memory {GiB(free_gpu_memory)} GiB. " f"current free memory {format_gib(free_gpu_memory)} GiB. "
"This happens when other processes sharing the same container " "This happens when other processes sharing the same container "
"release GPU memory while vLLM is profiling during initialization. " "release GPU memory while vLLM is profiling during initialization. "
"To fix this, ensure consistent GPU memory allocation or " "To fix this, ensure consistent GPU memory allocation or "
@@ -346,21 +348,20 @@ class Worker(WorkerBase):
unrequested_memory = self.init_snapshot.free_memory - self.requested_memory unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
logger.debug( logger.debug(
"Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB", "Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
GiB(self.init_snapshot.free_memory), format_gib(self.init_snapshot.free_memory),
self.cache_config.gpu_memory_utilization, self.cache_config.gpu_memory_utilization,
GiB(self.requested_memory), format_gib(self.requested_memory),
) )
logger.debug( logger.debug(
"Free memory after profiling: %.2f GiB (total), " "Free memory after profiling: %f GiB (total), %f GiB (within requested)",
"%.2f GiB (within requested)", format_gib(free_gpu_memory),
GiB(free_gpu_memory), format_gib(free_gpu_memory - unrequested_memory),
GiB(free_gpu_memory - unrequested_memory),
) )
logger.debug(profile_result) logger.debug(profile_result)
logger.info_once( logger.info_once(
"Available KV cache memory: %.2f GiB", "Available KV cache memory: %f GiB",
GiB(self.available_kv_cache_memory_bytes), format_gib(self.available_kv_cache_memory_bytes),
scope="local", scope="local",
) )
gc.collect() gc.collect()
@@ -467,7 +468,6 @@ class Worker(WorkerBase):
# CUDAGraph memory size and may not utilize all gpu memory. # CUDAGraph memory size and may not utilize all gpu memory.
# Users may want fine-grained control to specify kv cache # Users may want fine-grained control to specify kv cache
# memory size. # memory size.
GiB = lambda b: round(b / GiB_bytes, 2)
# empirically observed that the memory profiling may # empirically observed that the memory profiling may
# slightly underestimate the memory consumption. # slightly underestimate the memory consumption.
@@ -492,24 +492,24 @@ class Worker(WorkerBase):
msg = ( msg = (
f"Free memory on device " f"Free memory on device "
f"({GiB(self.init_snapshot.free_memory)}/" f"({format_gib(self.init_snapshot.free_memory)}/"
f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. " f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. "
f"Desired GPU memory utilization is " f"Desired GPU memory utilization is "
f"({self.cache_config.gpu_memory_utilization}, " f"({self.cache_config.gpu_memory_utilization}, "
f"{GiB(self.requested_memory)} GiB). " f"{format_gib(self.requested_memory)} GiB). "
f"Actual usage is {GiB(self.model_runner.model_memory_usage)} " f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} "
f"GiB for weight, {GiB(self.peak_activation_memory)} GiB " f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB "
f"for peak activation, {GiB(self.non_torch_memory)} GiB " f"for peak activation, {format_gib(self.non_torch_memory)} GiB "
f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} " f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} "
f"GiB for CUDAGraph memory. Replace gpu_memory_utilization " f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
f"config with `--kv-cache-memory=" f"config with `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_requested_limit}` " f"{kv_cache_memory_bytes_to_requested_limit}` "
f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit " f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
f"into requested memory, or `--kv-cache-memory=" f"into requested memory, or `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_gpu_limit}` " f"{kv_cache_memory_bytes_to_gpu_limit}` "
f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully " f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
f"utilize gpu memory. Current kv cache memory in use is " f"utilize gpu memory. Current kv cache memory in use is "
f"{GiB(self.available_kv_cache_memory_bytes)} GiB." f"{format_gib(self.available_kv_cache_memory_bytes)} GiB."
) )
logger.debug(msg) logger.debug(msg)

View File

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from collections import defaultdict from collections import defaultdict
from dataclasses import dataclass, field from dataclasses import dataclass, field
@@ -15,8 +16,7 @@ from vllm.model_executor.models.utils import extract_layer_index
from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.cache import processor_only_cache_from_config
from vllm.multimodal.registry import MultiModalRegistry from vllm.multimodal.registry import MultiModalRegistry
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_utils import MemorySnapshot, format_gib
from vllm.utils.mem_utils import MemorySnapshot
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
@@ -250,22 +250,23 @@ def gather_mm_placeholders(
return placeholders[is_embed] return placeholders[is_embed]
def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float: def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int:
""" """
Calculate the amount of memory required by vLLM, then validate Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that. that the current amount of free memory is sufficient for that.
""" """
requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization requested_memory = math.ceil(
init_snapshot.total_memory * cache_config.gpu_memory_utilization
)
if init_snapshot.free_memory < requested_memory: if init_snapshot.free_memory < requested_memory:
GiB = lambda b: round(b / GiB_bytes, 2)
raise ValueError( raise ValueError(
f"Free memory on device {init_snapshot.device_} " f"Free memory on device {init_snapshot.device_} "
f"({GiB(init_snapshot.free_memory)}/" f"({format_gib(init_snapshot.free_memory)}/"
f"{GiB(init_snapshot.total_memory)} GiB) on startup " f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
f"is less than desired GPU memory utilization " f"is less than desired GPU memory utilization "
f"({cache_config.gpu_memory_utilization}, " f"({cache_config.gpu_memory_utilization}, "
f"{GiB(requested_memory)} GiB). Decrease GPU memory " f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
f"utilization or reduce GPU memory used by other processes." f"utilization or reduce GPU memory used by other processes."
) )