[Log] add log about gpu worker init snapshot and requested memory (#29493)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
2026-01-07 01:32:55 +08:00
parent 22dffca982
commit 6f5e653383
3 changed files with 59 additions and 42 deletions
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -14,6 +14,10 @@ import torch.types
 from .mem_constants import GiB_bytes
 def format_gib(b: int) -> float:
    """Convert a byte count to GiB, rounded to two decimal places."""
    gib = b / GiB_bytes
    return round(gib, 2)
@cache
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
@@ -146,6 +150,18 @@ class MemorySnapshot:
            auto_measure=False,
        )
    def __repr__(self) -> str:
        """Summarize the snapshot as comma-separated ``name=value`` pairs.

        The six memory fields are rendered through ``format_gib`` with a
        ``GiB`` suffix; ``timestamp`` and ``auto_measure`` are printed as-is.
        """
        memory_fields = (
            "torch_peak",
            "free_memory",
            "total_memory",
            "cuda_memory",
            "torch_memory",
            "non_torch_memory",
        )
        parts = [
            f"{name}={format_gib(getattr(self, name))}GiB" for name in memory_fields
        ]
        parts.append(f"timestamp={self.timestamp}")
        parts.append(f"auto_measure={self.auto_measure}")
        return ", ".join(parts)
@dataclass
 class MemoryProfilingResult:
@@ -168,12 +184,12 @@ class MemoryProfilingResult:
        return (
            f"Memory profiling takes {self.profile_time:.2f} seconds. "
            f"Total non KV cache memory: "
-            f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
+            f"{format_gib(self.non_kv_cache_memory)}GiB; "
            f"torch peak memory increase: "
-            f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
+            f"{format_gib(self.torch_peak_increase)}GiB; "
            f"non-torch forward increase memory: "
-            f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
+            f"{format_gib(self.non_torch_increase)}GiB; "
-            f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB."
+            f"weights memory: {format_gib(self.weights_memory)}GiB."
        )
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -40,8 +40,7 @@ from vllm.platforms import current_platform
 from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
-from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
 from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
@@ -132,9 +131,9 @@ class Worker(WorkerBase):
        used_bytes = total - free_bytes_after_sleep
        assert freed_bytes >= 0, "Memory usage increased after sleeping."
        logger.info(
-            "Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.",
+            "Sleep mode freed %s GiB memory, %s GiB memory is still in use.",
-            freed_bytes / GiB_bytes,
+            format_gib(freed_bytes),
-            used_bytes / GiB_bytes,
+            format_gib(used_bytes),
        )
    def wake_up(self, tags: list[str] | None = None) -> None:
@@ -239,6 +238,10 @@ class Worker(WorkerBase):
            # take current memory snapshot
            self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
            self.requested_memory = request_memory(init_snapshot, self.cache_config)
            logger.debug("worker init memory snapshot: %r", self.init_snapshot)
            logger.debug(
                "worker requested memory: %sGiB", format_gib(self.requested_memory)
            )
        else:
            raise RuntimeError(f"Not support device type: {self.device_config.device}")
@@ -293,15 +296,14 @@ class Worker(WorkerBase):
            You may limit the usage of GPU memory
            by adjusting the `gpu_memory_utilization` parameter.
        """
-        GiB = lambda b: b / GiB_bytes
        if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
            # still need a profile run which compiles the model for
            # max_num_batched_tokens
            self.model_runner.profile_run()
            msg = (
-                f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
+                f"Initial free memory {format_gib(self.init_snapshot.free_memory)} "
-                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
+                f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
                "KV Cache as specified by kv_cache_memory_bytes config and "
                "skipped memory profiling. This does not respect the "
                "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
@@ -333,8 +335,8 @@ class Worker(WorkerBase):
        # GPU did not change their memory usage during the profiling.
        assert self.init_snapshot.free_memory > free_gpu_memory, (
            "Error in memory profiling. "
-            f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
+            f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, "
-            f"current free memory {GiB(free_gpu_memory)} GiB. "
+            f"current free memory {format_gib(free_gpu_memory)} GiB. "
            "This happens when other processes sharing the same container "
            "release GPU memory while vLLM is profiling during initialization. "
            "To fix this, ensure consistent GPU memory allocation or "
@@ -346,21 +348,20 @@ class Worker(WorkerBase):
        unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
        logger.debug(
-            "Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB",
+            "Initial free memory: %s GiB; Requested memory: %.2f (util), %s GiB",
-            GiB(self.init_snapshot.free_memory),
+            format_gib(self.init_snapshot.free_memory),
            self.cache_config.gpu_memory_utilization,
-            GiB(self.requested_memory),
+            format_gib(self.requested_memory),
        )
        logger.debug(
-            "Free memory after profiling: %.2f GiB (total), "
+            "Free memory after profiling: %s GiB (total), %s GiB (within requested)",
-            "%.2f GiB (within requested)",
+            format_gib(free_gpu_memory),
-            GiB(free_gpu_memory),
+            format_gib(free_gpu_memory - unrequested_memory),
-            GiB(free_gpu_memory - unrequested_memory),
        )
        logger.debug(profile_result)
        logger.info_once(
-            "Available KV cache memory: %.2f GiB",
+            "Available KV cache memory: %s GiB",
-            GiB(self.available_kv_cache_memory_bytes),
+            format_gib(self.available_kv_cache_memory_bytes),
            scope="local",
        )
        gc.collect()
@@ -467,7 +468,6 @@ class Worker(WorkerBase):
            # CUDAGraph memory size and may not utilize all gpu memory.
            # Users may want fine-grained control to specify kv cache
            # memory size.
-            GiB = lambda b: round(b / GiB_bytes, 2)
            # empirically observed that the memory profiling may
            # slightly underestimate the memory consumption.
@@ -492,24 +492,24 @@ class Worker(WorkerBase):
            msg = (
                f"Free memory on device "
-                f"({GiB(self.init_snapshot.free_memory)}/"
+                f"({format_gib(self.init_snapshot.free_memory)}/"
-                f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. "
+                f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. "
                f"Desired GPU memory utilization is "
                f"({self.cache_config.gpu_memory_utilization}, "
-                f"{GiB(self.requested_memory)} GiB). "
+                f"{format_gib(self.requested_memory)} GiB). "
-                f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
+                f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} "
-                f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
+                f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB "
-                f"for peak activation, {GiB(self.non_torch_memory)} GiB "
+                f"for peak activation, {format_gib(self.non_torch_memory)} GiB "
-                f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
+                f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} "
                f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
                f"config with `--kv-cache-memory="
                f"{kv_cache_memory_bytes_to_requested_limit}` "
-                f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
+                f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
                f"into requested memory, or `--kv-cache-memory="
                f"{kv_cache_memory_bytes_to_gpu_limit}` "
-                f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
+                f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
                f"utilize gpu memory. Current kv cache memory in use is "
-                f"{GiB(self.available_kv_cache_memory_bytes)} GiB."
+                f"{format_gib(self.available_kv_cache_memory_bytes)} GiB."
            )
            logger.debug(msg)
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -15,8 +16,7 @@ from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
 from vllm.multimodal.registry import MultiModalRegistry
 from vllm.platforms import current_platform
-from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import MemorySnapshot, format_gib
-from vllm.utils.mem_utils import MemorySnapshot
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
 from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
@@ -250,22 +250,23 @@ def gather_mm_placeholders(
    return placeholders[is_embed]
-def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float:
+def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int:
    """
    Calculate the amount of memory required by vLLM, then validate
    that the current amount of free memory is sufficient for that.
    """
-    requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization
+    requested_memory = math.ceil(
        init_snapshot.total_memory * cache_config.gpu_memory_utilization
    )
    if init_snapshot.free_memory < requested_memory:
-        GiB = lambda b: round(b / GiB_bytes, 2)
        raise ValueError(
            f"Free memory on device {init_snapshot.device_} "
-            f"({GiB(init_snapshot.free_memory)}/"
+            f"({format_gib(init_snapshot.free_memory)}/"
-            f"{GiB(init_snapshot.total_memory)} GiB) on startup "
+            f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
            f"is less than desired GPU memory utilization "
            f"({cache_config.gpu_memory_utilization}, "
-            f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+            f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
            f"utilization or reduce GPU memory used by other processes."
        )